From cb7da66d06150688e0328d738b25dd870a7995a3 Mon Sep 17 00:00:00 2001 From: Sumit Sahoo Date: Sun, 17 May 2026 18:13:16 +0530 Subject: [PATCH] fix(ai): correct model bundle size labels to match actual HF downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Registry `approxSizeBytes` values had drifted from the real on-disk weights — the Quality tier was showing "≈ 1.8 GB total" in the gate when the actual download is ~1.88 GB, and the Compact tier overstated by ~390 MB on the chat model alone. Verified against HuggingFace file listings for each pinned dtype: - LFM2.5-1.2B (q4): 1.2 GB → 810 MB (model_q4.onnx_data is 850 MB in decimal units, i.e. ≈ 810 MiB, which is what the 1024-based registry value encodes) - LFM2-2.6B (q4f16): 1.5 GB → 1.55 GB - EmbeddingGemma q8: 309 MB → 320 MB (was missing the 26 MB Gemma SentencePiece tokenizer) - Embed peak RAM: 400 MB → 500 MB (int8 dequant overhead) Aggregate now reads "≈ 1.1 GB total" on Compact and "≈ 1.9 GB total" on Quality. Prose updated in README, docs/local-ai.md, tool-registry, ChatModelPicker, AiModelDetailsModal, ai-runtime, useRagModels, and the AskPdf timing-weight comment. --- README.md | 10 +++--- docs/local-ai.md | 26 +++++++------- src/components/AiModelDetailsModal.tsx | 5 +-- src/components/ChatModelPicker.tsx | 4 +-- src/config/tool-registry.ts | 7 ++-- src/hooks/useRagModels.ts | 5 +-- src/tools/AskPdf.tsx | 2 +- src/utils/ai-models.ts | 49 ++++++++++++++------------ src/utils/ai-runtime.ts | 5 +-- 9 files changed, 63 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 16b1058..9a0bf88 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,9 @@ CloakPDF offers **36 powerful PDF tools**, all running 100% client-side. 
Feature _Chat with your PDF using a small AI model running entirely in your browser — no API keys, no server round-trips_ -| Tool | Description | -| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Ask your PDF** _(beta)_ | Ask natural-language questions about any PDF and get grounded answers extracted from the document text. Powered by a downloadable small chat model (1.2 GB or 2.6 GB tier, your choice). See [Local AI](#-local-ai-on-device-rag) below. | +| Tool | Description | +| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Ask your PDF** _(beta)_ | Ask natural-language questions about any PDF and get grounded answers extracted from the document text. Powered by a downloadable small chat model (1.2B or 2.6B parameter tier, your choice). See [Local AI](#-local-ai-on-device-rag) below. | ### 🗂️ Organise & Edit @@ -207,8 +207,8 @@ pipeline **entirely in your browser** — no API key, no inference server, no usage quota. The model weights download once from Hugging Face's CDN, get cached, and work offline forever after. 
-Three small models work together on first use (~1.55 GB total on the -default Compact chat tier): +Three small models work together on first use (~1.15 GB total on the +default Compact chat tier; ~1.9 GB on Quality): - **Chat** — [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) _(Compact, default)_ or [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(Quality)_ - **Retrieval** — [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX) diff --git a/docs/local-ai.md b/docs/local-ai.md index 7ea18ee..1e67dc8 100644 --- a/docs/local-ai.md +++ b/docs/local-ai.md @@ -18,17 +18,19 @@ the pipeline or debugging an unexpected answer. Three pipelines load together on first use: -| Role | Model | On disk | Peak RAM | License | -| ---------------- | ------------------------------------------------------------------------------------- | ------- | -------- | ----------------- | -| Chat _(Compact)_ | [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) | ~1.2 GB | ~2.0 GB | LFM Open Lic v1.0 | -| Chat _(Quality)_ | [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(picker upgrade)_ | ~1.5 GB | ~3.5 GB | LFM Open Lic v1.0 | -| Retrieval | [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX) | ~309 MB | ~400 MB | Gemma Terms | -| Reranking | [MS MARCO MiniLM-L-6-v2](https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2) | ~23 MB | ~90 MB | Apache 2.0 | - -A fresh visitor on the **Compact** tier downloads **~1.55 GB** total. -The chat tier is user-selectable in the consent modal; both LFM tiers -share the embedder and reranker, so swapping tiers only re-downloads -the chat slot. 
+| Role | Model | On disk | Peak RAM | License | +| ---------------- | ------------------------------------------------------------------------------------- | -------- | -------- | ----------------- | +| Chat _(Compact)_ | [LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) | ~810 MB | ~2.0 GB | LFM Open Lic v1.0 | +| Chat _(Quality)_ | [LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B) _(picker upgrade)_ | ~1.55 GB | ~3.5 GB | LFM Open Lic v1.0 | +| Retrieval | [EmbeddingGemma-300M](https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX) | ~320 MB | ~500 MB | Gemma Terms | +| Reranking | [MS MARCO MiniLM-L-6-v2](https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2) | ~23 MB | ~90 MB | Apache 2.0 | + +A fresh visitor on the **Compact** tier downloads **~1.15 GB** total +(810 MB chat + 320 MB embed + 23 MB rerank); switching to **Quality** +adds ~750 MB on top for the 2.6B chat weights, landing the bundle +at **~1.9 GB**. The chat tier is user-selectable in the consent modal; +both LFM tiers share the embedder and reranker, so swapping tiers +only re-downloads the chat slot. The registry of these models lives in [`src/utils/ai-models.ts`](../src/utils/ai-models.ts) — every entry @@ -224,7 +226,7 @@ caches: ```mermaid flowchart LR subgraph Cold["Cold visit"] - CDN[Hugging Face CDN] --> Cache[CacheStorage
~1.55 GB] + CDN[Hugging Face CDN] --> Cache[CacheStorage
~1.15 GB Compact / ~1.9 GB Quality] PDF1([upload PDF]) --> Embed1[chunk + embed] --> IDB1[(IndexedDB)] end diff --git a/src/components/AiModelDetailsModal.tsx b/src/components/AiModelDetailsModal.tsx index 087f476..544aafa 100644 --- a/src/components/AiModelDetailsModal.tsx +++ b/src/components/AiModelDetailsModal.tsx @@ -51,7 +51,7 @@ interface AiModelDetailsModalProps { * and clear the consent flags so the user re-experiences the * download dialog on next use. Wire from `useRagModels.evict`. * Hidden when omitted; rendered with an inline two-step confirm - * when present so a stray click can't nuke a 1.5 GB download. + * when present so a stray click can't nuke a 1+ GB download. */ onDelete?: () => void | Promise; /** @@ -290,7 +290,8 @@ function RequirementsLine({ totalBytes }: { totalBytes: number }) { * Footer panel offering the two storage knobs: a soft "Free memory" * (release RAM, keep the downloaded weights cached on disk so the * next use warm-loads in seconds) and a destructive "Delete cached - * models" (also evict the CacheStorage bytes, ~1.5 GB). + * models" (also evict the CacheStorage bytes — roughly 1.2 GB on the + * Compact tier, 1.9 GB on Quality). * * The destructive action goes through a two-step confirm: the first * click swaps the button into an "armed" state with a red warning diff --git a/src/components/ChatModelPicker.tsx b/src/components/ChatModelPicker.tsx index 1f7be5b..064a4b4 100644 --- a/src/components/ChatModelPicker.tsx +++ b/src/components/ChatModelPicker.tsx @@ -3,8 +3,8 @@ * * Two tiers, each backed by an entry in `src/utils/ai-models.ts`: * - * - Compact → LFM2.5-1.2B-Instruct (~1.2 GB / ~2 GB peak) - * - Quality → LFM2-2.6B (~1.5 GB / ~3.5 GB peak) + * - Compact → LFM2.5-1.2B-Instruct (~810 MB / ~2 GB peak) + * - Quality → LFM2-2.6B (~1.55 GB / ~3.5 GB peak) * * The picker shows download size and peak RAM so users can see what * they're committing to. 
We deliberately do **not** auto-recommend a diff --git a/src/config/tool-registry.ts b/src/config/tool-registry.ts index ed3cd09..4fffb65 100644 --- a/src/config/tool-registry.ts +++ b/src/config/tool-registry.ts @@ -361,11 +361,14 @@ export const tools: Tool[] = [ beta: true, // Two chat tiers ship today (see `CHAT_VARIANT_IDS` in // `src/utils/ai-models.ts`): LFM2.5-1.2B-Instruct Compact - // (~1.2 GB / 2 GB peak) and LFM2-2.6B Quality (~1.5 GB / 3.5 GB + // (~810 MB / 2 GB peak) and LFM2-2.6B Quality (~1.55 GB / 3.5 GB // peak). Both are Liquid AI's LFM family — the cross-tier e2e // showed they dominate SmolLM2-1.7B on speed AND extraction // discipline, so we dropped SmolLM2 entirely. The embedder is - // shared — EmbeddingGemma 300M (~197 MB / ~300 MB peak RAM). + // shared — EmbeddingGemma 300M (~320 MB / ~500 MB peak RAM, + // tokenizer included). A 23 MB MS MARCO MiniLM reranker rides + // alongside. Aggregate first-time download: ~1.15 GB on Compact, + // ~1.9 GB on Quality. // With OS + browser + tab overhead the practical floor for the // Quality tier is ~16 GB; users on lower-RAM machines should // pick Compact in the gate's tier picker. The `desktopOnly` flag below hides the tool on mobile diff --git a/src/hooks/useRagModels.ts b/src/hooks/useRagModels.ts index a6bbf06..30695f5 100644 --- a/src/hooks/useRagModels.ts +++ b/src/hooks/useRagModels.ts @@ -98,8 +98,9 @@ export interface UseRagModelsReturn { * Full evict — releases RAM **and** deletes the model weights * from the browser's CacheStorage, then clears every * `cloakpdf:ai-model-ready:*` flag so the consent dialog re- - * appears on next use. Frees ~1.5 GB of disk for the current AI - * bundle; the user pays a full re-download next time they touch + * appears on next use. Frees roughly 1.2 GB (Compact) / 1.9 GB + * (Quality) of disk for the current AI bundle; the user pays a + * full re-download next time they touch * the AI feature. 
Returns the cache-evict result so the caller * can tell the user how much was actually deleted (0 caches * means there was nothing cached to begin with). diff --git a/src/tools/AskPdf.tsx b/src/tools/AskPdf.tsx index 848ed70..1f4e72d 100644 --- a/src/tools/AskPdf.tsx +++ b/src/tools/AskPdf.tsx @@ -566,7 +566,7 @@ function IndexingCard({ progress }: { progress: IndexingProgress | null }) { * **Weight choice** (extract = 30 %, embed = 70 %): on a typical * text-layer PDF, extraction is fast (a few ms per page) while * embedding is the long pole — each batch is a WASM forward pass - * against a 309 MB int8 model. Roughly matches the wall-clock split + * against a ~320 MB int8 embedder. Roughly matches the wall-clock split * we see on the résumé fixture. OCR-heavy PDFs invert this, but we * accept the small lie there because OCR users see the dedicated * "Running OCR on scanned pages…" label and know it's slow. diff --git a/src/utils/ai-models.ts b/src/utils/ai-models.ts index abeb3ea..004f0be 100644 --- a/src/utils/ai-models.ts +++ b/src/utils/ai-models.ts @@ -14,10 +14,10 @@ * The chat slot ships **two tiers** (see {@link CHAT_VARIANT_IDS}), * both from Liquid AI's LFM family: * - * - `lfm2.5-1.2b` — Compact: ~1.2 GB / ~2 GB peak. Liquid AI's + * - `lfm2.5-1.2b` — Compact: ~810 MB / ~2 GB peak. Liquid AI's * latest 1.2B hybrid (LFM2.5 = LFM2 base + extended pretraining * + RL post-training). The static default for fresh visitors. - * - `lfm2-2.6b` — Quality: ~1.5 GB / ~3.5 GB peak. Liquid AI's + * - `lfm2-2.6b` — Quality: ~1.55 GB / ~3.5 GB peak. Liquid AI's * larger hybrid; purpose-built for on-device structured extraction * and RAG. Liquid hasn't shipped a 2.6 B variant of LFM2.5 yet, so * this tier stays on the LFM2 build. Recommended on ≥ 8 GB free RAM. 
@@ -179,21 +179,23 @@ const CHAT_LFM2_5_1_2B: AiModelInfo = { displayName: "LFM2.5 (1.2B, instruct, Liquid AI)", repo: "LiquidAI/LFM2.5-1.2B-Instruct-ONNX", task: "text-generation", - // ~1.2 GB on disk at q4, ~2 GB peak RAM (Liquid AI's published - // q4 size; q4f16 isn't shipped for this repo so we use plain q4 - // which is their documented WebGPU-recommended quant). Same - // hybrid architecture as LFM2-1.2B (10-conv + 6-attention) but - // newer training recipe (extended pretraining + RL post-training) - // — Liquid markets LFM2.5 as the latest of the family. + // ~810 MiB on disk at q4 (`model_q4.onnx_data` is 850 MB decimal on HF + + // tokenizer/configs ~3 MB; the q4 weights count toward both disk + // and RAM), ~2 GB peak RAM. Same hybrid architecture as LFM2-1.2B + // (10-conv + 6-attention) but newer training recipe (extended + // pretraining + RL post-training) — Liquid markets LFM2.5 as the + // latest of the family. We pin `dtype: "q4"` (plain int4 with fp32 + // activations) because it's the WebGPU-validated quant on this repo; + // q4f16 *is* shipped now (760 MB) but introduces fp16 LayerNorms + // that have historically broken onnxruntime-web's WebGPU shader on + // some Chrome builds — sticking with q4 keeps the pipeline robust. // // **Why this slot is LFM2.5-1.2B-Instruct and not LFM2-1.2B**: // straight version-superset. Same parameter count, same family, - // newer training. The q4-vs-q4f16 swap is forced by Liquid's - // ONNX export (they don't ship q4f16 for LFM2.5-1.2B) — q4 with - // fp32 activations is slightly heavier on disk but works on the - // same WebGPU path. Validated against the résumé probe before - // shipping; passes phone/email/address extraction the same way - // the LFM2-1.2B q4f16 build did. + // newer training. We pin `dtype: "q4"` (not q4f16) because the + // q4 build is the one we validated end-to-end on WebGPU — passes + // phone/email/address extraction on the résumé probe the same + // way the prior LFM2-1.2B q4f16 build did. 
// // **Why not LFM2.5-350M**: tried it on paper but the chat slot // has burned every model at ≤ 500M params (SmolLM2-360M, Qwen @@ -201,7 +203,7 @@ const CHAT_LFM2_5_1_2B: AiModelInfo = { // verbatim extraction — they confabulate plausible-looking // digits/emails instead of copying from the retrieved chunk. // Sticking to 1.2B keeps the discipline guarantee. - approxSizeBytes: Math.round(1.2 * 1024 * 1024 * 1024), + approxSizeBytes: Math.round(810 * 1024 * 1024), approxPeakRamBytes: Math.round(2 * 1024 * 1024 * 1024), description: "Liquid AI's latest 1.2B hybrid (extended pretraining + RL post-training over the LFM2 base). Designed for on-device structured extraction and RAG. The smaller of the two LFM2-family tiers we ship.", @@ -239,12 +241,13 @@ const CHAT_LFM2_2_6B: AiModelInfo = { displayName: "LFM2 (2.6B, Liquid AI)", repo: "onnx-community/LFM2-2.6B-ONNX", task: "text-generation", - // ~1.5 GB on disk at q4f16, ~3.5 GB peak RAM. The largest of the - // three tiers — recommended on ≥ 8 GB free RAM. Same hybrid + // ~1.55 GB on disk at q4f16 (`model_q4f16.onnx_data` 1.54 GB on HF + // + tokenizer 3.3 MB + configs), ~3.5 GB peak RAM. The larger of + // the two tiers — recommended on ≥ 8 GB free RAM. Same hybrid // architecture and training discipline as LFM2-1.2B but with the // extra capacity that lets it handle longer, more nuanced // extraction questions. - approxSizeBytes: Math.round(1.5 * 1024 * 1024 * 1024), + approxSizeBytes: Math.round(1.55 * 1024 * 1024 * 1024), approxPeakRamBytes: Math.round(3.5 * 1024 * 1024 * 1024), description: "Liquid AI's larger hybrid model. Same on-device extraction discipline as LFM2-1.2B with more capacity for longer answers and harder questions.", @@ -270,8 +273,10 @@ const EMBED: AiModelInfo = { displayName: "EmbeddingGemma (300M)", repo: "onnx-community/embeddinggemma-300m-ONNX", task: "feature-extraction", - // ~309 MB on disk (int8 quantized weights), ~400 MB peak RAM. 
- // 2× the prior bge-base-en-v1.5 (~140 MB) on disk but the + // ~320 MB on disk (~295 MB int8 weights via `model_quantized.onnx` + // + ~26 MB Gemma SentencePiece tokenizer — the latter is non-trivial + // and used to be missed in the registry estimate), ~500 MB peak RAM. + // Bigger than the prior bge-base-en-v1.5 (~140 MB) on disk but the // retrieval quality jump from EmbeddingGemma's asymmetric // task-prefix training is meaningful, and runtime RAM is // comparable thanks to int8 weights vs bge's fp16. 308M params @@ -315,8 +320,8 @@ const EMBED: AiModelInfo = { // Prefix handling lives in `src/rag/embeddings.ts` — swapping // back to a symmetric embedder (e.g. bge) means dropping that // prefix layer. - approxSizeBytes: 309 * 1024 * 1024, - approxPeakRamBytes: 400 * 1024 * 1024, + approxSizeBytes: 320 * 1024 * 1024, + approxPeakRamBytes: 500 * 1024 * 1024, description: "Google's on-device embedding model from the Gemma family. Trained for asymmetric retrieval — applies task-specific prompts to PDF chunks vs your question, then matches them in a 768-dim vector space so the chat model gets the right pages. Multilingual (100+ langs).", bestFor: "Semantic retrieval over PDFs in any of 100+ languages.", diff --git a/src/utils/ai-runtime.ts b/src/utils/ai-runtime.ts index 02ab2db..1a0f69e 100644 --- a/src/utils/ai-runtime.ts +++ b/src/utils/ai-runtime.ts @@ -254,8 +254,9 @@ export interface ModelCacheEvictResult { /** * Evict the Transformers.js model bytes from the browser's - * CacheStorage. Frees ~1.5 GB of disk for the current AI bundle - * (chat + embed + rerank) and forces a fresh download on next use. + * CacheStorage. Frees roughly 1.2 GB on the Compact tier / 1.9 GB on + * Quality for the current AI bundle (chat + embed + rerank) and + * forces a fresh download on next use. * * Does **not** unload the in-memory pipelines — call * {@link disposeAllModels} alongside this when you actually want the