From 58d8ad66033992b1639dd09661bb3b1dc47f1eb0 Mon Sep 17 00:00:00 2001
From: quantumaikr <hi@quantumai.kr>
Date: Fri, 10 Apr 2026 23:52:59 +0900
Subject: [PATCH] docs(guide): add 'When to use which?' scenario table + C code
 in CTA

Address Reddit feedback: guide only showed KV compression benchmarks
vs llama.cpp but didn't explain when to use quant.cpp vs llama.cpp.

Changes:
1. Added "When to use which?" table after the PPL comparison with
   concrete scenarios (WASM 192KB, MCU, game engines, teaching)
   and explicit acknowledgment of llama.cpp strengths (GPU, models)
2. CTA now shows both Python AND C single-header code side by side,
   reinforcing the "one file" value proposition
3. Updated the cta.desc i18n string for EN and KO (new table text is English-only)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 site/index.html | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)
diff --git a/site/index.html b/site/index.html
index 68cc9c4..8624585 100644
--- a/site/index.html
+++ b/site/index.html
@@ -480,7 +480,7 @@ <h3 class="reveal">Compression vs Quality</h3>
       </table>
     </div>
 
-    <h3 class="reveal">vs llama.cpp</h3>
+    <h3 class="reveal">vs llama.cpp KV compression</h3>
     <p class="reveal">Same 4-bit budget, 3.5x less quality degradation:</p>
     <div class="viz reveal">
       <div class="viz-title">PPL Degradation at 4-bit (lower is better)</div>
@@ -494,6 +494,23 @@ <h3 class="reveal">vs llama.cpp</h3>
       </div>
     </div>
 
+    <h3 class="reveal" style="margin-top:3rem">When to use which?</h3> <!-- scenario comparison: quant.cpp vs llama.cpp -->
+    <p class="reveal" style="color:var(--text2);margin-bottom:1rem">llama.cpp is excellent. The difference is integration scope, not capability:</p>
+    <div class="reveal" style="overflow-x:auto"> <!-- wrapper lets a too-wide table scroll horizontally inside this box -->
+      <table> <!-- in each row the var(--green) cell is the side being recommended, var(--text2) the other -->
+        <thead><tr><th scope="col">Scenario</th><th scope="col">quant.cpp</th><th scope="col">llama.cpp</th></tr></thead> <!-- scope="col" ties each data cell to its column header for assistive tech -->
+        <tbody>
+          <tr><td>WASM browser demo</td><td style="color:var(--green)">192 KB binary</td><td style="color:var(--text2)">Tensor graph too large</td></tr>
+          <tr><td>Microcontroller / RTOS</td><td style="color:var(--green)">#include only</td><td style="color:var(--text2)">Needs build system</td></tr>
+          <tr><td>Game engine plugin</td><td style="color:var(--green)">Drop one .h file</td><td style="color:var(--text2)">250K LOC build</td></tr>
+          <tr><td>Learn in an afternoon</td><td style="color:var(--green)">16K LOC</td><td style="color:var(--text2)">250K+ LOC</td></tr>
+          <tr><td>GPU throughput</td><td style="color:var(--text2)">Basic</td><td style="color:var(--green)">Full Metal/CUDA</td></tr>
+          <tr><td>Model coverage</td><td style="color:var(--text2)">7 architectures</td><td style="color:var(--green)">100+</td></tr>
+        </tbody>
+      </table>
+    </div>
+    <p class="reveal" style="color:var(--text2);font-size:0.85rem;margin-top:0.5rem">Use llama.cpp for speed on a workstation. Use quant.cpp when you need to ship LLM inference <em>inside</em> something.</p>
+
     <h3 class="reveal">Context Length on 8GB Mac</h3>
     <div class="reveal">
       <table>
@@ -572,12 +589,28 @@ <h2 class="reveal" data-i18n="gl.title">Glossary</h2>
 <section class="cta" style="background:var(--bg2)">
   <div class="container reveal">
     <h2 style="margin-bottom:1rem" data-i18n="cta.title">Try It Yourself</h2>
-    <p style="color:var(--text2);margin-bottom:2rem;max-width:500px;margin-left:auto;margin-right:auto" data-i18n="cta.desc">Three lines of Python. No GPU, no API key, no setup.</p>
-    <pre style="text-align:left;display:inline-block;margin-bottom:2rem"><code>pip install quantcpp
+    <p style="color:var(--text2);margin-bottom:2rem;max-width:560px;margin-left:auto;margin-right:auto" data-i18n="cta.desc">Python one-liner or C single-header. No GPU, no API key, no setup.</p> <!-- NOTE(review): "one-liner" sits above a four-line Python sample; this text is duplicated under the cta.desc i18n key, so reword both together -->
+    <div style="display:flex;gap:1.5rem;flex-wrap:wrap;justify-content:center;margin-bottom:2rem;text-align:left"> <!-- Python and C samples in one centered flex row; flex-wrap:wrap lets the second sample drop below the first when the row is too narrow -->
+      <div> <!-- sample 1: Python package usage -->
+        <div style="font-size:0.75rem;color:var(--text2);margin-bottom:0.3rem;font-weight:600">Python</div>
+        <pre style="margin:0"><code>pip install quantcpp
 
 from quantcpp import Model
 m = Model.from_pretrained("Llama-3.2-1B")
 print(m.ask("What is gravity?"))</code></pre>
+      </div>
+      <div> <!-- sample 2: C single-header usage. NOTE(review): print_token is not declared in this snippet; confirm quant.h provides it, otherwise the sample does not compile as shown -->
+        <div style="font-size:0.75rem;color:var(--text2);margin-bottom:0.3rem;font-weight:600">C (single header)</div>
+        <pre style="margin:0"><code>#include "quant.h"
+
+int main() {
+    quant_model* m = quant_load("model.gguf");
+    quant_generate(quant_new(m, NULL),
+        "Hello!", print_token, NULL);
+}
+// cc app.c -lm -lpthread</code></pre>
+      </div>
+    </div>
     <br>
     <a href="https://github.com/quantumaikr/quant.cpp" class="cta-btn cta-primary">GitHub</a>
     <a href="https://pypi.org/project/quantcpp/" class="cta-btn cta-secondary">PyPI</a>
@@ -715,7 +748,7 @@ <h2 style="margin-bottom:1rem" data-i18n="cta.title">Try It Yourself</h2>
     'ch5.label':'Chapter 5','ch5.title':'Benchmarks','ch5.desc':'All measurements on Llama 3.2 1B Instruct (Q8_0 GGUF), Apple M1 Pro, 8 threads.',
     'ch6.label':'Chapter 6','ch6.title':'Research Foundations','ch6.desc':'Each technique in quant.cpp is grounded in peer-reviewed research:',
     'gl.label':'Reference','gl.title':'Glossary',
-    'cta.title':'Try It Yourself','cta.desc':'Three lines of Python. No GPU, no API key, no setup.',
+    'cta.title':'Try It Yourself','cta.desc':'Python one-liner or C single-header. No GPU, no API key, no setup.',
   },
   ko: {
     'nav.problem':'문제점','nav.solution':'핵심 발견','nav.techniques':'4가지 기술',
@@ -748,7 +781,7 @@ <h2 style="margin-bottom:1rem" data-i18n="cta.title">Try It Yourself</h2>
     'ch5.label':'챕터 5','ch5.title':'벤치마크','ch5.desc':'모든 측정: Llama 3.2 1B Instruct (Q8_0 GGUF), Apple M1 Pro, 8 스레드.',
     'ch6.label':'챕터 6','ch6.title':'연구 기반','ch6.desc':'quant.cpp의 각 기술은 동료 심사를 거친 연구에 기반합니다:',
     'gl.label':'참조','gl.title':'용어집',
-    'cta.title':'직접 해보기','cta.desc':'Python 3줄. GPU도, API 키도, 설정도 필요 없습니다.',
+    'cta.title':'직접 해보기','cta.desc':'Python 한 줄 또는 C 헤더 하나. GPU도, API 키도, 설정도 필요 없습니다.',
   }
 };
 

Scenario	quant.cpp	llama.cpp
WASM browser demo	192 KB binary	Tensor graph too large
Microcontroller / RTOS	#include only	Needs build system
Game engine plugin	Drop one .h file	250K LOC build
Learn in an afternoon	16K LOC	250K+ LOC
GPU throughput	Basic	Full Metal/CUDA
Model coverage	7 architectures	100+