Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions bindings/python/quantcpp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@
"smollm2-135m-instruct-q8_0.gguf",
135,
),
"Qwen3-0.6B": (
"unsloth/Qwen3-0.6B-GGUF",
"Qwen3-0.6B-Q4_K_M.gguf",
378,
"Qwen3.5-0.8B": (
"unsloth/Qwen3.5-0.8B-GGUF",
"Qwen3.5-0.8B-Q4_K_M.gguf",
508,
),
"Llama-3.2-1B": (
"hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
Expand Down
23 changes: 5 additions & 18 deletions quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -9982,24 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {

free(tensors);

/* Qwen RMSNorm adjustment: Qwen's RMSNorm computes
* output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
* We bake the "+1" into the weight so tq_rmsnorm can stay as
* out = x * rsqrt * weight.
*
* This applies to: input_layernorm, post_attention_layernorm,
* model.norm, q_norm, k_norm.
* It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
* uses plain weight without +1).
*
* Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.)
* Detected by arch string or DeltaNet presence. */
int is_qwen_family = (model->config.delta_n_heads > 0);
if (model->gguf_ctx) {
const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx;
if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1;
}
if (is_qwen_family) {
/* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
* Only for non-GGUF models (raw checkpoints). GGUF files from
* llama.cpp already have +1 baked in by the converter.
* Qwen2/Qwen3 use standard RMSNorm and never need +1. */
if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
int dim_h = model->config.hidden_dim;
int head_dim_h = model->config.head_dim;

Expand Down
7 changes: 7 additions & 0 deletions src/engine/tq_model.c
Original file line number Diff line number Diff line change
Expand Up @@ -4065,6 +4065,13 @@ skip_q4_conversion: ;

#undef GGUF_KEY

/* NOTE: No runtime RMSNorm +1 adjustment for GGUF models.
* - Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed.
* - Qwen3.5/Gemma: use (1+weight) convention, but llama.cpp's GGUF
* converter already bakes +1 into the weights during conversion.
* Adding +1 at runtime would double-apply and cause activation explosion.
* The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. */

/* Initialize persistent Metal GPU buffers for layer-level compute */
#ifdef TQ_HAS_METAL
{
Expand Down
18 changes: 9 additions & 9 deletions wasm/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,11 @@ <h2>LLM in Your Browser</h2>
<p style="margin-bottom:16px; color:#6ee7b7; font-size:15px">No install. No API key. No server. Just click.</p>

<div class="model-cards" id="modelCards">
<div class="model-card recommended" onclick="loadDemoModel('qwen3-0.6b')">
<div class="name">Qwen3 0.6B</div>
<div class="meta">~378 MB download &middot; Q4_K_M</div>
<div class="model-card recommended" onclick="loadDemoModel('qwen3.5-0.8b')">
<div class="name">Qwen3.5 0.8B</div>
<div class="meta">~508 MB download &middot; Q4_K_M</div>
<span class="tag">Recommended</span>
<div class="meta" style="margin-top:4px">Fast, multilingual, good for demo</div>
<div class="meta" style="margin-top:4px">Fast, multilingual, best quality/size</div>
</div>
<div class="model-card" onclick="loadDemoModel('llama-3.2-1b')">
<div class="name">Llama 3.2 1B</div>
Expand Down Expand Up @@ -167,11 +167,11 @@ <h2>LLM in Your Browser</h2>

// ---- Model registry ----
const MODELS = {
'qwen3-0.6b': {
url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf',
name: 'Qwen3-0.6B Q4_K_M',
size: '~378 MB',
cacheKey: 'qwen3-0.6b-q4km',
'qwen3.5-0.8b': {
url: 'https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf',
name: 'Qwen3.5-0.8B Q4_K_M',
size: '~508 MB',
cacheKey: 'qwen3.5-0.8b-q4km',
chatTemplate: (text) => `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`,
},
'llama-3.2-1b': {
Expand Down
Binary file modified wasm/quant.wasm
Binary file not shown.
Loading