From 4c839b976449d81947e62821fb7644463c2fb805 Mon Sep 17 00:00:00 2001 From: quantumaikr Date: Fri, 10 Apr 2026 15:16:07 +0900 Subject: [PATCH] Fix Qwen RMSNorm: revert runtime +1 for GGUF + switch demo to Qwen3.5 PR #23 incorrectly added RMSNorm +1 for all Qwen-family GGUF models. Investigation reveals: - Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed - Qwen3.5/Gemma: use (1+weight), but llama.cpp's GGUF converter already bakes +1 into the weights during conversion - Runtime +1 was double-applying for Qwen3.5 and incorrectly applying for Qwen2/3, causing activation explosion Fix: skip runtime +1 for all GGUF models. Only apply for non-GGUF (raw checkpoint) DeltaNet models. Also switch WASM demo default from Qwen3-0.6B Q4_K_M (broken due to double-quantization on a tiny model) to Qwen3.5-0.8B Q4_K_M (~508 MB) which produces coherent output at 25 tok/s. Verified: - Qwen3.5 0.8B Q8_0: coherent English output - Llama 3.2 1B Q8_0: coherent English output (unchanged) - Qwen3 0.6B Q4_K_M: real words now (was garbage Unicode), but quality limited by double-quantization on 0.6B model Co-Authored-By: Claude Opus 4.6 (1M context) --- bindings/python/quantcpp/__init__.py | 8 ++++---- quant.h | 23 +++++------------------ src/engine/tq_model.c | 7 +++++++ wasm/index.html | 18 +++++++++--------- wasm/quant.wasm | Bin 244160 -> 244121 bytes 5 files changed, 25 insertions(+), 31 deletions(-) diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index bea7137..01d273d 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -53,10 +53,10 @@ "smollm2-135m-instruct-q8_0.gguf", 135, ), - "Qwen3-0.6B": ( - "unsloth/Qwen3-0.6B-GGUF", - "Qwen3-0.6B-Q4_K_M.gguf", - 378, + "Qwen3.5-0.8B": ( + "unsloth/Qwen3.5-0.8B-GGUF", + "Qwen3.5-0.8B-Q4_K_M.gguf", + 508, ), "Llama-3.2-1B": ( "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", diff --git a/quant.h b/quant.h index 2070249..e4e61c8 100644 --- a/quant.h +++ b/quant.h @@ -9982,24 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) { free(tensors); - /* Qwen RMSNorm adjustment: Qwen's RMSNorm computes - * output = norm(x) * (1.0 + weight), NOT norm(x) * weight. - * We bake the "+1" into the weight so tq_rmsnorm can stay as - * out = x * rsqrt * weight. - * - * This applies to: input_layernorm, post_attention_layernorm, - * model.norm, q_norm, k_norm. - * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated - * uses plain weight without +1). - * - * Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.) - * Detected by arch string or DeltaNet presence. */ - int is_qwen_family = (model->config.delta_n_heads > 0); - if (model->gguf_ctx) { - const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx; - if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1; - } - if (is_qwen_family) { + /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment. + * Only for non-GGUF models (raw checkpoints). GGUF files from + * llama.cpp already have +1 baked in by the converter. + * Qwen2/Qwen3 use standard RMSNorm and never need +1. */ + if (model->config.delta_n_heads > 0 && !model->gguf_ctx) { int dim_h = model->config.hidden_dim; int head_dim_h = model->config.head_dim; diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c index e0a0113..9b9eccb 100644 --- a/src/engine/tq_model.c +++ b/src/engine/tq_model.c @@ -4065,6 +4065,13 @@ skip_q4_conversion: ; #undef GGUF_KEY + /* NOTE: No runtime RMSNorm +1 adjustment for GGUF models. + * - Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed. + * - Qwen3.5/Gemma: use (1+weight) convention, but llama.cpp's GGUF + * converter already bakes +1 into the weights during conversion. + * Adding +1 at runtime would double-apply and cause activation explosion. + * The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. */ + /* Initialize persistent Metal GPU buffers for layer-level compute */ #ifdef TQ_HAS_METAL { diff --git a/wasm/index.html b/wasm/index.html index 42d42ad..0abd7ec 100644 --- a/wasm/index.html +++ b/wasm/index.html @@ -121,11 +121,11 @@

LLM in Your Browser

No install. No API key. No server. Just click.

-