diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index bea7137..01d273d 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -53,10 +53,10 @@ "smollm2-135m-instruct-q8_0.gguf", 135, ), - "Qwen3-0.6B": ( - "unsloth/Qwen3-0.6B-GGUF", - "Qwen3-0.6B-Q4_K_M.gguf", - 378, + "Qwen3.5-0.8B": ( + "unsloth/Qwen3.5-0.8B-GGUF", + "Qwen3.5-0.8B-Q4_K_M.gguf", + 508, ), "Llama-3.2-1B": ( "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", diff --git a/quant.h b/quant.h index 2070249..e4e61c8 100644 --- a/quant.h +++ b/quant.h @@ -9982,24 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) { free(tensors); - /* Qwen RMSNorm adjustment: Qwen's RMSNorm computes - * output = norm(x) * (1.0 + weight), NOT norm(x) * weight. - * We bake the "+1" into the weight so tq_rmsnorm can stay as - * out = x * rsqrt * weight. - * - * This applies to: input_layernorm, post_attention_layernorm, - * model.norm, q_norm, k_norm. - * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated - * uses plain weight without +1). - * - * Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.) - * Detected by arch string or DeltaNet presence. */ - int is_qwen_family = (model->config.delta_n_heads > 0); - if (model->gguf_ctx) { - const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx; - if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1; - } - if (is_qwen_family) { + /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment. + * Only for non-GGUF models (raw checkpoints). GGUF files from + * llama.cpp already have +1 baked in by the converter. + * Qwen2/Qwen3 use standard RMSNorm and never need +1. */ + if (model->config.delta_n_heads > 0 && !model->gguf_ctx) { int dim_h = model->config.hidden_dim; int head_dim_h = model->config.head_dim; diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c index e0a0113..9b9eccb 100644 --- a/src/engine/tq_model.c +++ b/src/engine/tq_model.c @@ -4065,6 +4065,13 @@ skip_q4_conversion: ; #undef GGUF_KEY + /* NOTE: No runtime RMSNorm +1 adjustment for GGUF models. + * - Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed. + * - Qwen3.5/Gemma: use (1+weight) convention, but llama.cpp's GGUF + * converter already bakes +1 into the weights during conversion. + * Adding +1 at runtime would double-apply and cause activation explosion. + * The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. */ + /* Initialize persistent Metal GPU buffers for layer-level compute */ #ifdef TQ_HAS_METAL { diff --git a/wasm/index.html b/wasm/index.html index 42d42ad..0abd7ec 100644 --- a/wasm/index.html +++ b/wasm/index.html @@ -121,11 +121,11 @@
No install. No API key. No server. Just click.