From 6d186cb9f91a4ff8a69427675875852e1254a1c1 Mon Sep 17 00:00:00 2001 From: isHuangXin Date: Thu, 21 May 2026 12:16:23 +0800 Subject: [PATCH] Add bitnet-embeddings-0.6b model adaptation with F16 and I2_S GGUF conversion - Add GGUF conversion tool for bitnet-embeddings-0.6b (safetensors -> F16/I2_S GGUF) - Add Qwen3 architecture support in llama.cpp submodule with per-projection RMSNorm - Add I2_S ternary quantization (2-bit packed -1/0/+1) for lossless precision - Add f16 norm weight support for correct embedding inference - Add AVX512BW SIMD paths for I2_S kernel (~2x throughput on AVX512-capable CPUs) - Guard bitnet-lut-kernels.h include with TL1/TL2 preprocessor checks - Update llama.cpp submodule to dev-bitnet-embedding-0.6b branch - Document F16 (from multilingual-e5-0.6b) and I2_S (from bitnet-embeddings-0.6b) conversion process --- 3rdparty/llama.cpp | 2 +- ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 +++++++++++ src/ggml-bitnet-lut.cpp | 7 + src/ggml-bitnet-mad.cpp | 469 +++++++++++++++- utils/convert-bitnet-embedding-to-gguf.py | 502 ++++++++++++++++++ 5 files changed, 1277 insertions(+), 5 deletions(-) create mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md create mode 100644 utils/convert-bitnet-embedding-to-gguf.py diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp index 1f86f058d..13e129947 160000 --- a/3rdparty/llama.cpp +++ b/3rdparty/llama.cpp @@ -1 +1 @@ -Subproject commit 1f86f058de0c3f4098dedae2ae8653c335c868a1 +Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15 diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md new file mode 100644 index 000000000..9d63c9300 --- /dev/null +++ b/docs/bitnet-embeddings-qwen3-gguf-conversion.md @@ -0,0 +1,302 @@ +# BitNet Embeddings (Qwen3) GGUF Conversion Implementation + +## 1. Background + +`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). 
Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: + +``` +x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) +``` + +This pattern does **not** exist in any standard llama.cpp architecture: +- Standard Qwen3: no per-projection norms +- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) + +### Model Config + +- Architecture: `Qwen3Model` +- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8 +- head_dim: 128 (note: != hidden_size/num_heads = 64) +- intermediate_size: 3072, num_hidden_layers: 28 +- tie_word_embeddings: true +- rope_theta: 1000000, rms_norm_eps: 1e-06 + +### Per-Layer Tensors (7 extra norm tensors per layer) + +| Tensor | Shape | +|--------|-------| +| `self_attn.q_proj.norm.weight` | [1024] | +| `self_attn.k_proj.norm.weight` | [1024] | +| `self_attn.v_proj.norm.weight` | [1024] | +| `self_attn.o_proj.norm.weight` | [2048] | +| `mlp.gate_proj.norm.weight` | [1024] | +| `mlp.up_proj.norm.weight` | [1024] | +| `mlp.down_proj.norm.weight` | [3072] | + +--- + +## 2. 
GGUF Tensor Name Mapping + +| HF Name | GGUF Name | Notes | +|----------|-----------|-------| +| `embed_tokens.weight` | `token_embd.weight` | | +| `norm.weight` | `output_norm.weight` | | +| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | +| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | +| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | +| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | +| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | +| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | +| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | +| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | +| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection | +| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | +| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | +| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | +| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection | +| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection | + +--- + +## 3. Conversion Script + +### `utils/convert-bitnet-embedding-to-gguf.py` + +Standalone conversion script (safetensors → GGUF). 
Key features: + +- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) +- Supports three output types: + - `--outtype f32`: all weights in float32 + - `--outtype f16`: 2D weights and embeddings as float16, norms as float16 + - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16 +- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64) +- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification +- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention) +- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling +- Architecture string: `"qwen3"` + +### I2_S Ternary Packing + +The I2_S format packs ternary weights {-1, 0, +1} into a 2-bit representation: + +- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)` +- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2` +- Every 128 values form a block, packed into 32 bytes +- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`, where for byte `i` of a block (0 ≤ i < 32) `c0..c3` are the codes of block elements `i`, `i+32`, `i+64`, `i+96` (the values are interleaved with stride 32, not consecutive) +- Scale (float32) is appended at the end of the packed data buffer + +### Tensor Type Assignment + +| Tensor Type | f16 mode | i2_s mode | +|-------------|----------|-----------| +| 2D linear weights | float16 | I2_S ternary packed | +| Embedding weights | float16 | float16 | +| Norm weights (1D) | float16 | float16 | + +Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation). + +--- + +## 4. 
C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) + +### 4.1 New Tensor Enums + +Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`: + +```cpp +LLM_TENSOR_ATTN_Q_NORM_IN, +LLM_TENSOR_ATTN_K_NORM_IN, +LLM_TENSOR_ATTN_V_NORM_IN, +LLM_TENSOR_ATTN_OUT_NORM_IN, +LLM_TENSOR_FFN_GATE_NORM_IN, +LLM_TENSOR_FFN_UP_NORM_IN, +LLM_TENSOR_FFN_DOWN_NORM_IN, +``` + +### 4.2 Tensor Name Mappings + +Added to `LLM_ARCH_QWEN3` tensor name map: + +```cpp +{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, +{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, +{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, +{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, +{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, +{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, +{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, +``` + +### 4.3 Layer Struct Fields + +Added to `struct llama_layer`: + +```cpp +struct ggml_tensor * attn_q_norm_in; +struct ggml_tensor * attn_k_norm_in; +struct ggml_tensor * attn_v_norm_in; +struct ggml_tensor * attn_out_norm_in; +struct ggml_tensor * ffn_gate_norm_in; +struct ggml_tensor * ffn_up_norm_in; +struct ggml_tensor * ffn_down_norm_in; +``` + +### 4.4 load_tensors (LLM_ARCH_QWEN3) + +Added optional loading with `TENSOR_NOT_REQUIRED`: + +```cpp +layer.attn_q_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_k_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_v_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); +layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_up_norm_in = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN, "weight", i), 
{n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff}, TENSOR_NOT_REQUIRED); +``` + +Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072). + +### 4.5 build_qwen3() Graph Modifications + +The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to original. + +**Attention per-projection norms:** +``` +// Before Q/K/V matmul: +if (layer.attn_q_norm_in) { + cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); +} else { + cur_q = cur; +} +Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); +// Similarly for K, V +``` + +**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually: + +``` +cur = llm_build_kv(..., wo=NULL, ...); // returns attention output without o_proj +if (layer.attn_out_norm_in) { + cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); +} +cur = ggml_mul_mat(ctx, layer.wo, cur); +``` + +**FFN per-projection norms:** +``` +// Instead of llm_build_ffn(), manually: +if (layer.ffn_gate_norm_in) { + tmp_gate = rms_norm(cur) * gate_norm_in; +} else { + tmp_gate = cur; +} +tmp_gate = matmul(gate_proj, tmp_gate); +// Similarly for up_proj +tmp = silu(tmp_gate) * tmp_up; + +if (layer.ffn_down_norm_in) { + tmp = rms_norm(tmp) * down_norm_in; +} +cur = matmul(down_proj, tmp); +``` + +--- + +## 5. 
GGUF Conversion Process + +There are two GGUF files to produce, from **two different source models**: + +| GGUF Output | Source Model | Description | +|-------------|-------------|-------------| +| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights | +| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights | + +### 5.1 F16 GGUF: from multilingual-e5-0.6b + +The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model: + +```bash +python3 /path/to/llama.cpp/convert_hf_to_gguf.py \ + /path/to/multilingual-e5-0.6b \ + --outtype f16 \ + --outfile embeddings-0.6b-f16.gguf +``` + +**What happens:** +1. Load `model.safetensors` (standard Qwen3 weights, bfloat16) +2. Convert all 2D weights (projections, embeddings) to float16 +3. Convert norm weights to float32 +4. Write GGUF with `qwen3` architecture metadata and tokenizer + +**Output:** ~1.11 GiB (595.78M params) + +### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b + +The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization: + +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-0.6b \ + --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s +``` + +**What happens:** +1. Load `model.safetensors` (BitNet ternary weights, bfloat16) +2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2) +3. 
For each 2D linear weight (q/k/v/o/gate/up/down projections): + - Compute scale: `scale = 1 / mean(|w|)` + - Quantize: `q = round(w * scale).clamp(-1, 1)` + - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2` + - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each) + - Append the float32 scale at the end of the packed data buffer +4. Keep embeddings (`token_embd.weight`) in float16 (not ternary) +5. Keep all norm weights in float16 +6. Skip `output.weight` (lm_head, not needed for embedding models) +7. Write GGUF with `I2_S` type tag for quantized tensors + +**Output:** ~699 MiB (~61% of F16 size) + +### 5.3 Why Two Different Source Models? + +- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference +- `bitnet-embeddings-0.6b` is the **1-bit quantized student model** with ternary weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference +- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization + +### 5.4 Tensor Type Summary + +| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) | +|--------|---------------------|-------------------------| +| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) | +| Embedding (`token_embd.weight`) | float16 | float16 | +| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 | +| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 | +| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 | +| `output.weight` (lm_head) | present | skipped | + +--- + +## 6. 
Build and Run + +```bash +# Build with BitNet repo (includes I2_S support) +cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build --target llama-embedding llama-bench -j$(nproc) + +# Run embedding inference +build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -p "hello world" --embd-normalize 2 --embd-output-format array + +# Benchmark: F16 vs I2_S +build/bin/llama-bench -m embeddings-0.6b-f16.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 + +build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \ + -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0 +``` diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp index 59422d548..beef726f7 100644 --- a/src/ggml-bitnet-lut.cpp +++ b/src/ggml-bitnet-lut.cpp @@ -5,9 +5,16 @@ #include #include +#ifdef __x86_64__ +#include +#endif + #include "ggml-bitnet.h" #include "ggml-quants.h" + +#if defined(GGML_BITNET_ARM_TL1) || defined(GGML_BITNET_X86_TL2) #include "bitnet-lut-kernels.h" +#endif #if defined(GGML_BITNET_ARM_TL1) diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp index 4ba9d6509..f99368bbd 100644 --- a/src/ggml-bitnet-mad.cpp +++ b/src/ggml-bitnet-mad.cpp @@ -24,6 +24,12 @@ static inline int hsum_i32_8(const __m256i a) { const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } +#if defined(__AVX512F__) && defined(__AVX512BW__) +// horizontally add 16 int32_t +static inline int hsum_i32_16(const __m512i a) { + return _mm512_reduce_add_epi32(a); +} +#endif #elif defined(__loongarch_asx) // horizontally add 8 int32_t static inline int hsum_i32_8(const __m256i a) { @@ -196,7 +202,153 @@ size_t quantize_i2_s(const float * src, void * dst, int64_t nrow, int64_t n_per_ } void ggml_vec_dot_i2_i8_s_1x1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512BW__) + 
const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int row = 0; row < nrc; row++) { + __m512i accu = _mm512_setzero_si512(); + + const uint8_t * x_row = x + row * bx / 4; + + for (int i = 0; i < group32_num; i++) { + const uint8_t *px = x_row + i * 1024; + const int8_t *py = y + i * 4096; + __m512i accu32 = _mm512_setzero_si512(); + + // Process 2 blocks per iteration (j+=2), 16 iterations instead of 32 + int j = 0; + for (; j + 1 < 32; j += 2) { + // Load 2 consecutive 32-byte weight blocks into one 512-bit register + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + // Load 2 consecutive 128-byte activation blocks (256 bytes total = 4 x 64) + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1)); + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3)); + + px += 64; + py += 256; + } + // Handle odd remaining block + if (j < 32) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8_3 = 
_mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(yq8_0_256)); + xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(yq8_1_256)); + xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(yq8_2_256)); + xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(yq8_3_256)); + + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1)); + accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3)); + } + accu = _mm512_add_epi32(_mm512_madd_epi16(accu32, one16), accu); + } + + for (int i = 0; i < groupla_num; i++) { + __m512i accula = _mm512_setzero_si512(); + const uint8_t *px = x_row + group32_num * 1024; + const int8_t *py = y + group32_num * 4096; + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + xq8_0 = 
_mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1)); + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3)); + + px += 64; + py += 256; + } + if (j < la_num) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(yq8_0_256)); + xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(yq8_1_256)); + xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(yq8_2_256)); + xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(yq8_3_256)); + + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1)); + accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3)); + } + accu = _mm512_add_epi32(accu, _mm512_madd_epi16(accula, one16)); + } + + int sumi = hsum_i32_16(accu); + s[row] = (float)sumi; + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -510,7 +662,184 @@ void ggml_vec_dot_i2_i8_s_1x4_32W(int n, float * s, size_t bs, const void * vx, } void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if 
defined(__AVX512F__) && defined(__AVX512BW__) + const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int row = 0; row < nrc; row += PARALLEL_SIZE) { + __m512i accu[PARALLEL_SIZE]; + const uint8_t * x_row[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_setzero_si512(); + x_row[rb] = x + (row + rb) * bx / 4; + } + + for (int i = 0; i < group32_num; i++) { + const uint8_t * px[PARALLEL_SIZE]; + __m512i accu32[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + px[rb] = x_row[rb] + i * 1024; + accu32[rb] = _mm512_setzero_si512(); + } + const int8_t *py = y + i * 4096; + + int j = 0; + for (; j + 1 < 32; j += 2) { + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); + __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb])); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 64; + } + py += 256; + } + if (j < 
32) { + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + __m512i yq8_0 = _mm512_castsi256_si512(yq8_0_256); + __m512i yq8_1 = _mm512_castsi256_si512(yq8_1_256); + __m512i yq8_2 = _mm512_castsi256_si512(yq8_2_256); + __m512i yq8_3 = _mm512_castsi256_si512(yq8_3_256); + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb])); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 32; + } + } + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_add_epi32(_mm512_madd_epi16(accu32[rb], one16), accu[rb]); + } + } + + for (int i = 0; i < groupla_num; i++) { + const int8_t *py = y + group32_num * 4096; + const uint8_t * px[PARALLEL_SIZE]; + __m512i accula[PARALLEL_SIZE]; + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + px[rb] = x_row[rb] + group32_num * 1024; + accula[rb] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i yq8_0 = _mm512_loadu_si512((const __m512i*)(py)); + __m512i yq8_1 = _mm512_loadu_si512((const __m512i*)(py + 64)); + __m512i yq8_2 = _mm512_loadu_si512((const __m512i*)(py + 128)); 
+ __m512i yq8_3 = _mm512_loadu_si512((const __m512i*)(py + 192)); + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb])); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3); + + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 64; + } + py += 256; + } + if (j < la_num) { + __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py)); + __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32)); + __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64)); + __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96)); + __m512i yq8_0 = _mm512_castsi256_si512(yq8_0_256); + __m512i yq8_1 = _mm512_castsi256_si512(yq8_1_256); + __m512i yq8_2 = _mm512_castsi256_si512(yq8_2_256); + __m512i yq8_3 = _mm512_castsi256_si512(yq8_3_256); + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb])); + __m512i xq8_3 = _mm512_castsi256_si512(xq8_3_256); + __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2); + __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4); + __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6); + + xq8_3 = _mm512_and_si512(xq8_3, mask); + xq8_2 = _mm512_and_si512(xq8_2, mask); + xq8_1 = _mm512_and_si512(xq8_1, mask); + xq8_0 = _mm512_and_si512(xq8_0, mask); + + xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0); + xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1); + xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2); + xq8_3 = 
_mm512_maddubs_epi16(xq8_3, yq8_3); + + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1)); + accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3)); + + px[rb] += 32; + } + } + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + accu[rb] = _mm512_add_epi32(accu[rb], _mm512_madd_epi16(accula[rb], one16)); + } + } + + for (int rb = 0; rb < PARALLEL_SIZE; rb++) { + int sumi = hsum_i32_16(accu[rb]); + s[row + rb] = (float)sumi; + } + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -789,7 +1118,139 @@ void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size } void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512BW__) + const uint8_t * x = (uint8_t *)vx; + const int8_t * y = (int8_t *)vy; + + const int nb = n / QK_I2_S; + const int group32_num = nb / 32; + const int la_num = nb % 32; + const int groupla_num = nb % 32 != 0 ? 
1 : 0; + + const __m512i mask = _mm512_set1_epi8(0x03); + const __m512i one16 = _mm512_set1_epi16(1); + + for (int col = 0; col < nrc; col += PARALLEL_SIZE) { + __m512i accu[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_setzero_si512(); + } + + const int8_t * y_col = y + col * by; + + for (int i = 0; i < group32_num; i++) { + const uint8_t *px = x + i * 1024; + const int8_t *py = y_col + i * 4096; + __m512i accu32[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < 32; j += 2) { + __m512i xq8 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by)))))); + } + + px += 64; + py += 256; + } + if (j < 32) { + __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8 = _mm512_castsi256_si512(xq8_256); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16( + 
_mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by))))))); + } + + px += 32; + py += 128; + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accu32[iy], one16), accu[iy]); + } + } + + for (int i = 0; i < groupla_num; i++) { + const uint8_t *px = x + group32_num * 1024; + const int8_t *py = y_col + group32_num * 4096; + __m512i accula[PARALLEL_SIZE]; + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_setzero_si512(); + } + + int j = 0; + for (; j + 1 < la_num; j += 2) { + __m512i xq8 = _mm512_loadu_si512((const __m512i*)(px)); + __m512i xq8_3 = _mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))), + _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by)))))); + } + + px += 64; + py += 256; + } + if (j < la_num) { + __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px)); + __m512i xq8 = _mm512_castsi256_si512(xq8_256); + __m512i xq8_3 = 
_mm512_and_si512(xq8, mask); + __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask); + __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask); + __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask); + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16( + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))), + _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))), + _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by))))))); + } + + px += 32; + py += 128; + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accula[iy], one16), accu[iy]); + } + } + + for (int iy = 0; iy < PARALLEL_SIZE; iy++) { + int sumi = hsum_i32_16(accu[iy]); + s[(col + iy) * bs] = (float)sumi; + } + } +#elif defined(__AVX2__) const uint8_t * x = (uint8_t *)vx; const int8_t * y = (int8_t *)vy; @@ -808,7 +1269,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size accu[iy] = _mm256_setzero_si256(); } - int8_t * y_col = y + col * by; + const int8_t * y_col = y + col * by; for (int i = 0; i < group32_num; i++) { const uint8_t *px = x + i * 1024; diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py new file mode 100644 index 000000000..3a4340734 --- /dev/null +++ b/utils/convert-bitnet-embedding-to-gguf.py @@ -0,0 +1,502 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from hashlib import sha256 +from pathlib import Path +from typing import Any, Iterator + +import numpy as np 
import torch

# Allow using the local gguf-py if present
if "NO_LOCAL_GGUF" not in os.environ:
    _local_gguf = Path(__file__).parent / "gguf-py"
    if _local_gguf.exists():
        sys.path.insert(1, str(_local_gguf))
import gguf

logger = logging.getLogger("convert-bitnet-embedding")

# ---------------------------------------------------------------------------
# Tensor name mapping: HuggingFace -> GGUF
# ---------------------------------------------------------------------------

def build_tensor_name_map(n_layers: int) -> dict[str, str]:
    """Return the HF -> GGUF tensor-name table for a model with *n_layers* blocks.

    Keys are HuggingFace names without any leading "model." prefix; values are
    the GGUF names.  Two global tensors are always present, followed by one
    group of entries per transformer layer.
    """
    # (suffix below "layers.<i>.", suffix below "blk.<i>.") for every layer.
    per_layer_suffixes = (
        # Layer norms
        ("input_layernorm.weight", "attn_norm.weight"),
        ("post_attention_layernorm.weight", "ffn_norm.weight"),
        # Self-attention projections
        ("self_attn.q_proj.weight", "attn_q.weight"),
        ("self_attn.k_proj.weight", "attn_k.weight"),
        ("self_attn.v_proj.weight", "attn_v.weight"),
        ("self_attn.o_proj.weight", "attn_output.weight"),
        # QK head norms (standard Qwen3)
        ("self_attn.q_norm.weight", "attn_q_norm.weight"),
        ("self_attn.k_norm.weight", "attn_k_norm.weight"),
        # Per-projection input norms (BitNet-specific)
        ("self_attn.q_proj.norm.weight", "attn_q_norm_in.weight"),
        ("self_attn.k_proj.norm.weight", "attn_k_norm_in.weight"),
        ("self_attn.v_proj.norm.weight", "attn_v_norm_in.weight"),
        ("self_attn.o_proj.norm.weight", "attn_output_norm_in.weight"),
        # MLP projections
        ("mlp.gate_proj.weight", "ffn_gate.weight"),
        ("mlp.up_proj.weight", "ffn_up.weight"),
        ("mlp.down_proj.weight", "ffn_down.weight"),
        # Per-projection input norms for MLP (BitNet-specific)
        ("mlp.gate_proj.norm.weight", "ffn_gate_norm_in.weight"),
        ("mlp.up_proj.norm.weight", "ffn_up_norm_in.weight"),
        ("mlp.down_proj.norm.weight", "ffn_down_norm_in.weight"),
    )

    name_map: dict[str, str] = {
        "embed_tokens.weight": "token_embd.weight",
        "norm.weight": "output_norm.weight",
    }
    for layer in range(n_layers):
        for hf_suffix, gguf_suffix in per_layer_suffixes:
            name_map[f"layers.{layer}.{hf_suffix}"] = f"blk.{layer}.{gguf_suffix}"

    return name_map


# ---------------------------------------------------------------------------
# Tokenizer handling (GPT-2 / BPE for Qwen3)
# ---------------------------------------------------------------------------

def get_vocab_base_pre(tokenizer) -> str:
    """Identify the BPE pre-tokenizer by fingerprinting how it encodes a probe string.

    The probe text is encoded with *tokenizer*, the token list is hashed with
    SHA-256, and the digest is looked up in a table of known fingerprints.  The
    returned name is what the caller writes as "tokenizer.ggml.pre".

    Raises:
        NotImplementedError: the digest is not in the table of known fingerprints.
    """
    # The zero-width joiner and variation selector are spelled as escapes so
    # they remain visible in the source.
    # NOTE(review): the digest depends on every character of this literal,
    # including runs of spaces - confirm it matches the probe string that was
    # used to produce the hashes below.
    chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636\u200d\U0001f32b\ufe0f (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.debug(f"chktok: {chktok}")
    logger.debug(f"chkhsh: {chkhsh}")

    # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
    #       or pull the latest version of the model from Huggingface
    #       don't edit the hashes manually!
    known_pre_tokenizers = {
        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": "llama-bpe",
        # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
        "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": "deepseek-llm",
        # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
        "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": "deepseek-coder",
        # ref: https://huggingface.co/tiiuae/falcon-7b
        "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": "falcon",
        # ref: https://huggingface.co/openai-community/gpt2
        "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": "gpt-2",
        # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
        "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": "qwen2",
    }
    res = known_pre_tokenizers.get(chkhsh)

    if res is None:
        banner = "**************************************************************************************"
        for message in (
            "\n",
            banner,
            "** WARNING: The BPE pre-tokenizer was not recognized!",
            "** There are 2 possible reasons for this:",
            "** - the model has not been added to convert_hf_to_gguf_update.py yet",
            "** - the pre-tokenization config has changed upstream",
            "** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.",
            "** ref: https://github.com/ggml-org/llama.cpp/pull/6920",
            "**",
            f"** chkhsh: {chkhsh}",
            banner,
            "\n",
        ):
            logger.warning(message)
        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
    logger.debug(f"chkhsh: {chkhsh}")

    return res
+ + +def _does_token_look_special(token: str) -> bool: + """Check if a token looks like a special token (e.g., <|...|>, <...>).""" + if not token: + return False + # Matches patterns like <|endoftext|>, , , [CLS], [SEP], etc. + if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")): + return True + return False + + +def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict): + """Set GPT-2 BPE vocab for Qwen3.""" + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + + tokpre = get_vocab_base_pre(tokenizer) + + tokens: list[str] = [] + toktypes: list[int] = [] + + reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + token = reverse_vocab[i] + + # Only encode-decode non-normalized tokens (matching llama.cpp upstream) + if not added_tokens_decoder[i].normalized: + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + + if added_tokens_decoder[i].special or _does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # Pre-normalize user-defined spaces (for Gemma-style tokenizers) + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + toktypes.append(gguf.TokenType.USER_DEFINED) + + tokens.append(token) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + gguf_writer.add_tokenizer_model("gpt2") + gguf_writer.add_tokenizer_pre(tokpre) + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the + 
# sentence-end marker, not <|im_end|> (151645). For last-token pooling to work + # correctly, llama.cpp must append the same token. + special_vocab.special_token_ids["eos"] = 151643 + special_vocab.add_to_gguf(gguf_writer) + + # Embedding models need EOS token appended for last-token pooling + gguf_writer.add_add_eos_token(True) + + +# --------------------------------------------------------------------------- +# GGUF metadata +# --------------------------------------------------------------------------- + +def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int): + gguf_writer.add_name(dir_model.name) + + n_layers = hparams["num_hidden_layers"] + n_embd = hparams["hidden_size"] + n_head = hparams["num_attention_heads"] + n_head_kv = hparams.get("num_key_value_heads", n_head) + n_ff = hparams["intermediate_size"] + + gguf_writer.add_block_count(n_layers) + gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768)) + gguf_writer.add_embedding_length(n_embd) + gguf_writer.add_feed_forward_length(n_ff) + gguf_writer.add_head_count(n_head) + gguf_writer.add_head_count_kv(n_head_kv) + gguf_writer.add_vocab_size(hparams["vocab_size"]) + + head_dim = hparams.get("head_dim", n_embd // n_head) + gguf_writer.add_rope_dimension_count(head_dim) + gguf_writer.add_key_length(head_dim) + gguf_writer.add_value_length(head_dim) + + if hparams.get("rope_theta") is not None: + gguf_writer.add_rope_freq_base(hparams["rope_theta"]) + if hparams.get("rms_norm_eps") is not None: + gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + + gguf_writer.add_file_type(ftype) + + # Pooling type for embedding models + # Try to read from modules.json / 1_Pooling/config.json (sentence-transformers convention) + pooling_type = None + module_path = dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"].endswith("Pooling"): 
+ pooling_path = dir_model / mod["path"] / "config.json" + if pooling_path.is_file(): + with open(pooling_path, encoding="utf-8") as f: + pooling = json.load(f) + if pooling.get("pooling_mode_mean_tokens"): + pooling_type = gguf.PoolingType.MEAN + elif pooling.get("pooling_mode_cls_token"): + pooling_type = gguf.PoolingType.CLS + elif pooling.get("pooling_mode_lasttoken"): + pooling_type = gguf.PoolingType.LAST + break + if pooling_type is None: + # Default to MEAN pooling for embedding models + logger.info(" No pooling config found, defaulting to MEAN pooling") + pooling_type = gguf.PoolingType.MEAN + gguf_writer.add_pooling_type(pooling_type) + + logger.info(f" n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}") + + +# --------------------------------------------------------------------------- +# Tensor iteration from safetensors +# --------------------------------------------------------------------------- + +def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]: + """Yield (name, tensor) from safetensors files.""" + from safetensors import safe_open + + safetensor_files = sorted(dir_model.glob("*.safetensors")) + if not safetensor_files: + raise FileNotFoundError(f"No .safetensors files in {dir_model}") + + for sf_path in safetensor_files: + logger.info(f"Loading {sf_path.name}") + with safe_open(str(sf_path), framework="pt", device="cpu") as f: + for name in f.keys(): + yield name, f.get_tensor(name) + + +# --------------------------------------------------------------------------- +# I2_S ternary packing (platform-independent) +# --------------------------------------------------------------------------- +# +# I2_S format (from dequantize_row_i2_s in ggml-quants.c): +# - Every 128 values form a block, packed into 32 bytes +# - Each byte stores 4 values at positions [0*32+gp, 1*32+gp, 2*32+gp, 3*32+gp] +# where gp is the byte index within the 32-byte group +# - Encoding per byte: c0=(b>>6)&3, 
c1=(b>>4)&3, c2=(b>>2)&3, c3=(b>>0)&3 +# - Value mapping: 0 -> -1, 1 -> 0, 2 -> +1, 3 -> 0 +# - Scale is stored as a separate tensor (tensor_name + "_scale") + +def quantize_to_i2_s(w: np.ndarray) -> np.ndarray: + """Quantize float weights to ternary and pack into I2_S layout. + + Uses the same quantization as BitLinear weight_quant_minmax(): + scale = 1.0 / mean(|w|) + q = round(w * scale).clamp(-1, 1) + dequant = q / scale = q * mean(|w|) + + The I2_S format is self-contained: packed ternary bytes followed by a f32 scale + appended at the end of the data buffer. + + Args: + w: float weight tensor of shape (M, K) + + Returns: + packed_data: uint8 array containing I2_S packed bytes + scale (as 4 trailing bytes) + """ + M, K = w.shape + n = M * K + w_flat = w.flatten().astype(np.float32) + + # BitLinear weight_quant_minmax: scale = 1/mean(|w|), then round & clamp + abs_mean = np.mean(np.abs(w_flat)) + abs_mean = max(abs_mean, 1e-5) + inv_scale = 1.0 / abs_mean + q_float = np.round(w_flat * inv_scale).clip(-1, 1) # ternary: {-1, 0, 1} + + # scale for dequantization = abs_mean (i.e., dequant = q * abs_mean) + scale = np.float32(abs_mean) + + # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2} + # -1 -> 0, 0 -> 1, +1 -> 2 + q = np.ones(n, dtype=np.uint8) # default to 1 (zero) + q[q_float > 0.5] = 2 # +1 -> 2 + q[q_float < -0.5] = 0 # -1 -> 0 + + # Pack into I2_S layout: 128-value blocks, interleaved into 32 bytes + # Pad to multiple of 128 + pad_len = (128 - n % 128) % 128 + if pad_len: + q = np.pad(q, (0, pad_len), constant_values=1) + + n_padded = len(q) + n_blocks = n_padded // 128 + + q = q.reshape(n_blocks, 4, 32) + + # Pack: byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3 + packed = (q[:, 0, :].astype(np.uint8) << 6) | \ + (q[:, 1, :].astype(np.uint8) << 4) | \ + (q[:, 2, :].astype(np.uint8) << 2) | \ + (q[:, 3, :].astype(np.uint8)) + + packed = packed.reshape(-1).astype(np.uint8) + + # I2_S format: packed_bytes + 32-byte aligned tail (scale in first 4 bytes of 
tail) + # Total size = n_elements / 4 + 32 (as defined in ggml.c) + packed_size = n // 4 + total_size = packed_size + 32 + result = np.zeros(total_size, dtype=np.uint8) + result[:len(packed)] = packed[:packed_size] + # Write scale as float32 at offset packed_size + result[packed_size:packed_size+4] = np.frombuffer(scale.tobytes(), dtype=np.uint8) + + return result + + +# --------------------------------------------------------------------------- +# Main conversion +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF") + parser.add_argument("model", type=Path, help="Model directory") + parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file") + parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16", + help="Output type: f32, f16, or i2_s (ternary quantized)") + parser.add_argument("--verbose", action="store_true") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + dir_model = args.model + if not dir_model.is_dir(): + logger.error(f"{dir_model} is not a directory") + sys.exit(1) + + # Default output filename + if args.outfile is None: + suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype] + args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf" + + # Load config + with open(dir_model / "config.json") as f: + hparams = json.load(f) + + arch = hparams.get("model_type", "qwen3") + assert arch == "qwen3", f"Expected qwen3 architecture, got {arch}" + + n_layers = hparams["num_hidden_layers"] + + # Determine ftype + if args.outtype == "f32": + ftype = 0 # GGML F32 + elif args.outtype == "f16": + ftype = 1 # GGML F16 + else: # i2_s + ftype = 40 # LLAMA_FTYPE_MOSTLY_I2_S + + logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})") + + # Create GGUF writer + gguf_writer = 
gguf.GGUFWriter(str(args.outfile), "qwen3") + + # Set parameters + set_gguf_parameters(gguf_writer, hparams, dir_model, ftype) + + # Set vocab + logger.info("Setting tokenizer/vocab...") + set_vocab(gguf_writer, dir_model, hparams) + + # Build tensor name map + tensor_map = build_tensor_name_map(n_layers) + + # Process tensors + logger.info("Processing tensors...") + tensor_count = 0 + for hf_name, data_torch in iter_tensors(dir_model): + # Skip tensors we don't need + if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Strip "model." prefix if present + name = hf_name + if name.startswith("model."): + name = name[len("model."):] + + # Look up GGUF name + gguf_name = tensor_map.get(name) + if gguf_name is None: + logger.warning(f"Skipping unmapped tensor: {hf_name}") + continue + + old_dtype = data_torch.dtype + + # Convert bf16 -> f32 first (bf16 not directly supported by gguf) + if data_torch.dtype == torch.bfloat16: + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + data_shape = data.shape + + # Determine if this is a linear weight suitable for ternary quantization + is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight") + is_embed = gguf_name == "token_embd.weight" + is_linear_weight = n_dims == 2 and not is_norm and not is_embed + suit_i2 = is_linear_weight + + if args.outtype == "i2_s" and suit_i2: + # --- I2_S ternary packing (scale embedded in data) --- + packed = quantize_to_i2_s(data) + data_qtype = gguf.GGMLQuantizationType.I2_S + + shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}" + logger.info(f" {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}") + + gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype) + tensor_count += 1 + + elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed): + # 2D weight tensors (linear + 
embedding) -> f16 + data = data.astype(np.float16) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") + gguf_writer.add_tensor(gguf_name, data) + tensor_count += 1 + + else: + # norms, 1D tensors + if args.outtype in ("f16", "i2_s"): + data = data.astype(np.float16) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16") + else: + if data.dtype != np.float32: + data = data.astype(np.float32) + logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32") + gguf_writer.add_tensor(gguf_name, data) + tensor_count += 1 + + logger.info(f"Total tensors written: {tensor_count}") + + # Note: output.weight (lm_head) is skipped for embedding models — + # it is not needed (no token generation) and saves ~297MB for this model. + + # Write GGUF + logger.info(f"Writing to {args.outfile}...") + gguf_writer.write_header_to_file() + gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file() + gguf_writer.close() + + logger.info("Done!") + + +if __name__ == "__main__": + main()