From 6d186cb9f91a4ff8a69427675875852e1254a1c1 Mon Sep 17 00:00:00 2001
From: isHuangXin <huangxin.hust@gmail.com>
Date: Thu, 21 May 2026 12:16:23 +0800
Subject: [PATCH] Add bitnet-embeddings-0.6b model adaptation with F16 and I2_S
 GGUF conversion

- Add GGUF conversion tool for bitnet-embeddings-0.6b (safetensors -> F16/I2_S GGUF)
- Add Qwen3 architecture support in llama.cpp submodule with per-projection RMSNorm
- Add I2_S ternary quantization (2-bit packed -1/0/+1) for lossless precision
- Add f16 norm weight support for correct embedding inference
- Add AVX512BW SIMD paths for I2_S kernel (~2x throughput on AVX512-capable CPUs)
- Guard bitnet-lut-kernels.h include with TL1/TL2 preprocessor checks
- Update llama.cpp submodule to dev-bitnet-embedding-0.6b branch
- Document F16 (from multilingual-e5-0.6b) and I2_S (from bitnet-embeddings-0.6b) conversion process
---
 3rdparty/llama.cpp                            |   2 +-
 ...bitnet-embeddings-qwen3-gguf-conversion.md | 302 +++++++++++
 src/ggml-bitnet-lut.cpp                       |   7 +
 src/ggml-bitnet-mad.cpp                       | 469 +++++++++++++++-
 utils/convert-bitnet-embedding-to-gguf.py     | 502 ++++++++++++++++++
 5 files changed, 1277 insertions(+), 5 deletions(-)
 create mode 100644 docs/bitnet-embeddings-qwen3-gguf-conversion.md
 create mode 100644 utils/convert-bitnet-embedding-to-gguf.py

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 1f86f058d..13e129947 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 1f86f058de0c3f4098dedae2ae8653c335c868a1
+Subproject commit 13e129947db43cbbcbfa985c72c443c2f2757f15
diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md
new file mode 100644
index 000000000..9d63c9300
--- /dev/null
+++ b/docs/bitnet-embeddings-qwen3-gguf-conversion.md
@@ -0,0 +1,302 @@
+# BitNet Embeddings (Qwen3) GGUF Conversion Implementation
+
+## 1. Background
+
+`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul:
+
+```
+x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary))
+```
+
+This pattern does **not** exist in any standard llama.cpp architecture:
+- Standard Qwen3: no per-projection norms
+- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection)
+
+### Model Config
+
+- Architecture: `Qwen3Model`
+- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8
+- head_dim: 128 (note: != hidden_size/num_heads = 64)
+- intermediate_size: 3072, num_hidden_layers: 28
+- tie_word_embeddings: true
+- rope_theta: 1000000, rms_norm_eps: 1e-06
+
+### Per-Layer Tensors (7 extra norm tensors per layer)
+
+| Tensor | Shape |
+|--------|-------|
+| `self_attn.q_proj.norm.weight` | [1024] |
+| `self_attn.k_proj.norm.weight` | [1024] |
+| `self_attn.v_proj.norm.weight` | [1024] |
+| `self_attn.o_proj.norm.weight` | [2048] |
+| `mlp.gate_proj.norm.weight` | [1024] |
+| `mlp.up_proj.norm.weight` | [1024] |
+| `mlp.down_proj.norm.weight` | [3072] |
+
+---
+
+## 2. GGUF Tensor Name Mapping
+
+| HF Name | GGUF Name | Notes |
+|----------|-----------|-------|
+| `embed_tokens.weight` | `token_embd.weight` | |
+| `norm.weight` | `output_norm.weight` | |
+| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | |
+| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | |
+| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | |
+| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | |
+| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | |
+| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | |
+| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm |
+| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | |
+| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | |
+| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | |
+| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | BitNet per-projection |
+| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | BitNet per-projection |
+
+---
+
+## 3. Conversion Script
+
+### `utils/convert-bitnet-embedding-to-gguf.py`
+
+Standalone conversion script (safetensors → GGUF). Key features:
+
+- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter)
+- Supports three output types:
+  - `--outtype f32`: all weights in float32
+  - `--outtype f16`: 2D weights and embeddings as float16, norms as float16
+  - `--outtype i2_s`: ternary weights packed in I2_S layout, non-ternary weights as float16
+- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: the default calculation, `hidden_size / num_attention_heads`, would give the wrong value of 64)
+- GPT-2 BPE tokenizer handling with pre-tokenizer hash verification
+- Pooling type auto-detection from `modules.json` / `1_Pooling/config.json` (sentence-transformers convention)
+- EOS token override: uses `<|endoftext|>` (151643) for correct last-token pooling
+- Architecture string: `"qwen3"`
+
+### I2_S Ternary Packing
+
+The I2_S format packs ternary weights {-1, 0, +1} into 2-bit representation:
+
+- Quantization: `scale = 1/mean(|w|)`, `q = round(w * scale).clamp(-1, 1)`
+- Encoding: `-1 → 0`, `0 → 1`, `+1 → 2`
+- Every 128 values form a block, packed into 32 bytes
+- Each byte stores 4 values: `byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3`
+- Scale (float32) is appended at the end of the packed data buffer
+
+### Tensor Type Assignment
+
+| Tensor Type | f16 mode | i2_s mode |
+|-------------|----------|-----------|
+| 2D linear weights | float16 | I2_S ternary packed |
+| Embedding weights | float16 | float16 |
+| Norm weights (1D) | float16 | float16 |
+
+Note: `output.weight` (lm_head) is skipped for embedding models — it is not needed (no token generation).
+
+---
+
+## 4. C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`)
+
+### 4.1 New Tensor Enums
+
+Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`:
+
+```cpp
+LLM_TENSOR_ATTN_Q_NORM_IN,
+LLM_TENSOR_ATTN_K_NORM_IN,
+LLM_TENSOR_ATTN_V_NORM_IN,
+LLM_TENSOR_ATTN_OUT_NORM_IN,
+LLM_TENSOR_FFN_GATE_NORM_IN,
+LLM_TENSOR_FFN_UP_NORM_IN,
+LLM_TENSOR_FFN_DOWN_NORM_IN,
+```
+
+### 4.2 Tensor Name Mappings
+
+Added to `LLM_ARCH_QWEN3` tensor name map:
+
+```cpp
+{ LLM_TENSOR_ATTN_Q_NORM_IN,   "blk.%d.attn_q_norm_in" },
+{ LLM_TENSOR_ATTN_K_NORM_IN,   "blk.%d.attn_k_norm_in" },
+{ LLM_TENSOR_ATTN_V_NORM_IN,   "blk.%d.attn_v_norm_in" },
+{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" },
+{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" },
+{ LLM_TENSOR_FFN_UP_NORM_IN,   "blk.%d.ffn_up_norm_in" },
+{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" },
+```
+
+### 4.3 Layer Struct Fields
+
+Added to `struct llama_layer`:
+
+```cpp
+struct ggml_tensor * attn_q_norm_in;
+struct ggml_tensor * attn_k_norm_in;
+struct ggml_tensor * attn_v_norm_in;
+struct ggml_tensor * attn_out_norm_in;
+struct ggml_tensor * ffn_gate_norm_in;
+struct ggml_tensor * ffn_up_norm_in;
+struct ggml_tensor * ffn_down_norm_in;
+```
+
+### 4.4 load_tensors (LLM_ARCH_QWEN3)
+
+Added optional loading with `TENSOR_NOT_REQUIRED`:
+
+```cpp
+layer.attn_q_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_v_norm_in   = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head},    TENSOR_NOT_REQUIRED);
+layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_up_norm_in   = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN,   "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff},   TENSOR_NOT_REQUIRED);
+```
+
+Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072).
+
+### 4.5 build_qwen3() Graph Modifications
+
+The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, the behavior is identical to the original.
+
+**Attention per-projection norms:**
+```
+// Before Q/K/V matmul:
+if (layer.attn_q_norm_in) {
+    cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in);
+} else {
+    cur_q = cur;
+}
+Qcur = ggml_mul_mat(ctx, layer.wq, cur_q);
+// Similarly for K, V
+```
+
+**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually:
+
+```
+cur = llm_build_kv(..., wo=NULL, ...);  // returns attention output without o_proj
+if (layer.attn_out_norm_in) {
+    cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps);
+    cur = ggml_mul(ctx, cur, layer.attn_out_norm_in);
+}
+cur = ggml_mul_mat(ctx, layer.wo, cur);
+```
+
+**FFN per-projection norms:**
+```
+// Instead of llm_build_ffn(), manually:
+if (layer.ffn_gate_norm_in) {
+    tmp_gate = rms_norm(cur) * gate_norm_in;
+} else {
+    tmp_gate = cur;
+}
+tmp_gate = matmul(gate_proj, tmp_gate);
+// Similarly for up_proj
+tmp = silu(tmp_gate) * tmp_up;
+
+if (layer.ffn_down_norm_in) {
+    tmp = rms_norm(tmp) * down_norm_in;
+}
+cur = matmul(down_proj, tmp);
+```
+
+---
+
+## 5. GGUF Conversion Process
+
+There are two GGUF files to produce, from **two different source models**:
+
+| GGUF Output | Source Model | Description |
+|-------------|-------------|-------------|
+| `embeddings-0.6b-f16.gguf` | `multilingual-e5-0.6b` (standard Qwen3) | F16 baseline, standard float16 weights |
+| `bitnet-embeddings-0.6b-f16-i2_s.gguf` | `bitnet-embeddings-0.6b` (BitNet ternary) | I2_S ternary packed weights |
+
+### 5.1 F16 GGUF: from multilingual-e5-0.6b
+
+The F16 GGUF is converted from the **standard (non-BitNet) model** `multilingual-e5-0.6b`, which has normal float weights and no per-projection RMSNorm. This uses llama.cpp's standard converter since it is a vanilla Qwen3 model:
+
+```bash
+python3 /path/to/llama.cpp/convert_hf_to_gguf.py \
+  /path/to/multilingual-e5-0.6b \
+  --outtype f16 \
+  --outfile embeddings-0.6b-f16.gguf
+```
+
+**What happens:**
+1. Load `model.safetensors` (standard Qwen3 weights, bfloat16)
+2. Convert all 2D weights (projections, embeddings) to float16
+3. Convert norm weights to float32
+4. Write GGUF with `qwen3` architecture metadata and tokenizer
+
+**Output:** ~1.11 GiB (595.78M params)
+
+### 5.2 I2_S GGUF: from bitnet-embeddings-0.6b
+
+The I2_S GGUF is converted from the **BitNet ternary model** `bitnet-embeddings-0.6b`, which has ternary weights {-1, 0, +1} and 7 extra per-projection RMSNorm tensors per layer. This uses the custom converter because the standard llama.cpp converter does not handle per-projection norms or I2_S quantization:
+
+```bash
+python3 utils/convert-bitnet-embedding-to-gguf.py \
+  /path/to/bitnet-embeddings-0.6b \
+  --outfile bitnet-embeddings-0.6b-f16-i2_s.gguf --outtype i2_s
+```
+
+**What happens:**
+1. Load `model.safetensors` (BitNet ternary weights, bfloat16)
+2. Map HF tensor names to GGUF names, including 7 extra `*_norm_in` tensors per layer (see Section 2)
+3. For each 2D linear weight (q/k/v/o/gate/up/down projections):
+   - Compute scale: `scale = 1 / mean(|w|)`
+   - Quantize: `q = round(w * scale).clamp(-1, 1)`
+   - Encode: `-1 -> 0`, `0 -> 1`, `+1 -> 2`
+   - Pack every 128 values into 32 bytes (4 values per byte, 2 bits each)
+   - Append per-row float32 scale
+4. Keep embeddings (`token_embd.weight`) in float16 (not ternary)
+5. Keep all norm weights in float16
+6. Skip `output.weight` (lm_head, not needed for embedding models)
+7. Write GGUF with `I2_S` type tag for quantized tensors
+
+**Output:** ~699 MiB (~61% of the 1.11 GiB F16 file)
+
+### 5.3 Why Two Different Source Models?
+
+- `multilingual-e5-0.6b` is the **teacher/baseline model** with standard float weights, used as the F16 performance reference
+- `bitnet-embeddings-0.6b` is the **ternary (≈1.58-bit) quantized student model** with {-1, 0, +1} weights and per-projection BitLinear norms, converted to I2_S for efficient CPU inference
+- Benchmarking compares both to measure the throughput gain and quality trade-off of ternary quantization
+
+### 5.4 Tensor Type Summary
+
+| Tensor | F16 (from e5-0.6b) | I2_S (from bitnet-0.6b) |
+|--------|---------------------|-------------------------|
+| Linear projections (q/k/v/o/gate/up/down) | float16 | I2_S (2-bit packed + float32 scale) |
+| Embedding (`token_embd.weight`) | float16 | float16 |
+| Per-projection norms (`*_norm_in`) | N/A (not present) | float16 |
+| Layer norms (`attn_norm`, `ffn_norm`) | float32 | float16 |
+| QK head norms (`attn_q_norm`, `attn_k_norm`) | float32 | float32 |
+| `output.weight` (lm_head) | present | skipped |
+
+---
+
+## 6. Build and Run
+
+```bash
+# Build with BitNet repo (includes I2_S support)
+cmake -S /path/to/BitNet -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build --target llama-embedding llama-bench -j$(nproc)
+
+# Run embedding inference
+build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -p "hello world" --embd-normalize 2 --embd-output-format array
+
+# Benchmark: F16 vs I2_S
+build/bin/llama-bench -m embeddings-0.6b-f16.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+
+build/bin/llama-bench -m bitnet-embeddings-0.6b-f16-i2_s.gguf \
+  -t 8 -p 128,256,512,1024,2048 -n 32,64 -r 3 -ngl 0
+```
diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp
index 59422d548..beef726f7 100644
--- a/src/ggml-bitnet-lut.cpp
+++ b/src/ggml-bitnet-lut.cpp
@@ -5,9 +5,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#ifdef __x86_64__
+#include <immintrin.h>
+#endif
+
 #include "ggml-bitnet.h"
 #include "ggml-quants.h"
+
+#if defined(GGML_BITNET_ARM_TL1) || defined(GGML_BITNET_X86_TL2)
 #include "bitnet-lut-kernels.h"
+#endif
 
 #if defined(GGML_BITNET_ARM_TL1)
 
diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp
index 4ba9d6509..f99368bbd 100644
--- a/src/ggml-bitnet-mad.cpp
+++ b/src/ggml-bitnet-mad.cpp
@@ -24,6 +24,12 @@ static inline int hsum_i32_8(const __m256i a) {
     const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+// Horizontal sum: returns the sum of the 16 packed int32_t lanes of `a`.
+static inline int hsum_i32_16(const __m512i a) {
+    return _mm512_reduce_add_epi32(a);
+}
+#endif
 #elif defined(__loongarch_asx)
 // horizontally add 8 int32_t
 static inline int hsum_i32_8(const __m256i a) {
@@ -196,7 +202,153 @@ size_t quantize_i2_s(const float * src, void * dst, int64_t nrow, int64_t n_per_
 }
 
 void ggml_vec_dot_i2_i8_s_1x1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    // AVX-512 path: integer dot product of 2-bit packed weights with int8 activations.
+    //   n   - number of values per row, consumed in blocks of QK_I2_S
+    //   s   - output; s[row] receives the unscaled integer sum converted to float
+    //   vx  - packed weights, four 2-bit codes per byte; row r starts at x + r * bx / 4
+    //   vy  - int8 activations, reused for every row
+    //   nrc - number of weight rows to process (bs and by are not used by this path)
+    //
+    // Layout, as read by the single-block tail below: a 32-byte weight block covers 128
+    // activations, and byte k carries the codes for activations k, 32+k, 64+k and 96+k in
+    // bits 7:6, 5:4, 3:2 and 1:0. A 512-bit weight load spans two blocks, so each activation
+    // vector has to be [sub-row of block j | same sub-row of block j+1]; loading 64
+    // consecutive activation bytes would pair the upper 32 lanes with the wrong values.
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+
+    for (int row = 0; row < nrc; row++) {
+        __m512i accu = zero;
+
+        const uint8_t * x_row = x + row * bx / 4;
+
+        // Full groups of 32 blocks: products are accumulated in int16 lanes and widened to
+        // int32 once per group.
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t *px = x_row + i * 1024;
+            const int8_t  *py = y + i * 4096;
+            __m512i accu32 = zero;
+
+            // 32 is even, so a full group is consumed as exactly 16 pairs of blocks.
+            for (int j = 0; j < 32; j += 2) {
+                // Low 256 bits = weight block j, high 256 bits = weight block j+1.
+                __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                // Sub-row r of block j (py + 32*r) next to sub-row r of block j+1 (py + 128 + 32*r).
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_0, xq8_1));
+                accu32 = _mm512_add_epi16(accu32, _mm512_add_epi16(xq8_2, xq8_3));
+
+                px += 64;
+                py += 256;
+            }
+            // Widen the int16 partial sums to int32 and fold them into the row accumulator.
+            accu = _mm512_add_epi32(_mm512_madd_epi16(accu32, one16), accu);
+        }
+
+        // Leftover blocks (fewer than 32) after the last full group.
+        if (la_num > 0) {
+            __m512i accula = zero;
+            const uint8_t *px = x_row + group32_num * 1024;
+            const int8_t  *py = y + group32_num * 4096;
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                // Same block-pair activation interleave as in the full-group loop.
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1));
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3));
+
+                px += 64;
+                py += 256;
+            }
+            // Odd block left over: zero-extend the 256-bit loads so the upper lanes add nothing
+            // (_mm512_castsi256_si512 would leave them undefined).
+            if (j < la_num) {
+                __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8_3 = _mm512_inserti64x4(zero, xq8_3_256, 0);
+                __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                xq8_3 = _mm512_and_si512(xq8_3, mask);
+                xq8_2 = _mm512_and_si512(xq8_2, mask);
+                xq8_1 = _mm512_and_si512(xq8_1, mask);
+                xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+
+                xq8_0 = _mm512_maddubs_epi16(xq8_0, _mm512_inserti64x4(zero, yq8_0_256, 0));
+                xq8_1 = _mm512_maddubs_epi16(xq8_1, _mm512_inserti64x4(zero, yq8_1_256, 0));
+                xq8_2 = _mm512_maddubs_epi16(xq8_2, _mm512_inserti64x4(zero, yq8_2_256, 0));
+                xq8_3 = _mm512_maddubs_epi16(xq8_3, _mm512_inserti64x4(zero, yq8_3_256, 0));
+
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_0, xq8_1));
+                accula = _mm512_add_epi16(accula, _mm512_add_epi16(xq8_2, xq8_3));
+            }
+            accu = _mm512_add_epi32(accu, _mm512_madd_epi16(accula, one16));
+        }
+
+        int sumi = hsum_i32_16(accu);
+        s[row] = (float)sumi;
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -510,7 +662,184 @@ void ggml_vec_dot_i2_i8_s_1x4_32W(int n, float * s, size_t bs, const void * vx,
 }
 
 void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    // AVX-512 path: one activation vector (vy) against PARALLEL_SIZE weight rows at a time.
+    //   n   - number of values per row, consumed in blocks of QK_I2_S
+    //   s   - output; s[row + rb] receives the unscaled integer sum converted to float
+    //   vx  - packed weights, four 2-bit codes per byte; row r starts at x + r * bx / 4
+    //   vy  - int8 activations, loaded once per step and reused for every row of the tile
+    //   nrc - number of weight rows; rows are taken PARALLEL_SIZE at a time with no partial
+    //         tile handling, so nrc is presumably a multiple of PARALLEL_SIZE (bs, by unused)
+    //
+    // Layout, as read by the single-block tail below: a 32-byte weight block covers 128
+    // activations, and byte k carries the codes for activations k, 32+k, 64+k and 96+k in
+    // bits 7:6, 5:4, 3:2 and 1:0. A 512-bit weight load spans two blocks, so each activation
+    // vector has to be [sub-row of block j | same sub-row of block j+1]; loading 64
+    // consecutive activation bytes would pair the upper 32 lanes with the wrong values.
+
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+
+    for (int row = 0; row < nrc; row += PARALLEL_SIZE) {
+        __m512i accu[PARALLEL_SIZE];
+        const uint8_t * x_row[PARALLEL_SIZE];
+        for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+            accu[rb] = zero;
+            x_row[rb] = x + (row + rb) * bx / 4;
+        }
+
+        // Full groups of 32 blocks: int16 partial sums, widened to int32 once per group.
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t * px[PARALLEL_SIZE];
+            __m512i accu32[PARALLEL_SIZE];
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                px[rb] = x_row[rb] + i * 1024;
+                accu32[rb] = zero;
+            }
+            const int8_t  *py = y + i * 4096;
+
+            // 32 is even, so a full group is consumed as exactly 16 pairs of blocks.
+            for (int j = 0; j < 32; j += 2) {
+                // Sub-row r of block j (py + 32*r) next to sub-row r of block j+1 (py + 128 + 32*r).
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    // Low 256 bits = weight block j, high 256 bits = weight block j+1.
+                    __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb]));
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accu32[rb] = _mm512_add_epi16(accu32[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 64;
+                }
+                py += 256;
+            }
+            // Widen the int16 partial sums to int32 and fold them into the per-row accumulators.
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                accu[rb] = _mm512_add_epi32(_mm512_madd_epi16(accu32[rb], one16), accu[rb]);
+            }
+        }
+
+        // Leftover blocks (fewer than 32) after the last full group.
+        if (la_num > 0) {
+            const int8_t  *py = y + group32_num * 4096;
+            const uint8_t * px[PARALLEL_SIZE];
+            __m512i accula[PARALLEL_SIZE];
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                px[rb] = x_row[rb] + group32_num * 1024;
+                accula[rb] = zero;
+            }
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                // Same block-pair activation interleave as in the full-group loop.
+                __m512i yq8_0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py +  0))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 128)), 1);
+                __m512i yq8_1 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 32))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 160)), 1);
+                __m512i yq8_2 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 64))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 192)), 1);
+                __m512i yq8_3 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 96))),
+                                                   _mm256_loadu_si256((const __m256i*)(py + 224)), 1);
+
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m512i xq8_3 = _mm512_loadu_si512((const __m512i*)(px[rb]));
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 64;
+                }
+                py += 256;
+            }
+            // Odd block left over: zero-extend the 256-bit loads so the upper lanes add nothing
+            // (_mm512_castsi256_si512 would leave them undefined).
+            if (j < la_num) {
+                __m256i yq8_0_256 = _mm256_loadu_si256((const __m256i*)(py));
+                __m256i yq8_1_256 = _mm256_loadu_si256((const __m256i*)(py + 32));
+                __m256i yq8_2_256 = _mm256_loadu_si256((const __m256i*)(py + 64));
+                __m256i yq8_3_256 = _mm256_loadu_si256((const __m256i*)(py + 96));
+                __m512i yq8_0 = _mm512_inserti64x4(zero, yq8_0_256, 0);
+                __m512i yq8_1 = _mm512_inserti64x4(zero, yq8_1_256, 0);
+                __m512i yq8_2 = _mm512_inserti64x4(zero, yq8_2_256, 0);
+                __m512i yq8_3 = _mm512_inserti64x4(zero, yq8_3_256, 0);
+
+                for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                    __m256i xq8_3_256 = _mm256_loadu_si256((const __m256i*)(px[rb]));
+                    __m512i xq8_3 = _mm512_inserti64x4(zero, xq8_3_256, 0);
+                    __m512i xq8_2 = _mm512_srli_epi16(xq8_3, 2);
+                    __m512i xq8_1 = _mm512_srli_epi16(xq8_3, 4);
+                    __m512i xq8_0 = _mm512_srli_epi16(xq8_3, 6);
+
+                    xq8_3 = _mm512_and_si512(xq8_3, mask);
+                    xq8_2 = _mm512_and_si512(xq8_2, mask);
+                    xq8_1 = _mm512_and_si512(xq8_1, mask);
+                    xq8_0 = _mm512_and_si512(xq8_0, mask);
+
+                    xq8_0 = _mm512_maddubs_epi16(xq8_0, yq8_0);
+                    xq8_1 = _mm512_maddubs_epi16(xq8_1, yq8_1);
+                    xq8_2 = _mm512_maddubs_epi16(xq8_2, yq8_2);
+                    xq8_3 = _mm512_maddubs_epi16(xq8_3, yq8_3);
+
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_0, xq8_1));
+                    accula[rb] = _mm512_add_epi16(accula[rb], _mm512_add_epi16(xq8_2, xq8_3));
+
+                    px[rb] += 32;
+                }
+            }
+            // Widen the int16 partial sums to int32 and fold them into the per-row accumulators.
+            for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+                accu[rb] = _mm512_add_epi32(accu[rb], _mm512_madd_epi16(accula[rb], one16));
+            }
+        }
+
+        // Reduce each row's 16 int32 lanes to a scalar; no scale factor is applied here.
+        for (int rb = 0; rb < PARALLEL_SIZE; rb++) {
+            int sumi = hsum_i32_16(accu[rb]);
+            s[row + rb] = (float)sumi;
+        }
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -789,7 +1118,139 @@ void ggml_vec_dot_i2_i8_s_1xN(int n, float * s, size_t bs, const void * vx, size
 }
 
 void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    const uint8_t *    x = (uint8_t *)vx;
+    const int8_t  *    y = (int8_t *)vy;
+
+    const int nb = n / QK_I2_S;
+    const int group32_num = nb / 32;
+    const int la_num = nb % 32;
+    const int groupla_num = nb % 32 != 0 ? 1 : 0;
+
+    const __m512i mask = _mm512_set1_epi8(0x03);
+    const __m512i one16 = _mm512_set1_epi16(1);
+
+    for (int col = 0; col < nrc; col += PARALLEL_SIZE) {
+        __m512i accu[PARALLEL_SIZE];
+
+        for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+            accu[iy] = _mm512_setzero_si512();
+        }
+
+        const int8_t * y_col = y + col * by;
+
+        for (int i = 0; i < group32_num; i++) {
+            const uint8_t *px = x + i * 1024;
+            const int8_t  *py = y_col + i * 4096;
+            __m512i accu32[PARALLEL_SIZE];
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu32[iy] = _mm512_setzero_si512();
+            }
+
+            int j = 0;
+            for (; j + 1 < 32; j += 2) {
+                __m512i xq8   = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by))))));
+                }
+
+                px += 64;
+                py += 256;
+            }
+            if (j < 32) {
+                __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8   = _mm512_castsi256_si512(xq8_256);
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accu32[iy] = _mm512_add_epi16(accu32[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by)))))));
+                }
+
+                px += 32;
+                py += 128;
+            }
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accu32[iy], one16), accu[iy]);
+            }
+        }
+
+        for (int i = 0; i < groupla_num; i++) {
+            const uint8_t *px = x + group32_num * 1024;
+            const int8_t  *py = y_col + group32_num * 4096;
+            __m512i accula[PARALLEL_SIZE];
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accula[iy] = _mm512_setzero_si512();
+            }
+
+            int j = 0;
+            for (; j + 1 < la_num; j += 2) {
+                __m512i xq8   = _mm512_loadu_si512((const __m512i*)(px));
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_loadu_si512((const __m512i*)(py + 0 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_loadu_si512((const __m512i*)(py + 1 * 64 + iy * by)))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_loadu_si512((const __m512i*)(py + 2 * 64 + iy * by))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_loadu_si512((const __m512i*)(py + 3 * 64 + iy * by))))));
+                }
+
+                px += 64;
+                py += 256;
+            }
+            if (j < la_num) {
+                __m256i xq8_256 = _mm256_loadu_si256((const __m256i*)(px));
+                __m512i xq8   = _mm512_castsi256_si512(xq8_256);
+                __m512i xq8_3 = _mm512_and_si512(xq8, mask);
+                __m512i xq8_2 = _mm512_and_si512(_mm512_srli_epi16(xq8, 2), mask);
+                __m512i xq8_1 = _mm512_and_si512(_mm512_srli_epi16(xq8, 4), mask);
+                __m512i xq8_0 = _mm512_and_si512(_mm512_srli_epi16(xq8, 6), mask);
+
+                for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                    accula[iy] = _mm512_add_epi16(accula[iy], _mm512_add_epi16(
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_0, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 0 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_1, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 1 * 32 + iy * by))))),
+                                    _mm512_add_epi16(_mm512_maddubs_epi16(xq8_2, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 2 * 32 + iy * by)))),
+                                                    _mm512_maddubs_epi16(xq8_3, _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(py + 3 * 32 + iy * by)))))));
+                }
+
+                px += 32;
+                py += 128;
+            }
+
+            for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+                accu[iy] = _mm512_add_epi32(_mm512_madd_epi16(accula[iy], one16), accu[iy]);
+            }
+        }
+
+        for (int iy = 0; iy < PARALLEL_SIZE; iy++) {
+            int sumi = hsum_i32_16(accu[iy]);
+            s[(col + iy) * bs] = (float)sumi;
+        }
+    }
+#elif defined(__AVX2__)
     const uint8_t *    x = (uint8_t *)vx;
     const int8_t  *    y = (int8_t *)vy;
 
@@ -808,7 +1269,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size
             accu[iy] = _mm256_setzero_si256();
         }
 
-        int8_t * y_col = y + col * by;
+        const int8_t * y_col = y + col * by;
         
         for (int i = 0; i < group32_num; i++) {
             const uint8_t *px = x + i * 1024;
diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py
new file mode 100644
index 000000000..3a4340734
--- /dev/null
+++ b/utils/convert-bitnet-embedding-to-gguf.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+from hashlib import sha256
+from pathlib import Path
+from typing import Any, Iterator
+
+import numpy as np
+import torch
+
+# Allow using the local gguf-py if present
+if "NO_LOCAL_GGUF" not in os.environ:
+    _local_gguf = Path(__file__).parent / "gguf-py"
+    if _local_gguf.exists():
+        sys.path.insert(1, str(_local_gguf))
+import gguf
+
+logger = logging.getLogger("convert-bitnet-embedding")
+
+# ---------------------------------------------------------------------------
+# Tensor name mapping: HuggingFace -> GGUF
+# ---------------------------------------------------------------------------
+
+def build_tensor_name_map(n_layers: int) -> dict[str, str]:
+    """Return the HuggingFace -> GGUF tensor-name table for ``n_layers`` blocks.
+
+    Keys are HF names without the leading ``model.`` prefix; values are the
+    GGUF names. The two global tensors come first, then each layer's entries.
+    """
+    # (HF suffix under "layers.<i>.", GGUF suffix under "blk.<i>.")
+    per_layer = (
+        # Layer norms
+        ("input_layernorm.weight", "attn_norm.weight"),
+        ("post_attention_layernorm.weight", "ffn_norm.weight"),
+        # Self-attention projections
+        ("self_attn.q_proj.weight", "attn_q.weight"),
+        ("self_attn.k_proj.weight", "attn_k.weight"),
+        ("self_attn.v_proj.weight", "attn_v.weight"),
+        ("self_attn.o_proj.weight", "attn_output.weight"),
+        # QK head norms (standard Qwen3)
+        ("self_attn.q_norm.weight", "attn_q_norm.weight"),
+        ("self_attn.k_norm.weight", "attn_k_norm.weight"),
+        # Per-projection input norms (BitNet-specific)
+        ("self_attn.q_proj.norm.weight", "attn_q_norm_in.weight"),
+        ("self_attn.k_proj.norm.weight", "attn_k_norm_in.weight"),
+        ("self_attn.v_proj.norm.weight", "attn_v_norm_in.weight"),
+        ("self_attn.o_proj.norm.weight", "attn_output_norm_in.weight"),
+        # MLP projections
+        ("mlp.gate_proj.weight", "ffn_gate.weight"),
+        ("mlp.up_proj.weight", "ffn_up.weight"),
+        ("mlp.down_proj.weight", "ffn_down.weight"),
+        # Per-projection input norms for MLP (BitNet-specific)
+        ("mlp.gate_proj.norm.weight", "ffn_gate_norm_in.weight"),
+        ("mlp.up_proj.norm.weight", "ffn_up_norm_in.weight"),
+        ("mlp.down_proj.norm.weight", "ffn_down_norm_in.weight"),
+    )
+
+    name_map: dict[str, str] = {
+        "embed_tokens.weight": "token_embd.weight",
+        "norm.weight": "output_norm.weight",
+    }
+    for layer in range(n_layers):
+        name_map.update(
+            (f"layers.{layer}.{hf}", f"blk.{layer}.{gg}") for hf, gg in per_layer
+        )
+
+    return name_map
+
+
+# ---------------------------------------------------------------------------
+# Tokenizer handling (GPT-2 / BPE for Qwen3)
+# ---------------------------------------------------------------------------
+
+def get_vocab_base_pre(tokenizer) -> str:
+    """Identify the BPE pre-tokenizer by hashing how ``tokenizer`` encodes a probe text.
+
+    The SHA-256 of the resulting token list is looked up in a table of known
+    models; the matching name is meant for the GGUF "tokenizer.ggml.pre" entry so
+    llama.cpp can apply the same pre-tokenizer.
+    Raises NotImplementedError when the hash is not in the table.
+    """
+    chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    logger.debug(f"chktok: {chktok}")
+    logger.debug(f"chkhsh: {chkhsh}")
+
+    # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+    #       or pull the latest version of the model from Huggingface
+    #       don't edit the hashes manually!
+    known_pre_tokenizers = {
+        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+        "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": "llama-bpe",
+        # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+        "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": "deepseek-llm",
+        # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+        "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": "deepseek-coder",
+        # ref: https://huggingface.co/tiiuae/falcon-7b
+        "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": "falcon",
+        # ref: https://huggingface.co/openai-community/gpt2
+        "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": "gpt-2",
+        # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+        "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": "qwen2",
+    }
+    res = known_pre_tokenizers.get(chkhsh)
+
+    if res is None:
+        for line in (
+            "\n",
+            "**************************************************************************************",
+            "** WARNING: The BPE pre-tokenizer was not recognized!",
+            "**          There are 2 possible reasons for this:",
+            "**          - the model has not been added to convert_hf_to_gguf_update.py yet",
+            "**          - the pre-tokenization config has changed upstream",
+            "**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.",
+            "** ref:     https://github.com/ggml-org/llama.cpp/pull/6920",
+            "**",
+            f"** chkhsh:  {chkhsh}",
+            "**************************************************************************************",
+            "\n",
+        ):
+            logger.warning(line)
+        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+    logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
+    logger.debug(f"chkhsh: {chkhsh}")
+
+    return res
+
+
+def _does_token_look_special(token: str) -> bool:
+    """Return True if *token* is wrapped in a matching delimiter pair.
+
+    Accepted shapes are <...> (which covers <|endoftext|>, <s>, </s>) and [...]
+    (e.g. [CLS], [SEP]). Mixed pairs such as "<x]" or "[x>" and the empty
+    string are not special; the closing delimiter must match the opening one.
+    """
+    return any(token.startswith(a) and token.endswith(b) for a, b in (("<", ">"), ("[", "]")))
+
+
+def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict):
+    """Write the GPT-2 style BPE vocab of the Qwen3 model (tokens, types, specials)."""
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+    tokpre = get_vocab_base_pre(tokenizer)
+
+    id_to_token = {idx: text for text, idx in tokenizer.vocab.items()}
+    added_vocab = tokenizer.get_added_vocab()
+    added_decoder = tokenizer.added_tokens_decoder
+
+    tokens: list[str] = []
+    toktypes: list[int] = []
+    for token_id in range(vocab_size):
+        if token_id not in id_to_token:
+            # Id without a vocab entry: emit a placeholder so every id gets a token
+            tokens.append(f"[PAD{token_id}]")
+            toktypes.append(gguf.TokenType.UNUSED)
+            continue
+
+        text = id_to_token[token_id]
+        if text not in added_vocab:
+            tokens.append(text)
+            toktypes.append(gguf.TokenType.NORMAL)
+            continue
+
+        entry = added_decoder[token_id]
+        # Only encode-decode non-normalized tokens (matching llama.cpp upstream)
+        if not entry.normalized:
+            text = tokenizer.decode(tokenizer.encode(text, add_special_tokens=False))
+        if entry.special or _does_token_look_special(text):
+            kind = gguf.TokenType.CONTROL
+        else:
+            # Pre-normalize user-defined spaces (for Gemma-style tokenizers)
+            text = text.replace(b"\xe2\x96\x81".decode("utf-8"), " ")
+            kind = gguf.TokenType.USER_DEFINED
+        tokens.append(text)
+        toktypes.append(kind)
+
+    gguf_writer.add_tokenizer_model("gpt2")
+    gguf_writer.add_tokenizer_pre(tokpre)
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_types(toktypes)
+
+    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+    # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the
+    # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work
+    # correctly, llama.cpp must append the same token.
+    special_vocab.special_token_ids["eos"] = 151643
+    special_vocab.add_to_gguf(gguf_writer)
+
+    # Embedding models need EOS token appended for last-token pooling
+    gguf_writer.add_add_eos_token(True)
+
+
+# ---------------------------------------------------------------------------
+# GGUF metadata
+# ---------------------------------------------------------------------------
+
+def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int):
+    """Write model hyper-parameters, the file type and the pooling type to ``gguf_writer``."""
+    gguf_writer.add_name(dir_model.name)
+
+    n_layers = hparams["num_hidden_layers"]
+    n_embd = hparams["hidden_size"]
+    n_head = hparams["num_attention_heads"]
+    n_head_kv = hparams.get("num_key_value_heads", n_head)
+    n_ff = hparams["intermediate_size"]
+
+    gguf_writer.add_block_count(n_layers)
+    gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
+    gguf_writer.add_embedding_length(n_embd)
+    gguf_writer.add_feed_forward_length(n_ff)
+    gguf_writer.add_head_count(n_head)
+    gguf_writer.add_head_count_kv(n_head_kv)
+    gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+    head_dim = hparams.get("head_dim", n_embd // n_head)
+    gguf_writer.add_rope_dimension_count(head_dim)
+    gguf_writer.add_key_length(head_dim)
+    gguf_writer.add_value_length(head_dim)
+
+    if hparams.get("rope_theta") is not None:
+        gguf_writer.add_rope_freq_base(hparams["rope_theta"])
+    if hparams.get("rms_norm_eps") is not None:
+        gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+    gguf_writer.add_file_type(ftype)
+
+    # Pooling type: modules.json -> <module path>/config.json (sentence-transformers convention)
+    pooling_type = None
+    modules_file = dir_model / "modules.json"
+    if modules_file.is_file():
+        with open(modules_file, encoding="utf-8") as f:
+            modules = json.load(f)
+        # Only the first module whose type ends in "Pooling" is consulted
+        pooling_mod = next((m for m in modules if m["type"].endswith("Pooling")), None)
+        if pooling_mod is not None:
+            pooling_cfg_file = dir_model / pooling_mod["path"] / "config.json"
+            if pooling_cfg_file.is_file():
+                with open(pooling_cfg_file, encoding="utf-8") as f:
+                    pooling = json.load(f)
+                if pooling.get("pooling_mode_mean_tokens"):
+                    pooling_type = gguf.PoolingType.MEAN
+                elif pooling.get("pooling_mode_cls_token"):
+                    pooling_type = gguf.PoolingType.CLS
+                elif pooling.get("pooling_mode_lasttoken"):
+                    pooling_type = gguf.PoolingType.LAST
+    if pooling_type is None:
+        # Nothing usable was found above: fall back to MEAN pooling
+        logger.info("  No pooling config found, defaulting to MEAN pooling")
+        pooling_type = gguf.PoolingType.MEAN
+    gguf_writer.add_pooling_type(pooling_type)
+
+    logger.info(f"  n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}")
+
+
+# ---------------------------------------------------------------------------
+# Tensor iteration from safetensors
+# ---------------------------------------------------------------------------
+
+def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]:
+    """Lazily yield ``(name, tensor)`` for every tensor in ``dir_model``'s safetensors.
+
+    Files are read in sorted order; FileNotFoundError is raised if none exist.
+    """
+    from safetensors import safe_open
+    shards = sorted(dir_model.glob("*.safetensors"))
+    if len(shards) == 0:
+        raise FileNotFoundError(f"No .safetensors files in {dir_model}")
+    for shard in shards:
+        logger.info(f"Loading {shard.name}")
+        with safe_open(str(shard), framework="pt", device="cpu") as reader:
+            yield from ((key, reader.get_tensor(key)) for key in reader.keys())
+
+
+# ---------------------------------------------------------------------------
+# I2_S ternary packing (platform-independent)
+# ---------------------------------------------------------------------------
+#
+# I2_S format (from dequantize_row_i2_s in ggml-quants.c):
+#   - Every 128 values form a block, packed into 32 bytes
+#   - Each byte stores 4 values at positions [0*32+gp, 1*32+gp, 2*32+gp, 3*32+gp]
+#     where gp is the byte index within the 32-byte group
+#   - Encoding per byte: c0=(b>>6)&3, c1=(b>>4)&3, c2=(b>>2)&3, c3=(b>>0)&3
+#   - Value mapping: 0 -> -1, 1 -> 0, 2 -> +1, 3 -> 0
+#   - Scale: one f32 stored in the first 4 bytes of a 32-byte tail appended after the packed bytes
+
+def quantize_to_i2_s(w: np.ndarray) -> np.ndarray:
+    """Quantize float weights to ternary and pack into I2_S layout.
+
+    Uses the same quantization as BitLinear weight_quant_minmax():
+        scale = 1.0 / mean(|w|)
+        q = round(w * scale).clamp(-1, 1)
+        dequant = q / scale = q * mean(|w|)
+
+    The I2_S format is self-contained: packed ternary bytes followed by a 32-byte
+    tail whose first 4 bytes hold the f32 dequantization scale.
+
+    Args:
+        w: float weight tensor of shape (M, K); M * K must be a multiple of 128
+
+    Returns:
+        uint8 array of M * K / 4 packed bytes followed by the 32-byte tail
+
+    Raises:
+        ValueError: if M * K is not a multiple of the 128-value block size
+    """
+    M, K = w.shape
+    n = M * K
+    if n % 128:
+        # A partial block cannot be stored: its values are interleaved over all
+        # 32 bytes of the block, so cutting the buffer to n // 4 bytes loses weights.
+        raise ValueError(
+            f"I2_S needs a multiple of 128 elements, got {n} (shape {w.shape})"
+        )
+    w_flat = w.flatten().astype(np.float32)
+
+    # BitLinear weight_quant_minmax: scale = 1/mean(|w|), then round & clamp
+    abs_mean = np.mean(np.abs(w_flat))
+    abs_mean = max(abs_mean, 1e-5)
+    inv_scale = 1.0 / abs_mean
+    q_float = np.round(w_flat * inv_scale).clip(-1, 1)  # ternary: {-1, 0, 1}
+
+    # scale for dequantization = abs_mean (i.e., dequant = q * abs_mean)
+    scale = np.float32(abs_mean)
+
+    # Map ternary {-1, 0, 1} -> I2_S encoding {0, 1, 2}
+    #   -1 -> 0,  0 -> 1,  +1 -> 2
+    q = np.ones(n, dtype=np.uint8)  # default to 1 (zero)
+    q[q_float > 0.5] = 2    # +1 -> 2
+    q[q_float < -0.5] = 0   # -1 -> 0
+
+    # Block layout: value k of a 128-value block goes to byte k % 32, slot k // 32
+    q = q.reshape(n // 128, 4, 32)
+
+    # Pack: byte = (c0 << 6) | (c1 << 4) | (c2 << 2) | c3
+    packed = (
+        (q[:, 0, :] << 6) | (q[:, 1, :] << 4)
+        | (q[:, 2, :] << 2) | q[:, 3, :]
+    )
+
+    packed = packed.reshape(-1)
+
+    # I2_S format: packed_bytes + 32-byte aligned tail (scale in first 4 bytes of tail)
+    # Total size = n_elements / 4 + 32  (as defined in ggml.c)
+    packed_size = n // 4
+    result = np.zeros(packed_size + 32, dtype=np.uint8)
+    result[:packed_size] = packed
+    # Write scale as float32 at offset packed_size
+    result[packed_size:packed_size + 4] = np.frombuffer(scale.tobytes(), dtype=np.uint8)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main conversion
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point: convert a bitnet-embeddings checkpoint to GGUF.
+
+    Reads config.json and *.safetensors from the model directory and writes one
+    GGUF file; exits with status 1 for a non-directory path or a non-qwen3 model.
+    """
+    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF")
+    parser.add_argument("model", type=Path, help="Model directory")
+    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
+    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
+                        help="Output type: f32, f16, or i2_s (ternary quantized)")
+    parser.add_argument("--verbose", action="store_true")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    dir_model = args.model
+    if not dir_model.is_dir():
+        logger.error(f"{dir_model} is not a directory")
+        sys.exit(1)
+
+    # Default output filename
+    if args.outfile is None:
+        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
+        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"
+
+    # Load config (JSON is UTF-8; do not depend on the platform default encoding)
+    with open(dir_model / "config.json", encoding="utf-8") as f:
+        hparams = json.load(f)
+
+    # Explicit check instead of `assert`, which is stripped under `python -O`
+    arch = hparams.get("model_type", "qwen3")
+    if arch != "qwen3":
+        logger.error(f"Expected qwen3 architecture, got {arch}")
+        sys.exit(1)
+
+    n_layers = hparams["num_hidden_layers"]
+
+    # Determine ftype
+    if args.outtype == "f32":
+        ftype = 0  # GGML F32
+    elif args.outtype == "f16":
+        ftype = 1  # GGML F16
+    else:  # i2_s
+        ftype = 40  # LLAMA_FTYPE_MOSTLY_I2_S
+
+    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")
+
+    # Create GGUF writer
+    gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3")
+    # Set parameters
+    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)
+
+    # Set vocab
+    logger.info("Setting tokenizer/vocab...")
+    set_vocab(gguf_writer, dir_model, hparams)
+
+    # Build tensor name map
+    tensor_map = build_tensor_name_map(n_layers)
+
+    # Process tensors
+    logger.info("Processing tensors...")
+    tensor_count = 0
+    for hf_name, data_torch in iter_tensors(dir_model):
+        # Skip tensors we don't need
+        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Strip "model." prefix if present
+        name = hf_name[len("model."):] if hf_name.startswith("model.") else hf_name
+
+        # Look up GGUF name
+        gguf_name = tensor_map.get(name)
+        if gguf_name is None:
+            logger.warning(f"Skipping unmapped tensor: {hf_name}")
+            continue
+
+        old_dtype = data_torch.dtype
+        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
+        if data_torch.dtype == torch.bfloat16:
+            data_torch = data_torch.to(torch.float32)
+
+        data = data_torch.squeeze().numpy()
+        data_shape = data.shape
+
+        # Determine if this is a linear weight suitable for ternary quantization
+        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
+        is_embed = gguf_name == "token_embd.weight"
+        is_linear_weight = data.ndim == 2 and not is_norm and not is_embed
+
+        if args.outtype == "i2_s" and is_linear_weight:
+            # --- I2_S ternary packing (scale embedded in data) ---
+            packed = quantize_to_i2_s(data)
+            data_qtype = gguf.GGMLQuantizationType.I2_S
+            shape_str = f"{{{', '.join(str(n) for n in reversed(data_shape))}}}"
+            logger.info(f"  {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")
+            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape, raw_dtype=data_qtype)
+            tensor_count += 1
+
+        elif args.outtype in ("f16", "i2_s") and (is_linear_weight or is_embed):
+            # 2D weight tensors (linear + embedding) -> f16
+            data = data.astype(np.float16)
+            logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            gguf_writer.add_tensor(gguf_name, data)
+            tensor_count += 1
+
+        else:
+            # norms, 1D tensors
+            if args.outtype in ("f16", "i2_s"):
+                data = data.astype(np.float16)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
+            else:
+                if data.dtype != np.float32:
+                    data = data.astype(np.float32)
+                logger.info(f"  {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
+            gguf_writer.add_tensor(gguf_name, data)
+            tensor_count += 1
+
+    logger.info(f"Total tensors written: {tensor_count}")
+
+    # Note: output.weight (lm_head) is skipped for embedding models —
+    # it is not needed (no token generation) and saves ~297MB for this model.
+
+    # Write GGUF
+    logger.info(f"Writing to {args.outfile}...")
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+    gguf_writer.close()
+
+    logger.info("Done!")
+
+
+if __name__ == "__main__":  # run the converter only when executed as a script
+    main()