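update: enable group_size=32 kernels (hdim128, fp16, 2-bit/4-bit) for split-KV decode and KV-cache qpack; add evaluation/test.py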

DD-DuDa · DD-DuDa · commit 409f1184d99f · 2025-08-31T12:07:22.000+01:00
diff --git a/benchmark/bench_single_decode.ipynb b/benchmark/bench_single_decode.ipynb
diff --git a/csrc/bit_decode/decode_api.cpp b/csrc/bit_decode/decode_api.cpp
@@ -190,7 +190,7 @@ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split
             } else if (params.group_size == 64) {
                 // run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, num_bits, 64>(params, stream);
             } else if (params.group_size == 32) {
-                // run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, num_bits, 32>(params, stream);
+                run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, num_bits, 32>(params, stream);
             }
         } else {
             if (params.group_size == 128) {
@@ -212,7 +212,7 @@ void run_kvcache_qpack(Flash_fwd_params &params, cudaStream_t stream) {
         } else if (params.group_size == 64) {
             // run_kvcache_qpack_<cutlass::half_t, 128, 1, num_bits, 64>(params, stream);
         } else if (params.group_size == 32) {
-            // run_kvcache_qpack_<cutlass::half_t, 128, 1, num_bits, 32>(params, stream);
+            run_kvcache_qpack_<cutlass::half_t, 128, 1, num_bits, 32>(params, stream);
         }
     } else {
         if (params.group_size == 128) {
diff --git a/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_2bit.cu b/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_2bit.cu
@@ -6,4 +6,4 @@
 
 template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 128>(Flash_fwd_params &params, cudaStream_t stream);
 // template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 64>(Flash_fwd_params &params, cudaStream_t stream);
-// template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 32>(Flash_fwd_params &params, cudaStream_t stream);
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 32>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_4bit.cu b/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_4bit.cu
@@ -11,4 +11,4 @@
 
 template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 4, 128>(Flash_fwd_params &params, cudaStream_t stream);
 // template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 4, 64>(Flash_fwd_params &params, cudaStream_t stream);
-// template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 4, 32>(Flash_fwd_params &params, cudaStream_t stream);
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 4, 32>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_2bit.cu b/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_2bit.cu
@@ -12,7 +12,7 @@ void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 128>(Flash_fwd_params &param
 // void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 64>(Flash_fwd_params &params, cudaStream_t stream) {
 //     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 64>(params, stream);
 // }
-// template<>
-// void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 32>(Flash_fwd_params &params, cudaStream_t stream) {
-//     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 32>(params, stream);
-// }
+template<>
+void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 32>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 32>(params, stream);
+}
diff --git a/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_4bit.cu b/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_4bit.cu
@@ -4,10 +4,10 @@
 
 #include "../flash_fwd_launch_template.h"
 
-// template<>
-// void run_kvcache_qpack_<cutlass::half_t, 128, 1, 4, 32>(Flash_fwd_params &params, cudaStream_t stream) {
-//     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 4, 32>(params, stream);
-// }
+template<>
+void run_kvcache_qpack_<cutlass::half_t, 128, 1, 4, 32>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_kvcache_qpack_hdim128<cutlass::half_t, 1, 4, 32>(params, stream);
+}
 // template<>
 // void run_kvcache_qpack_<cutlass::half_t, 128, 1, 4, 64>(Flash_fwd_params &params, cudaStream_t stream) {
 //     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 4, 64>(params, stream);
diff --git a/evaluation/test.py b/evaluation/test.py
@@ -0,0 +1,113 @@
+import torch
+import torch.nn as nn
+import math
+import triton
+from einops import rearrange, repeat
+import numpy as np
+
+from flash_attn import flash_attn_with_kvcache
+from bit_decode import kvcache_pack_int, fwd_kvcache_int
+
+
+def attention_ref(
+    q,
+    k,
+    v,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, head_dim)
+        k: (batch_size, seqlen_k, nheads_k, head_dim)
+        v: (batch_size, seqlen_k, nheads_k, head_dim)
+    Output:
+        output: (batch_size, seqlen_q, nheads, head_dim)
+        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax probabilities (no dropout is applied)
+    """
+    dtype_og = q.dtype
+
+    d = q.shape[-1]
+
+    scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
+    
+    attention = torch.softmax(scores, dim=-1).to(v.dtype)
+
+    output = torch.einsum("bhts,bshd->bthd", attention, v)
+
+    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
+
+
+# Define constants
+batch_size = 1
+nheads = 32
+nheads_k = 32
+d = 128
+
+# Sequence length
+seqlen_q = 1
+seqlen_kv = 4096
+
+# Quantization parameters
+quant_mode = "k-channel"
+num_bits = 4
+pack_nums = 16 / num_bits
+group_size = 128
+
+
+# Set seed and parameters
+device = "cuda"
+dtype = torch.float16
+torch.random.manual_seed(0)
+
+# Initialize tensors
+q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
+k_cache = torch.randn(batch_size, seqlen_kv, nheads_k, d, device=device, dtype=dtype)
+v_cache = torch.randn(batch_size, seqlen_kv, nheads_k, d, device=device, dtype=dtype)
+
+k_cache_rep = repeat(k_cache, "b s h d -> b s (h g) d", g=nheads // nheads_k)
+v_cache_rep = repeat(v_cache, "b s h d -> b s (h g) d", g=nheads // nheads_k)
+
+# Reference attention computation
+out_ref, _ = attention_ref(q, k_cache_rep, v_cache_rep)
+
+##################### BitDecoding Packing Kernel ##################### 
+
+# Initialize quantization tensors
+if quant_mode == "k-channel":
+    k_pack   = torch.zeros((batch_size, int(seqlen_kv // pack_nums), nheads_k, d),  dtype=torch.uint16, device=device)
+    k_params = torch.zeros((batch_size, int(seqlen_kv // group_size), nheads_k, d), dtype=torch.float32, device=device)
+else:
+    k_pack   = torch.zeros((batch_size, seqlen_kv, nheads_k, int(d // pack_nums)),  dtype=torch.uint16, device=device)
+    k_params = torch.zeros((batch_size, int(d // group_size), nheads_k, seqlen_kv), dtype=torch.float32, device=device)
+
+v_pack   = torch.zeros((batch_size, seqlen_kv, nheads_k, int(d // pack_nums)),  dtype=torch.uint16, device=device)
+v_params = torch.zeros((batch_size, int(d // group_size), nheads_k, seqlen_kv), dtype=torch.float32, device=device)
+
+cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_kv, seqlen_kv, dtype=torch.int32, device=device)
+
+kvcache_pack_int(
+    k_cache, k_pack, k_params,
+    v_cache, v_pack, v_params,
+    None, # opt_block_table
+    cu_seqlens_k,              
+    seqlen_kv,
+    quant_mode,
+    group_size,
+    num_bits
+)
+
+sm_scale = 1.0 / math.sqrt(d)
+out_bitdecode = fwd_kvcache_int(
+                    q,
+                    k_pack, k_params, 
+                    v_pack, v_params,
+                    None, # opt_block_table
+                    sm_scale,
+                    quant_mode, 
+                    group_size,
+                    num_bits
+                )
+
+print(f"seqlen_kv:{seqlen_kv} BitDecode vs Pytorch: {(out_bitdecode - out_ref).abs().mean().item()}")
+
+print(f"out_ref: \n{out_ref[0,0,0,:8]}")
+print(f"out_bitdecode: \n{out_bitdecode[0,0,0,:8]}")