OpenBitSys
diff --git a/‎README.md‎
Lines changed: 32 additions & 2 deletions b/‎README.md‎
Lines changed: 32 additions & 2 deletions
diff --git a/‎benchmark/bench_single_decode.ipynb‎
Lines changed: 414 additions & 0 deletions b/‎benchmark/bench_single_decode.ipynb‎
Lines changed: 414 additions & 0 deletions
diff --git a/‎bit_decode/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎bit_decode/__init__.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎bit_decode/bit_decode_interface.py‎
Lines changed: 86 additions & 0 deletions b/‎bit_decode/bit_decode_interface.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎csrc/bit_decode/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions b/‎csrc/bit_decode/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions
@@ -1,7 +1,28 @@
 # BitDecoding
+BitDecoding is a high-performance, GPU-optimized system
+designed to accelerate long-context LLM decoding with a low-bit KV
+cache. It achieves more than a **3x speedup** over FlashDecoding-v2.
+![overview](imgs/overview.png)
+![scheme](imgs/scheme.png)
+
+## Benchmark
+* Kernel performance on RTX 4090
+![RTX 4090 benchmark](imgs/4090.png)
+* Kernel performance on A100
+![A100 benchmark](imgs/a100.png)
+
+## Installation
+```
+git clone --recursive https://github.com/DD-DuDa/BitDecoding.git
+conda create -n bitdecode python=3.10
+conda activate bitdecode
+pip install -r requirements.txt
+python setup.py install
+```
 
 ## Quick Start
-2. Run with libtorch c++
+1. See `benchmark/bench_single_decode.ipynb`.
+2. (Optional) Play with libtorch C++:
     ```
     cd libs/
     wget https://download.pytorch.org/libtorch/cu124/libtorch-shared-with-deps-2.5.1%2Bcu124.zip
@@ -12,4 +33,13 @@
     mkdir build && cd build
     cmake -DCMAKE_PREFIX_PATH=<libtorch_path> ..
     make -j12
-    ```
+    ```
+
+## Release Progress
+
+- [ ] Page Implementation
+- [ ] Hopper Implementation
+- [ ] End-to-end LLM Inference
+
+## Citation
+If you find BitDecoding useful or want to use it in your projects, please cite our paper:
@@ -0,0 +1,6 @@
+# Package version string.
+__version__ = "1.0.0.post1"
+
+# Re-export the two interface functions so they are importable directly
+# from the `bit_decode` package.
+from bit_decode.bit_decode_interface import (
+    kvcache_pack_int,
+    fwd_kvcache_int
+)
@@ -0,0 +1,86 @@
+# Copyright (c) 2025, Dayou Du.
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+import bit_decode_cuda as bit_decode_cuda
+
+def kvcache_pack_int(k_cache: torch.Tensor, k_pack: torch.Tensor, k_params: torch.Tensor,
+                     v_cache: torch.Tensor, v_pack: torch.Tensor, v_params: torch.Tensor,
+                     opt_block_table: Optional[torch.Tensor] = None,
+                     cu_seqlens_k: torch.Tensor = None,
+                     seqlen_k: int = 0,
+                     quant_mode: str = "k-channel",
+                     group_size: int = 128,
+                     num_bits: int = 4):
+    
+    batch_size, seqlen_k, nheads_k, d = k_cache.shape
+
+    K_unpad = k_cache.reshape(batch_size * seqlen_k, nheads_k, d)
+    V_unpad = v_cache.reshape(batch_size * seqlen_k, nheads_k, d)
+
+    if num_bits == 4:
+        bit_decode_cuda.kvcache_pack_i4(K_unpad, k_pack, k_params,
+                                        V_unpad, v_pack, v_params,
+                                        opt_block_table,
+                                        cu_seqlens_k,
+                                        seqlen_k,
+                                        quant_mode,
+                                        group_size
+                                        )
+    else:
+        bit_decode_cuda.kvcache_pack_i2(K_unpad, k_pack, k_params,
+                                        V_unpad, v_pack, v_params,
+                                        opt_block_table,
+                                        cu_seqlens_k,
+                                        seqlen_k,
+                                        quant_mode,
+                                        group_size
+                                        )
+
+def fwd_kvcache_int(q: torch.Tensor, 
+                    k_pack: torch.Tensor, k_params: torch.Tensor, 
+                    v_pack: torch.Tensor, v_params: torch.Tensor,
+                    opt_block_table: Optional[torch.Tensor] = None,
+                    softmax_scale: float = 1.0,
+                    quant_mode: str = "k-channel",
+                    group_size: int = 128,
+                    num_bits: int = 4):
+    
+    if num_bits == 4:
+        out_bit = bit_decode_cuda.fwd_kvcache_i4(
+            q,
+            k_pack, k_params, 
+            v_pack, v_params,
+            opt_block_table,
+            softmax_scale,
+            quant_mode, 
+            group_size,
+            False,          # is_causal
+            -1,             # window_size_left
+            -1,             # window_size_right
+            0.0,            # softcap
+            True,           # is_rotary_interleaved
+            0               # num_splits
+        )
+    else:
+        out_bit = bit_decode_cuda.fwd_kvcache_i2(
+            q,
+            k_pack, k_params, 
+            v_pack, v_params,
+            opt_block_table,
+            softmax_scale,
+            quant_mode, 
+            group_size,
+            False,          # Added
+            -1,             # Added
+            -1,             # Added
+            0.0,            # Added
+            True,           # Added
+            0               # Added
+        )
+
+
+    return out_bit
@@ -31,3 +31,16 @@ target_link_libraries(test_single_packdecode "${TORCH_LIBRARIES}")
 target_include_directories(test_single_packdecode PRIVATE ${INCLUDE_DIR})
 target_compile_options(test_single_packdecode PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=255 -gencode arch=compute_80,code=sm_80 -w>)
 
+# Benchmark executable: the bench_single_packdecode driver plus the
+# hdim128 / fp16 / sm80 sources under src/genfile (going by the file names:
+# forward, 2-bit and 4-bit qpack, and 2-bit and 4-bit split-forward variants).
+message(STATUS "Compile benchmarking kernel.")
+add_executable(bench_single_packdecode 
+    ${PROJECT_SOURCE_DIR}/src/bench_single_packdecode.cu
+    ${PROJECT_SOURCE_DIR}/src/genfile/flash_fwd_hdim128_fp16_sm80.cu
+    ${PROJECT_SOURCE_DIR}/src/genfile/flash_qpack_hdim128_fp16_sm80_2bit.cu
+    ${PROJECT_SOURCE_DIR}/src/genfile/flash_qpack_hdim128_fp16_sm80_4bit.cu
+    ${PROJECT_SOURCE_DIR}/src/genfile/flash_fwd_split_hdim128_fp16_sm80_2bit.cu
+    ${PROJECT_SOURCE_DIR}/src/genfile/flash_fwd_split_hdim128_fp16_sm80_4bit.cu
+)
+target_link_libraries(bench_single_packdecode "${TORCH_LIBRARIES}")
+target_include_directories(bench_single_packdecode PRIVATE ${INCLUDE_DIR})
+# Applied to CUDA sources only (same flags as test_single_packdecode):
+# cap registers per thread at 255, generate code for sm_80, and suppress
+# compiler warnings (-w).
+target_compile_options(bench_single_packdecode PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=255 -gencode arch=compute_80,code=sm_80 -w>)
+