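Uncomment 2-bit hdim128 fp16 sm80 template instantiations (fixes undefined run_mha_fwd_splitkv_dispatch symbol on import)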

DD-DuDa · DD-DuDa · commit c531ba4ce418 · 2025-07-27T11:34:43.000+01:00
diff --git a/benchmark/bench_single_decode.ipynb b/benchmark/bench_single_decode.ipynb
@@ -4,7 +4,21 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "/home/ddy/miniconda3/envs/issue/lib/python3.10/site-packages/bit_decode-1.0.0.post1-py3.10-linux-x86_64.egg/bit_decode_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _Z28run_mha_fwd_splitkv_dispatchIN7cutlass6half_tELi128ELb0ELi1ELi2ELi128EEvR16Flash_fwd_paramsP11CUstream_st",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mflash_attn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m flash_attn_with_kvcache\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mbit_decode\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m kvcache_pack_int, fwd_kvcache_int\n",
+      "File \u001b[0;32m~/miniconda3/envs/issue/lib/python3.10/site-packages/bit_decode-1.0.0.post1-py3.10-linux-x86_64.egg/bit_decode/__init__.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1.0.0.post1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mbit_decode\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbit_decode_interface\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m      4\u001b[0m     kvcache_pack_int,\n\u001b[1;32m      5\u001b[0m     fwd_kvcache_int\n\u001b[1;32m      6\u001b[0m )\n",
+      "File \u001b[0;32m~/miniconda3/envs/issue/lib/python3.10/site-packages/bit_decode-1.0.0.post1-py3.10-linux-x86_64.egg/bit_decode/bit_decode_interface.py:8\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnn\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mbit_decode_cuda\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mbit_decode_cuda\u001b[39;00m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mkvcache_pack_int\u001b[39m(k_cache: torch\u001b[38;5;241m.\u001b[39mTensor, k_pack: torch\u001b[38;5;241m.\u001b[39mTensor, k_params: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m     11\u001b[0m                      v_cache: torch\u001b[38;5;241m.\u001b[39mTensor, v_pack: torch\u001b[38;5;241m.\u001b[39mTensor, v_params: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m     12\u001b[0m                      opt_block_table: Optional[torch\u001b[38;5;241m.\u001b[39mTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     16\u001b[0m                      group_size: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m128\u001b[39m,\n\u001b[1;32m     17\u001b[0m                      num_bits: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m4\u001b[39m):\n\u001b[1;32m     19\u001b[0m     batch_size, seqlen_k, nheads_k, d \u001b[38;5;241m=\u001b[39m 
k_cache\u001b[38;5;241m.\u001b[39mshape\n",
+      "\u001b[0;31mImportError\u001b[0m: /home/ddy/miniconda3/envs/issue/lib/python3.10/site-packages/bit_decode-1.0.0.post1-py3.10-linux-x86_64.egg/bit_decode_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _Z28run_mha_fwd_splitkv_dispatchIN7cutlass6half_tELi128ELb0ELi1ELi2ELi128EEvR16Flash_fwd_paramsP11CUstream_st"
+     ]
+    }
+   ],
    "source": [
     "import torch\n",
     "import torch.nn as nn\n",
@@ -392,7 +406,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "bitdecode",
+   "display_name": "issue",
    "language": "python",
    "name": "python3"
   },
@@ -406,7 +420,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.10.18"
   }
  },
  "nbformat": 4,
diff --git a/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_2bit.cu b/csrc/bit_decode/src/genfile/flash_fwd_split_hdim128_fp16_sm80_2bit.cu
@@ -4,6 +4,6 @@
 
 #include "../flash_fwd_launch_template.h"
 
-// template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 128>(Flash_fwd_params &params, cudaStream_t stream);
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 128>(Flash_fwd_params &params, cudaStream_t stream);
 // template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 64>(Flash_fwd_params &params, cudaStream_t stream);
 // template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false, 1, 2, 32>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_2bit.cu b/csrc/bit_decode/src/genfile/flash_qpack_hdim128_fp16_sm80_2bit.cu
@@ -4,10 +4,10 @@
 
 #include "../flash_fwd_launch_template.h"
 
-// template<>
-// void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 128>(Flash_fwd_params &params, cudaStream_t stream) {
-//     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 128>(params, stream);
-// }
+template<>
+void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 128>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 128>(params, stream);
+}
 // template<>
 // void run_kvcache_qpack_<cutlass::half_t, 128, 1, 2, 64>(Flash_fwd_params &params, cudaStream_t stream) {
 //     run_kvcache_qpack_hdim128<cutlass::half_t, 1, 2, 64>(params, stream);