OpenBitSys
diff --git a/bit_decode/models/cache_utils.py b/bit_decode/models/cache_utils.py
Lines changed: 740 additions & 376 deletions
diff --git a/evaluation/example.py b/evaluation/example.py
Lines changed: 31 additions & 10 deletions
diff --git a/evaluation/llama.py b/evaluation/llama.py
Lines changed: 8 additions & 0 deletions
diff --git a/evaluation/print.log1 b/evaluation/print.log1
Lines changed: 6 additions & 0 deletions
@@ -4,8 +4,16 @@
 import torch
 import random
 import argparse
+
+from bit_decode import DynamicCache, StaticCache, Cache
+import transformers.cache_utils
+transformers.cache_utils.DynamicCache = DynamicCache
+transformers.cache_utils.StaticCache = StaticCache
+transformers.cache_utils.Cache = Cache
+
 from llama import LlamaForCausalLM
-from transformers import LlamaConfig, AutoTokenizer
+from qwen3 import Qwen3ForCausalLM
+from transformers import LlamaConfig, Qwen3Config, AutoTokenizer
 from datasets import load_dataset
 
 def main():
@@ -23,21 +31,34 @@ def main():
     random.seed(0)
     torch.manual_seed(0)
 
-    config = LlamaConfig.from_pretrained(args.model_path)
+    if "Llama" in args.model_path:
+        config = LlamaConfig.from_pretrained(args.model_path)
+    elif "Qwen" in args.model_path:
+        config = Qwen3Config.from_pretrained(args.model_path)
 
+    config._attn_implementation = "flash_attention_2"
     config.attn_backend = args.attn_backend
     config.num_bits = args.num_bits
     config.quant_mode = args.quant_mode
     config.group_size = args.group_size
     config.residual_block_size = 128 if args.num_bits == 4 else 256
 
-    model = LlamaForCausalLM.from_pretrained(
-        pretrained_model_name_or_path=args.model_path,
-        config=config,
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float16,
-        device_map="auto"
-    )
+    if "Llama" in args.model_path:
+        model = LlamaForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=args.model_path,
+            config=config,
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+    elif "Qwen" in args.model_path:
+        model = Qwen3ForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=args.model_path,
+            config=config,
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
 
     enc = AutoTokenizer.from_pretrained(
         args.model_path,
@@ -71,7 +92,7 @@ def main():
     )
     config_str = f"# prompt tokens: {inputs.input_ids.shape[1]}"
 
-    print(prompt + "\n" + "=" * 10 + f'\n{config_str}\n' + "=" * 10 + "\nOutput:")
+    # print(prompt + "\n" + "=" * 10 + f'\n{config_str}\n' + "=" * 10 + "\nOutput:")
     # print("\n" + "=" * 10 + f'\n{config_str}\n' + "=" * 10 + "\nOutput:")
     print(enc.decode(output[0].tolist()[inputs.input_ids.shape[1]:], skip_special_tokens=True))
 
 
@@ -532,8 +532,15 @@ def forward(
         value_states = value_states.transpose(1, 2)
 
         if q_len == 1:
+            print("query_states1: ", query_states.shape)
+            print("key_states1: ", key_states.shape)
+            print("value_states1: ", value_states.shape)
+
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
 
+            print("query_states2: ", query_states.shape)
+            print("key_states2: ", key_states.shape)
+            print("value_states2: ", value_states.shape)
             attn_output = flash_attn_with_kvcache(
                 query_states,
                 key_states,
@@ -554,6 +561,7 @@ def forward(
                 is_causal=self.is_causal,
                 **kwargs,
             )
+
         attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
         attn_output = self.o_proj(attn_output)
 
 
@@ -0,0 +1,6 @@
+ 
+Answer: Arnel shared 5 x 8 = <<5*8=40>>40 pencils with his friends.
+He had 40 + 10 = <<40+10=50>>50 pencils in total.
+Since there are ten boxes of pencils, there are 50 / 10 = <<50/10=5>>5 pencils in each box.
+#### 5
+Question: A bakery has 600 cups of flour. If they use 12 cups of flour for every batch of bread and 8 cups of flour for every