|
9 | 9 |
|
10 | 10 | import numpy as np |
11 | 11 | import torch |
12 | | -from transformers import AutoTokenizer |
| 12 | +from transformers import AutoConfig, AutoTokenizer |
13 | 13 |
|
14 | 14 | from QEfficient import QEFFAutoModelForCausalLM |
15 | 15 | from QEfficient.generation.cloud_infer import QAICInferenceSession |
16 | 16 |
|
17 | | -model_id = "openai/gpt-oss-120b" # weights are not required to convert to fp32 |
| 17 | +model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 |
18 | 18 |
|
19 | 19 | prompt = """ |
20 | 20 | Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures. |
|
23 | 23 |
|
24 | 24 | The path to the treasure was not an easy one. Alex had to navigate through dense forests, cross rickety bridges, and solve riddles that guarded the treasure's location. |
25 | 25 | """ |
26 | | -all_outputs = [] |
27 | 26 | # Run prefill |
| 27 | +config = AutoConfig.from_pretrained(model_id) |
28 | 28 | tokenizer = AutoTokenizer.from_pretrained(model_id) |
29 | 29 | PREFILL_SEQ_LEN = 128 |
30 | | -CTX_LEN = 2 * 128 |
31 | | -inputs = tokenizer(prompt, return_tensors="np", padding=True) |
32 | | -position_ids = inputs["attention_mask"].sum(1, keepdims=True) |
33 | | -padded_len = inputs["input_ids"].shape[1] |
34 | | -num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float |
35 | | -padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len |
| 30 | +CTX_LEN = 128 * 3 |
36 | 31 |
|
37 | | -# Initialize variables specific to request |
38 | | -# Calculate the max generation length. |
39 | | -max_gen_len = CTX_LEN - position_ids.max() |
40 | | -generation_len = max_gen_len |
41 | | - |
42 | | - |
43 | | -# qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) |
44 | 32 | qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) |
45 | 33 |
|
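| | +# Compile the decode-only QPC first (prefill_seq_len=1); the prefill QPC is compiled separately below and its KV cache is passed to this session explicitly at decode time |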
46 | | - |
47 | 34 | decode_qpc_path = qeff_model.compile( |
48 | 35 | prefill_seq_len=1, |
49 | 36 | ctx_len=CTX_LEN, |
|
55 | 42 | aic_enable_depth_first=True, |
56 | 43 | num_speculative_tokens=None, |
57 | 44 | offload_pt_weights=False, # Need the weights in memory for prefill-model export/compilation in the next step |
| 45 | + retain_full_kv=True, |
58 | 46 | ) |
59 | 47 |
|
60 | | -config = qeff_model.model.config |
61 | | -inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) |
62 | | -inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) |
63 | | -inputs.pop("token_type_ids", None) |
64 | | -inputs = {k: torch.from_numpy(v) for k, v in inputs.items()} |
65 | | -past_key_values = [] |
66 | | -for i in range(config.num_hidden_layers): |
67 | | - cache_len = config.sliding_window if i % 2 == 0 else PREFILL_SEQ_LEN |
68 | | - pad_shape = (1, 8, cache_len, 64) |
69 | | - past_key = torch.zeros((pad_shape), dtype=torch.float32) |
70 | | - past_value = torch.zeros((pad_shape), dtype=torch.float32) |
71 | | - pkv = (past_key, past_value) |
72 | | - past_key_values.append(pkv) |
73 | | -inputs["past_key_values"] = past_key_values |
74 | 48 |
|
| 49 | +# The compile call below errors out by default; run the printed command, then set prefill_qpc_path to the generated QPC path (uncomment the line below) and comment out the compile call. |
| 50 | +# prefill_qpc_path = "provide path here" |
75 | 51 | prefill_qpc_path = qeff_model.compile( |
76 | 52 | prefill_seq_len=PREFILL_SEQ_LEN, |
77 | 53 | ctx_len=CTX_LEN, |
|
85 | 61 | prefill_only=True, |
86 | 62 | enable_chunking=True, |
87 | 63 | use_onnx_subfunctions=True, |
88 | | - offload_pt_weights=False, |
89 | | -) |
90 | | -print("loading qpc") |
91 | | -st = time.time() |
92 | | -prefill_session = QAICInferenceSession(prefill_qpc_path, device_ids=[i for i in range(32, 48)]) |
93 | | -print(f"time for loading session = {time.time() - st}") |
94 | | -print("done") |
95 | | -prefill_session.skip_buffers( |
96 | | - [x for x in prefill_session.input_names + prefill_session.output_names if x.startswith("past_")] |
97 | 64 | ) |
98 | | -logits_out_placeholder = np.zeros((1, 1, 201088), dtype=np.float32) |
99 | | -prefill_session.set_buffers({"logits": logits_out_placeholder}) |
100 | | -inputs.pop("past_key_values") |
| 65 | + |
| 66 | + |
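| | +# Tokenize the prompt, round its length up to a multiple of PREFILL_SEQ_LEN, and mark padded positions with position_id -1 |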
| 67 | +inputs = tokenizer(prompt, return_tensors="np", padding=True) |
| 68 | +position_ids = inputs["attention_mask"].sum(1, keepdims=True) |
| 69 | +generation_len = CTX_LEN - position_ids.max() |
| 70 | +padded_len = inputs["input_ids"].shape[1] |
| 71 | +num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float |
| 72 | +padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len |
| 73 | +inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) |
| 74 | +inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) |
| 75 | +inputs.pop("token_type_ids", None) |
| 76 | +inputs = {k: torch.from_numpy(v) for k, v in inputs.items()} |
| 77 | +inputs.pop("past_key_values", None) |
101 | 78 | inputs = {k: v.detach().numpy() for k, v in inputs.items()} |
102 | | -st = time.time() |
103 | 79 |
|
| 80 | + |
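| | +# Create inference sessions for the decode and prefill QPCs |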
| 81 | +decode_session = QAICInferenceSession(decode_qpc_path) |
| 82 | +prefill_session = QAICInferenceSession(prefill_qpc_path) |
| 83 | + |
| 84 | +all_outputs = [] |
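| | +# Run prefill chunk by chunk, feeding each chunk's retained KV state back in for the next chunk |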
104 | 85 | for i in range(num_chunks): |
105 | 86 | chunk_inputs = inputs.copy() |
106 | 87 | chunk_inputs["input_ids"] = inputs["input_ids"][:, i * PREFILL_SEQ_LEN : (i + 1) * PREFILL_SEQ_LEN] |
107 | 88 | chunk_inputs["position_ids"] = inputs["position_ids"][:, i * PREFILL_SEQ_LEN : (i + 1) * PREFILL_SEQ_LEN] |
108 | 89 | ins = time.time() |
109 | 90 | qpc_out = prefill_session.run(chunk_inputs) |
110 | 91 | print(f"time for this run={time.time() - ins}") |
111 | | -print(f"time for prefill_run={time.time() - st} sec\n") |
112 | | - |
113 | | -decode_session = QAICInferenceSession(decode_qpc_path) |
114 | | -decode_session.set_buffers({"logits": logits_out_placeholder}) |
| 92 | + for layer in range(config.num_hidden_layers): |
| 93 | + inputs[f"past_key.{layer}"] = qpc_out[f"past_key.{layer}_RetainedState"] |
| 94 | + inputs[f"past_value.{layer}"] = qpc_out[f"past_value.{layer}_RetainedState"] |
115 | 95 |
|
| 96 | +all_outputs.append(np.argmax(qpc_out["logits"])) |
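| | +# First decode step: pass the argmax token from prefill together with the full prefill KV cache |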
116 | 97 | decode_inputs = { |
117 | 98 | "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1), |
118 | 99 | "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, |
119 | 100 | } |
120 | | -print("pos_id for decodee", decode_inputs["position_ids"]) |
121 | | - |
122 | | -all_outputs.append(decode_inputs["input_ids"][0][0]) |
123 | 101 | for i in range(config.num_hidden_layers): |
124 | | - if i % 2 == 0 and decode_inputs["position_ids"] >= config.sliding_window: |
125 | | - last_valid_pos_idx = decode_inputs["position_ids"][0][0] |
126 | | - first_valid_pos_idx = last_valid_pos_idx - config.sliding_window |
127 | | - k = qpc_out[f"past_key.{i}_RetainedState"][:, :, first_valid_pos_idx:last_valid_pos_idx, :] |
128 | | - v = qpc_out[f"past_value.{i}_RetainedState"][:, :, first_valid_pos_idx:last_valid_pos_idx, :] |
129 | | - mod_pos_id = config.sliding_window - decode_inputs["position_ids"][0][0] % config.sliding_window |
130 | | - decode_inputs[f"past_key.{i}"] = np.concatenate((k[:, :, mod_pos_id:, :], k[:, :, :mod_pos_id, :]), axis=-2) |
131 | | - decode_inputs[f"past_value.{i}"] = np.concatenate((v[:, :, mod_pos_id:, :], v[:, :, :mod_pos_id, :]), axis=-2) |
132 | | - else: |
133 | | - decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] |
134 | | - decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] |
| 102 | + decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] |
| 103 | + decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] |
135 | 104 |
|
136 | 105 | st = time.time() |
137 | 106 | decode_out = decode_session.run(decode_inputs) |
138 | 107 | print(f"time for first run of decode with KV as input = {time.time() - st} sec\n") |
139 | | -decode_session.skip_buffers( |
140 | | - [x for x in decode_session.input_names + decode_session.output_names if x.startswith("past_")] |
141 | | -) |
| 108 | +all_outputs.append(np.argmax(decode_out["logits"])) |
142 | 109 | pos_id = np.max(decode_inputs["position_ids"]).reshape(1, 1) + 1 |
| 110 | +loop_decode_inputs = { |
| 111 | + "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), |
| 112 | + "position_ids": pos_id, |
| 113 | +} |
| 114 | + |
| 115 | +for i in range(config.num_hidden_layers): |
| 116 | + loop_decode_inputs[f"past_key.{i}"] = decode_out[f"past_key.{i}_RetainedState"] |
| 117 | + loop_decode_inputs[f"past_value.{i}"] = decode_out[f"past_value.{i}_RetainedState"] |
| 118 | + |
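| | +# Greedy decode loop: feed back the retained KV state and the argmax token from each step |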
143 | 119 | st = time.time() |
144 | 120 | for i in range(generation_len - 2): |
145 | | - loop_decode_inputs = { |
146 | | - "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), |
147 | | - "position_ids": pos_id, |
148 | | - } |
149 | | - all_outputs.append(loop_decode_inputs["input_ids"][0][0]) |
150 | 121 | decode_out = decode_session.run(loop_decode_inputs) |
| 122 | + all_outputs.append(np.argmax(decode_out["logits"])) |
151 | 123 | pos_id += 1 |
152 | | - |
153 | | - |
154 | | -print(f"time for decode generation = {(time.time() - st) / (generation_len - 2)}") |
155 | | -print(all_outputs) |
156 | | -print(tokenizer.decode(all_outputs)) |
| 124 | + for layer in range(config.num_hidden_layers): |
| 125 | + loop_decode_inputs[f"past_key.{layer}"] = decode_out[f"past_key.{layer}_RetainedState"] |
| 126 | + loop_decode_inputs[f"past_value.{layer}"] = decode_out[f"past_value.{layer}_RetainedState"] |
| 127 | + |
| 128 | + loop_decode_inputs.update( |
| 129 | + { |
| 130 | + "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), |
| 131 | + "position_ids": pos_id, |
| 132 | + } |
| 133 | + ) |
| 134 | +ft = time.time() |
| 135 | + |
| 136 | +print(f"decode tok/sec={(generation_len - 2) / (ft - st)}") |
| 137 | +print(f"input\n{prompt}\noutput\n{tokenizer.decode(all_outputs)}") |