
Commit 21b5496

added support for prefix caching for gpt-oss
Signed-off-by: Onkar Chougule <ochougul@qti.qualcomm.com>
1 parent 16899bb · commit 21b5496

File tree: 6 files changed (+80, -40 lines)


QEfficient/base/modeling_qeff.py

Lines changed: 2 additions & 2 deletions
@@ -511,8 +511,8 @@ class FeatureNotAvailableError(Exception):
 
             exec_command = f'QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" {" ".join(command)}'
             raise FeatureNotAvailableError(
-                f"ONNX graph is exported with subfunctions, assert version of apps SDK should be used for compiling this model. \
-                    Run following command manually with assert compiler:\n{exec_command}"
+                "ONNX graph is exported with subfunctions, assert version of apps SDK should be used for compiling this model."
+                + f"\nRun following command manually with assert compiler:\n{exec_command}"
             )
         try:
             subprocess.run(command, capture_output=True, check=True)
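
Note on the change above: the original f-string used a backslash line continuation inside the string literal, which bakes the source indentation into the error message; the explicit concatenation keeps the message clean. A minimal standalone illustration (plain Python, not QEfficient code; the command string is a placeholder):

exec_command = "qaic-compile --some-flags model.onnx"  # placeholder, not a real command line

# Old style: the continuation keeps the leading spaces of the next source line in the message.
old_msg = f"ONNX graph is exported with subfunctions. \
        Run following command manually:\n{exec_command}"

# New style: explicit concatenation with a newline, no stray indentation.
new_msg = "ONNX graph is exported with subfunctions." + f"\nRun following command manually:\n{exec_command}"

print(repr(old_msg))  # '...subfunctions.         Run following command...'
print(repr(new_msg))  # '...subfunctions.\nRun following command...'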

QEfficient/base/onnx_transforms.py

Lines changed: 14 additions & 0 deletions
@@ -12,6 +12,16 @@
 from onnx import ModelProto, external_data_helper, numpy_helper
 
 from QEfficient.customop.ctx_scatter_gather import CtxGather, CtxGatherFunc, CtxScatter, CtxScatterFunc
+from QEfficient.customop.ctx_scatter_gather_cb import (
+    CtxGatherCB,
+    CtxGatherCB3D,
+    CtxGatherFuncCB,
+    CtxGatherFuncCB3D,
+    CtxScatterCB,
+    CtxScatterCB3D,
+    CtxScatterFuncCB,
+    CtxScatterFuncCB3D,
+)
 from QEfficient.customop.rms_norm import CustomRMSNorm, CustomRMSNormFunc
 
 
@@ -114,6 +124,10 @@ class CustomOpTransform(OnnxTransform):
         "CustomRMSNormFunc": (CustomRMSNormFunc, CustomRMSNorm),
         "CtxScatterFunc": (CtxScatterFunc, CtxScatter),
         "CtxGatherFunc": (CtxGatherFunc, CtxGather),
+        "CtxScatterFuncCB3D": (CtxScatterFuncCB3D, CtxScatterCB3D),
+        "CtxGatherFuncCB3D": (CtxGatherFuncCB3D, CtxGatherCB3D),
+        "CtxScatterFuncCB": (CtxScatterFuncCB, CtxScatterCB),
+        "CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
     }
 
     @classmethod
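
For context, each entry in this table pairs the torch.autograd.Function name that appears in the traced graph with the custom-op definition that replaces it in the exported ONNX model; the new CB (continuous-batching) and CB3D variants are what the batch_index-aware cache updates below rely on. A minimal, self-contained sketch of the lookup pattern only (placeholder classes, not the QEfficient implementations):

# Illustrative only: op name seen during export -> (Function used in tracing, replacement op).
class FakeCtxGatherFuncCB: ...
class FakeCtxGatherCB: ...

_custom_ops = {"CtxGatherFuncCB": (FakeCtxGatherFuncCB, FakeCtxGatherCB)}

func_cls, op_cls = _custom_ops["CtxGatherFuncCB"]
print(func_cls.__name__, op_cls.__name__)  # FakeCtxGatherFuncCB FakeCtxGatherCB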

QEfficient/customop/ctx_scatter_gather_cb.py

Lines changed: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ class CtxGatherFuncCB(torch.autograd.Function):
     def forward(data: torch.Tensor, batch_index: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
         batch_indices = batch_index.view(-1, 1, 1)
         head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
+        ctx_indices = torch.where(ctx_indices >= data.shape[2], 0, ctx_indices)
         return data[batch_indices, head_indices, ctx_indices]
 
     @staticmethod
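
The added torch.where guards the gather against out-of-range context indices: slots marked with the int32-max sentinel (see InvalidIndexProvider in cache_utils.py below) would otherwise index past the cache length, so they are clamped to 0 and the gathered values at those slots are discarded later by the invalid mask. A minimal standalone sketch of the same pattern, with toy shapes:

import torch

# Toy cache: [batch, heads, ctx_len, head_dim]
data = torch.arange(2 * 1 * 4 * 1, dtype=torch.float32).view(2, 1, 4, 1)
batch_index = torch.tensor([1])                        # pick a row of the full batch
ctx_indices = torch.tensor([[[0, 1, 2, 2147483647]]])  # last slot carries the "invalid" sentinel

# Same guard as in CtxGatherFuncCB.forward: clamp out-of-range indices to 0.
ctx_indices = torch.where(ctx_indices >= data.shape[2], 0, ctx_indices)

batch_indices = batch_index.view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
out = data[batch_indices, head_indices, ctx_indices]
print(out.shape)  # torch.Size([1, 1, 4, 1]); the clamped slot just repeats position 0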

QEfficient/transformers/cache_utils.py

Lines changed: 43 additions & 13 deletions
@@ -44,6 +44,7 @@ def _get_invalid_idx_value(cls):
         """
         if torch.onnx.is_in_onnx_export():
             if cls.SUBFUNC_ENABLED:
+                # TODO: should not return 0 remove this if condition, it can hurt perf
                 return 0
             else:
                 return torch.iinfo(torch.int32).max
@@ -722,9 +723,22 @@ def full_cache_update_chunked(
         cache_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         position_ids = cache_kwargs.get("position_ids")
+        batch_index = cache_kwargs.get("batch_index")
+        invalid_idx_value = InvalidIndexProvider._get_invalid_idx_value()
 
-        self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
-        self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
+        # Scatter
+        if batch_index is not None:
+            if torch.onnx.is_in_onnx_export():
+                scatter_position_ids = torch.where(position_ids < 0, torch.iinfo(torch.int32).max, position_ids)
+            self.key_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.key_cache[layer_idx], batch_index, scatter_position_ids, key_states
+            )
+            self.value_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.value_cache[layer_idx], batch_index, scatter_position_ids, value_states
+            )
+        else:
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
 
@@ -733,11 +747,13 @@ def full_cache_update_chunked(
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
-
-        invalid_idx_value = InvalidIndexProvider._get_invalid_idx_value()
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-        k_out = CtxGatherFunc.apply(k_out, ctx_indices, ctx_len)
-        v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)
+        if batch_index is not None:
+            k_out = CtxGatherFuncCB.apply(k_out, batch_index, ctx_indices, ctx_len)
+            v_out = CtxGatherFuncCB.apply(v_out, batch_index, ctx_indices, ctx_len)
+        else:
+            k_out = CtxGatherFunc.apply(k_out, ctx_indices, ctx_len)
+            v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
 
         return k_out, v_out
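
In short: without batch_index the chunked update scatters and gathers along the per-request batch dimension directly, while with continuous batching the cache is allocated for full_batch_size requests and batch_index selects which cache rows the current chunk belongs to. A minimal standalone sketch of that batch-indexed scatter (plain torch, toy shapes; not the CtxScatterFuncCB implementation itself):

import torch

full_batch_size, heads, ctx_len, head_dim = 4, 2, 8, 1
key_cache = torch.zeros(full_batch_size, heads, ctx_len, head_dim)

# One decode-batch row being updated: the request sits in cache slot 2 and
# writes new keys for positions 5..6 of its context.
batch_index = torch.tensor([[2]])               # [batch, 1]
position_ids = torch.tensor([[5, 6]])           # [batch, seq_len]
key_states = torch.ones(1, heads, 2, head_dim)  # [batch, heads, seq_len, head_dim]

# Scatter into the full-batch cache (same indexing idea as CtxScatterFuncCB).
b = batch_index.view(-1, 1, 1)
h = torch.arange(heads).view(1, -1, 1)
p = position_ids.unsqueeze(1)                   # [batch, 1, seq_len]
key_cache[b, h, p] = key_states
print(key_cache[2, 0, :, 0])  # positions 5 and 6 are now 1.0, the rest stay 0.0
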
@@ -750,26 +766,40 @@ def sliding_window_update_chunked(
         cache_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         position_ids = cache_kwargs.get("position_ids")
+        batch_index = cache_kwargs.get("batch_index")
+        invalid_idx_value = InvalidIndexProvider._get_invalid_idx_value()
 
-        self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
-        self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
+        if batch_index is not None:
+            if torch.onnx.is_in_onnx_export():
+                scatter_position_ids = torch.where(position_ids < 0, torch.iinfo(torch.int32).max, position_ids)
+            self.key_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.key_cache[layer_idx], batch_index, scatter_position_ids, key_states
+            )
+            self.value_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.value_cache[layer_idx], batch_index, scatter_position_ids, value_states
+            )
+        else:
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         sliding_window_len = cache_kwargs.get("sliding_window")
+
         # Gather
         ctx_len = position_ids.shape[1] + sliding_window_len
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
-        # positive_pos_ids = torch.where(position_ids<0, 0, position_ids)
         first_pos_idx = position_ids[0][0]
         add_idx = torch.where(first_pos_idx >= sliding_window_len, first_pos_idx - sliding_window_len, 0)
         ctx_indices += add_idx
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
-
-        invalid_idx_value = InvalidIndexProvider._get_invalid_idx_value()
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-        k_out = CtxGatherFunc.apply(k_out, ctx_indices, ctx_len)
-        v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)
+        if batch_index is not None:
+            k_out = CtxGatherFuncCB.apply(k_out, batch_index, ctx_indices, ctx_len)
+            v_out = CtxGatherFuncCB.apply(v_out, batch_index, ctx_indices, ctx_len)
+        else:
+            k_out = CtxGatherFunc.apply(k_out, ctx_indices, ctx_len)
+            v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
 
         return k_out, v_out
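
The sliding-window variant gathers ctx_len = seq_len + sliding_window slots and shifts the window forward once the chunk's first position has moved past the window size, so only the most recent cached entries plus the current chunk are read. A standalone numeric sketch of that index arithmetic, with an illustrative chunk and window size:

import torch

sliding_window = 4
position_ids = torch.tensor([[6, 7]])   # current chunk covers positions 6..7
seq_len = position_ids.shape[1]

ctx_len = seq_len + sliding_window      # 6 gathered slots per token row
ctx_indices = torch.arange(ctx_len)[None, None, ...]

first_pos = position_ids[0][0]
# Shift the window forward once the first position has passed the window size.
add_idx = torch.where(first_pos >= sliding_window, first_pos - sliding_window, 0)
ctx_indices = ctx_indices + add_idx
print(ctx_indices)  # tensor([[[2, 3, 4, 5, 6, 7]]]) -> last 4 cached slots plus the new chunk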

QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 8 additions & 12 deletions
@@ -92,8 +92,7 @@ def forward(self, hidden: torch.Tensor):
         down_out = (intermediate @ W_d) + b_d  # [T, H]
 
         # Apply routing weights and accumulate
-        masked_down = torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(expert_out))
-        expert_out += masked_down
+        expert_out += down_out * routing_weight
 
         # original shape [B, S, H]
         return expert_out.view(B, S, H), router_logits
@@ -148,8 +147,7 @@ def forward(self, hidden: torch.Tensor):
         down_out = (intermediate @ W_d) + b_d  # [T, H]
 
         # Apply routing weights and accumulate
-        masked_down = torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(expert_out))
-        expert_out += masked_down
+        expert_out += down_out * routing_weight
 
         # original shape [B, S, H]
         return expert_out.view(B, S, H), router_logits
@@ -221,8 +219,7 @@ def blocked_ffn_forward(self, hidden: torch.Tensor):
         down_out = torch.cat(outs, dim=0)
 
         # Apply routing weights and accumulate
-        masked_down = torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(expert_out))
-        expert_out += masked_down
+        expert_out += down_out * routing_weight
 
         # original shape [B, S, H]
         return expert_out.view(B, S, H), router_logits
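
The removed torch.where only masked contributions from experts whose routing weight is zero; multiplying by a zero routing weight already contributes nothing, so the simpler accumulation is equivalent (up to signed-zero corner cases). A quick standalone check:

import torch

down_out = torch.randn(3, 4)
routing_weight = torch.tensor([[0.7], [0.0], [0.3]])  # middle row: expert not selected

old = torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(down_out))
new = down_out * routing_weight

print(torch.allclose(old, new))  # True: a zero weight already zeroes the contribution
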
@@ -1296,16 +1293,15 @@ def forward(
             router_logits=outputs.router_logits,
         )
 
-    def get_pkv_dynamic_axes(
-        self,
-        retain_full_kv: Optional[bool] = False,
-    ):
+    def get_pkv_dynamic_axes(self, retain_full_kv: Optional[bool] = False, continuous_batching: Optional[bool] = False):
         pkv_dynamic_axes = []
         for layer_type in self.config.layer_types:
             if layer_type == "sliding_attention" and not retain_full_kv:
-                pkv_dynamic_axes.append({0: "batch_size", 2: "sliding_window"})
+                pkv_dynamic_axes.append(
+                    {0: "full_batch_size" if continuous_batching else "batch_size", 2: "sliding_window"}
+                )
             else:
-                pkv_dynamic_axes.append({0: "batch_size", 2: "ctx_len"})
+                pkv_dynamic_axes.append({0: "full_batch_size" if continuous_batching else "batch_size", 2: "ctx_len"})
         return pkv_dynamic_axes
 
     def get_specializations(
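
With continuous_batching=True the batch axis of every past key/value input is exported as full_batch_size instead of batch_size, while the context axis stays sliding_window for sliding-attention layers and ctx_len for full-attention layers. A small standalone sketch of the resulting dynamic-axes mapping for a hypothetical two-layer configuration (layer types here are illustrative):

from typing import Dict, List

def pkv_dynamic_axes(layer_types: List[str], continuous_batching: bool) -> List[Dict[int, str]]:
    # Mirrors the logic above for illustration; not the QEfficient method itself.
    batch_axis = "full_batch_size" if continuous_batching else "batch_size"
    return [
        {0: batch_axis, 2: "sliding_window" if lt == "sliding_attention" else "ctx_len"}
        for lt in layer_types
    ]

print(pkv_dynamic_axes(["sliding_attention", "full_attention"], continuous_batching=True))
# [{0: 'full_batch_size', 2: 'sliding_window'}, {0: 'full_batch_size', 2: 'ctx_len'}]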

QEfficient/transformers/models/modeling_auto.py

Lines changed: 12 additions & 13 deletions
@@ -2592,7 +2592,6 @@ def export(
             self.model.config, fbs if self.continuous_batching else bs, seq_len
         )
         if prefill_only:
-            assert not self.continuous_batching, "prefill_only=True is not supported with continuous_batching=True"
             self.prefill(enable=True, enable_chunking=kwargs.get("enable_chunking", False))
             self.hash_params.pop("retain_full_kv", None)
             seq_len = (
@@ -2666,7 +2665,8 @@ def export(
         pkv_dynamic_axes = (
             self.model.get_pkv_dynamic_axes(
                 retain_full_kv=kwargs.get("retain_full_kv", False)
-                or (prefill_only and kwargs.get("enable_chunking", False))
+                or (prefill_only and kwargs.get("enable_chunking", False)),
+                continuous_batching=self.continuous_batching,
             )
             if hasattr(self.model, "get_pkv_dynamic_axes")
             else pkv_dynamic_axes
@@ -2678,7 +2678,6 @@ def export(
         )
 
         for i in range(self.num_layers):
-            pkv_dynamic_axes[i][0] = "full_batch_size" if self.continuous_batching else "batch_size"
             for kv in ["key", "value"]:
                 example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
                 dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes[i]
@@ -3030,15 +3029,6 @@ def compile(
         if self.is_tlm:
            num_speculative_tokens = self.check_and_get_num_speculative_tokens(num_speculative_tokens, prefill_seq_len)
 
-        if self.continuous_batching and full_batch_size is None:
-            raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
-
-        if kv_cache_batch_size and not full_batch_size:
-            raise ValueError(
-                "KV caching requires continuous batching. Please set `full_batch_size` and "
-                "enable `continuous_batching=True` in `from_pretrained`."
-            )
-
         if (
             self.model.qaic_config is not None
             and self.model.qaic_config.get("include_sampler", False)
@@ -3048,7 +3038,9 @@ def compile(
             raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.")
 
         if kv_cache_batch_size and prefill_only is not None and prefill_only:
-            logger.warning("kv_cache_batch_size will be ignored as prefill_only is set to True")
+            logger.warning(
+                "kv_cache_batch_size will be ignored as prefill_only is set to True unless this is GPTOSS model"
+            )
 
         # Infer kv_cache_batch_size if not provided
         kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size
@@ -3086,6 +3078,13 @@ def compile(
         )
 
         if prefill_only is None or not prefill_only:
+            if self.continuous_batching and full_batch_size is None:
+                raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
+            if kv_cache_batch_size and not full_batch_size:
+                raise ValueError(
+                    "KV caching requires continuous batching. Please set `full_batch_size` and "
+                    "enable `continuous_batching=True` in `from_pretrained`."
+                )
             if self.comp_ctx_lengths_decode is not None:
                 # Adding elements from self.comp_ctx_lengths_decode to decode_specialization
                 for i in range(0, len(self.comp_ctx_lengths_decode)):
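
Taken together, these changes drop the earlier restriction that a prefill-only export could not be combined with continuous batching, and defer the full_batch_size / kv_cache_batch_size validation to the decode path. A hedged usage sketch of what a prefix-caching style prefill compile for gpt-oss might look like; the model card, sizes, and exact keyword combination are illustrative assumptions, not taken from this commit:

from QEfficient import QEFFAutoModelForCausalLM

# Illustrative model card and sizes; adjust to your setup.
model = QEFFAutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",      # assumed model card
    continuous_batching=True,  # KV cache laid out per full_batch_size slot
)

# Prefill-only compile: with this commit, continuous batching is allowed here and
# kv_cache_batch_size sizes the shared cache that prefix caching scatters into.
model.compile(
    prefill_seq_len=128,
    ctx_len=4096,
    num_cores=16,
    num_devices=1,
    prefill_only=True,       # flag used in the compile path shown above
    kv_cache_batch_size=4,   # assumed to size the shared KV cache rows
)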
