# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import time

import numpy as np
import torch
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.cloud_infer import QAICInferenceSession

model_id = "openai/gpt-oss-120b"  # the weights do not need to be converted to fp32

prompt = """
Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures.

As Alex flipped through the pages, he discovered a map that led to a hidden treasure. Excited by the prospect of a real-life treasure hunt, Alex decided to embark on a thrilling journey. He packed his backpack with snacks, a flashlight, and a compass, and set off into the unknown.

The path to the treasure was not an easy one. Alex had to navigate through dense forests, cross rickety bridges, and solve riddles that guarded the treasure's location.
"""
all_outputs = []
# Tokenize the prompt and work out the chunking parameters for prefill
tokenizer = AutoTokenizer.from_pretrained(model_id)
PREFILL_SEQ_LEN = 128
CTX_LEN = 2 * 128
inputs = tokenizer(prompt, return_tensors="np", padding=True)
position_ids = inputs["attention_mask"].sum(1, keepdims=True)
padded_len = inputs["input_ids"].shape[1]
num_chunks = -(padded_len // -PREFILL_SEQ_LEN)  # ceil divide without float
padded_len = num_chunks * PREFILL_SEQ_LEN  # round up to a multiple of PREFILL_SEQ_LEN

# Initialize variables specific to this request.
# The maximum generation length is whatever context remains after the prompt.
max_gen_len = CTX_LEN - position_ids.max()
generation_len = max_gen_len


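# Load the model through QEfficient's causal-LM wrapper. The commented-out
# variant below (presumably for quick functional testing) truncates the model
# to 2 hidden layers so that export and compilation finish much faster.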
# qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2)
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id)


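# Compile a decode-only QPC first: prefill_seq_len=1 means this program
# processes one token per step. offload_pt_weights=False keeps the PyTorch
# weights in host memory so the prefill model can still be exported below.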
decode_qpc_path = qeff_model.compile(
    prefill_seq_len=1,
    ctx_len=CTX_LEN,
    num_cores=16,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    num_devices=1,
    mos=1,
    aic_enable_depth_first=True,
    num_speculative_tokens=None,
    offload_pt_weights=False,  # keep the weights in memory for the prefill-model export/compilation below
)

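# Build example inputs for the prefill export. gpt-oss alternates
# sliding-window attention (even layers) and full attention (odd layers), so
# the dummy KV-cache length differs per layer; the (1, 8, cache_len, 64)
# shape matches this model's KV-head count and head dimension.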
config = qeff_model.model.config
inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len)
inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1)
inputs.pop("token_type_ids", None)
inputs = {k: torch.from_numpy(v) for k, v in inputs.items()}
past_key_values = []
for i in range(config.num_hidden_layers):
    cache_len = config.sliding_window if i % 2 == 0 else PREFILL_SEQ_LEN
    pad_shape = (1, 8, cache_len, 64)
    past_key = torch.zeros(pad_shape, dtype=torch.float32)
    past_value = torch.zeros(pad_shape, dtype=torch.float32)
    past_key_values.append((past_key, past_value))
inputs["past_key_values"] = past_key_values

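# Compile the prefill QPC: prefill_only drops the decode path,
# enable_chunking lets a long prompt be fed in PREFILL_SEQ_LEN-sized pieces,
# and use_onnx_subfunctions exports repeated blocks as ONNX functions to
# keep the exported graph compact.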
prefill_qpc_path = qeff_model.compile(
    prefill_seq_len=PREFILL_SEQ_LEN,
    ctx_len=CTX_LEN,
    num_cores=16,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    num_devices=1,
    mos=1,
    aic_enable_depth_first=True,
    num_speculative_tokens=None,
    prefill_only=True,
    enable_chunking=True,
    use_onnx_subfunctions=True,
    offload_pt_weights=False,
)
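# Load the prefill QPC onto the device(s). The device_ids below assume a
# particular device topology (IDs 32-47); adjust for your setup. Skipping the
# past_* buffers lets the KV cache live in device memory as retained state
# instead of being copied to and from the host on every run.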
print("loading qpc")
st = time.time()
prefill_session = QAICInferenceSession(prefill_qpc_path, device_ids=list(range(32, 48)))
print(f"time for loading session = {time.time() - st}")
print("done")
prefill_session.skip_buffers(
    [x for x in prefill_session.input_names + prefill_session.output_names if x.startswith("past_")]
)
logits_out_placeholder = np.zeros((1, 1, 201088), dtype=np.float32)
prefill_session.set_buffers({"logits": logits_out_placeholder})
inputs.pop("past_key_values")
inputs = {k: v.detach().numpy() for k, v in inputs.items()}
st = time.time()

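# Chunked prefill: feed the padded prompt PREFILL_SEQ_LEN tokens at a time.
# Each chunk updates the retained KV state on device; only the last chunk's
# logits matter for picking the first generated token.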
for i in range(num_chunks):
    chunk_inputs = inputs.copy()
    chunk_inputs["input_ids"] = inputs["input_ids"][:, i * PREFILL_SEQ_LEN : (i + 1) * PREFILL_SEQ_LEN]
    chunk_inputs["position_ids"] = inputs["position_ids"][:, i * PREFILL_SEQ_LEN : (i + 1) * PREFILL_SEQ_LEN]
    ins = time.time()
    qpc_out = prefill_session.run(chunk_inputs)
    print(f"time for this run={time.time() - ins}")
print(f"time for prefill_run={time.time() - st} sec\n")

decode_session = QAICInferenceSession(decode_qpc_path)
decode_session.set_buffers({"logits": logits_out_placeholder})

decode_inputs = {
    "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1),
    "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1,
}
print("pos_id for decode", decode_inputs["position_ids"])

all_outputs.append(decode_inputs["input_ids"][0][0])
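# Seed the decode session's KV cache from the prefill outputs. For the
# sliding-window (even) layers, once the current position exceeds
# sliding_window the retained cache behaves as a ring buffer, so the last
# sliding_window entries are sliced out and rotated back into contiguous
# order before being handed to the decode program.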
for i in range(config.num_hidden_layers):
    if i % 2 == 0 and decode_inputs["position_ids"] >= config.sliding_window:
        last_valid_pos_idx = decode_inputs["position_ids"][0][0]
        first_valid_pos_idx = last_valid_pos_idx - config.sliding_window
        k = qpc_out[f"past_key.{i}_RetainedState"][:, :, first_valid_pos_idx:last_valid_pos_idx, :]
        v = qpc_out[f"past_value.{i}_RetainedState"][:, :, first_valid_pos_idx:last_valid_pos_idx, :]
        mod_pos_id = config.sliding_window - decode_inputs["position_ids"][0][0] % config.sliding_window
        decode_inputs[f"past_key.{i}"] = np.concatenate((k[:, :, mod_pos_id:, :], k[:, :, :mod_pos_id, :]), axis=-2)
        decode_inputs[f"past_value.{i}"] = np.concatenate((v[:, :, mod_pos_id:, :], v[:, :, :mod_pos_id, :]), axis=-2)
    else:
        decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"]
        decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"]

st = time.time()
decode_out = decode_session.run(decode_inputs)
print(f"time for first run of decode with KV as input = {time.time() - st} sec\n")
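# After this first run has seeded the device-side KV cache, skip the past_*
# buffers so subsequent decode steps reuse the retained state on device
# rather than shuttling the cache across the host interface each step.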
decode_session.skip_buffers(
    [x for x in decode_session.input_names + decode_session.output_names if x.startswith("past_")]
)
pos_id = np.max(decode_inputs["position_ids"]).reshape(1, 1) + 1
st = time.time()
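# Greedy decode loop: each step feeds back the argmax token from the previous
# step's logits. The first token came from prefill and the loop's first
# iteration consumes the seeded decode run's logits, hence generation_len - 2
# remaining steps.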
for i in range(generation_len - 2):
    loop_decode_inputs = {
        "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1),
        "position_ids": pos_id,
    }
    all_outputs.append(loop_decode_inputs["input_ids"][0][0])
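    # Optional early exit (an addition, not part of the original flow): stop
    # once the tokenizer's EOS token is emitted. Note that breaking early
    # skews the per-step timing print below, which divides by the full
    # generation_len - 2.
    if tokenizer.eos_token_id is not None and loop_decode_inputs["input_ids"][0][0] == tokenizer.eos_token_id:
        break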
    decode_out = decode_session.run(loop_decode_inputs)
    pos_id += 1


print(f"avg time per decode step = {(time.time() - st) / (generation_len - 2)} sec")
print(all_outputs)
print(tokenizer.decode(all_outputs))