quic
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
Lines changed: 67 additions & 26 deletions
diff --git a/QEfficient/utils/check_ccl_specializations.py b/QEfficient/utils/check_ccl_specializations.py
Lines changed: 1 addition & 8 deletions
diff --git a/examples/ccl_gpt_oss.py b/examples/ccl_gpt_oss.py
Lines changed: 13 additions & 9 deletions
@@ -909,7 +909,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -932,11 +932,10 @@ def __init__(
         self.model = model
         self.config = model.config
 
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config)
-
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs)
         self.continuous_batching = continuous_batching
+        self.ccl_enabled = ccl_enabled
         self.input_shapes, self.output_names = None, None
 
     @property
@@ -955,7 +954,7 @@ def model_name(self) -> str:
         return mname
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -980,11 +979,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
+
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1090,6 +1091,8 @@ def compile(
         compile_dir: Optional[str] = None,
         *,
         prefill_seq_len: Optional[int] = None,
+        comp_ctx_lengths_prefill: Optional[List[int]] = None,
+        comp_ctx_lengths_decode: Optional[List[int]] = None,
         ctx_len: Optional[int] = None,
         batch_size: int = 1,
         full_batch_size: Optional[int] = None,
@@ -1174,10 +1177,21 @@ def compile(
 
         output_names = self.model.get_output_names(kv_offload=True)
 
+        # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
+        if self.ccl_enabled:
+            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
+                logger.warning(
+                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
+                )
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
+
         # For supporting VLLM and Disaggregated with CCL
-        if "comp_ctx_lengths_prefill" in compiler_options:
-            self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill")
-            self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode")
+        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+            self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
+            self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
 
         specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
@@ -1600,7 +1614,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -1622,8 +1636,6 @@ def __init__(
             raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
         super().__init__(model, **kwargs)
 
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config)
-
         # to handle internvl models
         if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"):
             self.model.config.llm_config.use_cache = True
@@ -1635,12 +1647,12 @@ def __init__(
             else:
                 self.model.config.use_cache = True
         self.hash_params["qeff_auto_class"] = self.__class__.__name__
+        self.ccl_enabled = ccl_enabled
 
     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
-        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1671,6 +1683,8 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
+
         from transformers import AutoConfig
 
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
@@ -1681,7 +1695,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1725,6 +1739,8 @@ def compile(
         *,
         prefill_seq_len: Optional[int] = None,
         ctx_len: Optional[int] = None,
+        comp_ctx_lengths_prefill: Optional[List[int]] = None,
+        comp_ctx_lengths_decode: Optional[List[int]] = None,
         batch_size: int = 1,
         full_batch_size: Optional[int] = None,
         kv_cache_batch_size: Optional[int] = None,
@@ -1794,10 +1810,21 @@ def compile(
         kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size
         output_names = self.model.get_output_names()
 
+        # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
+        if self.ccl_enabled:
+            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
+                logger.warning(
+                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
+                )
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
+
         # For supporting VLLM and Disaggregated with CCL
-        if "comp_ctx_lengths_prefill" in compiler_options:
-            self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill")
-            self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode")
+        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+            self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
+            self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
 
         # Get specializations from modelling file
         # TODO: expose this via the auto class as well
@@ -2180,7 +2207,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2204,10 +2231,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, qaic_config=qaic_config, **kwargs
+                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2257,14 +2284,15 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(
             model,
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2317,6 +2345,7 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2363,8 +2392,6 @@ def __init__(
         # Set use_cache=True to get KV values as output during ONNX export
         model.config.use_cache = True
 
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config)
-
         super().__init__(model, qaic_config=qaic_config, **kwargs)
         self.num_layers = model.config.num_hidden_layers
         self.continuous_batching = continuous_batching
@@ -2373,6 +2400,7 @@ def __init__(
         self.is_tlm = transformed
 
         self.hash_params["qeff_auto_class"] = self.__class__.__name__
+        self.ccl_enabled = ccl_enabled
 
         # ---Sampling---
         # Note: SamplerTransform should be applied after all other transforms
@@ -2465,6 +2493,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
@@ -2478,14 +2507,15 @@ def from_pretrained(
                 model,
                 kv_offload=kv_offload,
                 pretrained_model_name_or_path=pretrained_model_name_or_path,
-                qaic_config=qaic_config,
+                ccl_enabled=ccl_enabled,
                 **kwargs,
             )
         return cls(
             model,
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2814,6 +2844,8 @@ def compile(
         *,
         prefill_seq_len: int = 32,
         ctx_len: int = 128,
+        comp_ctx_lengths_prefill: Optional[List[int]] = None,
+        comp_ctx_lengths_decode: Optional[List[int]] = None,
         batch_size: int = 1,
         full_batch_size: Optional[int] = None,
         kv_cache_batch_size: Optional[int] = None,
@@ -2905,10 +2937,19 @@ def compile(
 
         """
 
+        # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
+        if self.ccl_enabled:
+            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
+                logger.warning(
+                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
+                )
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
+
         # For supporting VLLM and Disaggregated with CCL
-        if "comp_ctx_lengths_prefill" in compiler_options and "comp_ctx_lengths_decode" in compiler_options:
-            comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill")
-            comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode")
+        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
             if isinstance(comp_ctx_lengths_prefill, str):
                 import ast
 
 
@@ -6,14 +6,7 @@
 # -----------------------------------------------------------------------------
 
 
-def process_ccl_specializations(qaic_config):
-    if qaic_config is None:
-        return None, None
-    ccl_prefill = qaic_config.pop("comp_ctx_lengths_prefill", None)
-    ccl_decode = qaic_config.pop("comp_ctx_lengths_decode", None)
-    ctx_len = qaic_config.pop("ctx_len", None)
-    prefill_seq_len = qaic_config.pop("prefill_seq_len", 128)
-
+def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
     if ccl_prefill is None or ccl_decode is None:
         return None, None
 
 
@@ -11,26 +11,28 @@
 
 model_id = "openai/gpt-oss-20b"  # weights are not required to convert to fp32
 
+## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
+## Pass the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode arguments to compile() to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the model will run with its default context length.
+##   - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
+##           -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
+##   - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
+##           -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
+##           -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+
 ctx_len = 4096
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
 # Set the list of ccl during prefilling process
-comp_ctx_lengths_prefill = [512, ctx_len]
+comp_ctx_lengths_prefill = [512, ctx_len] #None #
 # Set the list of ccl during decoding process
-comp_ctx_lengths_decode = [512, ctx_len]
+comp_ctx_lengths_decode = [512, ctx_len] #None #
 
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    qaic_config={
-        "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill,
-        "comp_ctx_lengths_decode": comp_ctx_lengths_decode,
-        "ctx_len": ctx_len,
-        "prefill_seq_len": 1,  # Passing prefill_seq_len is mandatory for CCL goal in moe models. Currently we can get best perf using PL=1.
-    },
+    ccl_enabled=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-onnx_model_path = qeff_model.export()
 qpc_path = qeff_model.compile(
     prefill_seq_len=1,  # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on.
     ctx_len=ctx_len,
@@ -41,6 +43,8 @@
     mos=1,
     aic_enable_depth_first=True,
     num_speculative_tokens=None,
+    comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
+    comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
 print(f"qpc path is {qpc_path}")
 streamer = TextStreamer(tokenizer)