From d58736d54a52fdceb5e0a9c60b051e4b6e7194bb Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Wed, 19 Nov 2025 15:01:38 -0800 Subject: [PATCH 01/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- .../transformers/models/modeling_auto.py | 94 ++++++++++++++----- QEfficient/utils/check_ccl_specializations.py | 9 +- .../compute_context_length/gemma3.py | 30 +++--- .../compute_context_length/gpt_oss.py | 22 +++-- .../compute_context_length/granite_vision.py | 11 ++- .../compute_context_length/internvl.py | 27 +++--- .../compute_context_length/llama4.py | 20 ++-- .../compute_context_length/llama4_cb.py | 24 +++-- .../llama4_multi_image.py | 16 +++- .../compute_context_length/mistral3.py | 14 ++- .../compute_context_length/molmo.py | 22 +++-- .../compute_context_length/qwen2_5_vl.py | 22 +++-- .../compute_context_length/qwen2_5_vl_cb.py | 19 ++-- .../ccl_qwen3moe_inference.py | 24 +++-- 14 files changed, 231 insertions(+), 123 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index f3618cb1e..8a4578737 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -914,7 +914,7 @@ def __init__( self, model: nn.Module, continuous_batching: bool = False, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -937,11 +937,10 @@ def __init__( self.model = model self.config = model.config - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) - self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs) self.continuous_batching = continuous_batching + self.ccl_enabled = ccl_enabled self.input_shapes, self.output_names = None, None @property @@ -960,7 +959,7 @@ def 
model_name(self) -> str: return mname @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): """ Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path. @@ -985,11 +984,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -1095,6 +1096,8 @@ def compile( compile_dir: Optional[str] = None, *, prefill_seq_len: Optional[int] = None, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, ctx_len: Optional[int] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, @@ -1179,10 +1182,21 @@ def compile( output_names = self.model.get_output_names(kv_offload=True) + # if ccl_enabled is True read Compute-Context-Length lists + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." 
+ ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options: - self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: + self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill + self.comp_ctx_lengths_decode = comp_ctx_lengths_decode specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -1605,7 +1619,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal def __init__( self, model: nn.Module, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -1630,9 +1644,12 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) +<<<<<<< HEAD self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) self.model.qaic_config = qaic_config +======= +>>>>>>> 29b555b (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process) # to handle internvl models if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): self.model.config.llm_config.use_cache = True @@ -1644,6 +1661,7 @@ def __init__( else: self.model.config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ + self.ccl_enabled = ccl_enabled if self.model.qaic_config is not None and self.model.qaic_config.get("num_kv_blocks", None) is not None: BlockedKVAttentionTransform.apply(self.model, num_kv_blocks=self.model.qaic_config.get("num_kv_blocks")) @@ -1652,7 +1670,6 @@ def __init__( def 
from_pretrained( cls, pretrained_model_name_or_path, - qaic_config: Optional[dict] = None, *args, **kwargs, ): @@ -1683,6 +1700,8 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) + from transformers import AutoConfig config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) @@ -1693,7 +1712,7 @@ def from_pretrained( return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -1737,6 +1756,8 @@ def compile( *, prefill_seq_len: Optional[int] = None, ctx_len: Optional[int] = None, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, kv_cache_batch_size: Optional[int] = None, @@ -1806,10 +1827,21 @@ def compile( kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size output_names = self.model.get_output_names() + # if ccl_enabled is True read Compute-Context-Length lists + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." 
+ ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options: - self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: + self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill + self.comp_ctx_lengths_decode = comp_ctx_lengths_decode # Get specializations from modelling file # TODO: expose this via the auto class as well @@ -2192,7 +2224,7 @@ def __new__( model: nn.Module, kv_offload: Optional[bool] = True, continuous_batching: bool = False, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -2216,10 +2248,10 @@ def __new__( """ if kv_offload: return _QEffAutoModelForImageTextToTextDualQPC( - model, continuous_batching, qaic_config=qaic_config, **kwargs + model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs ) else: - return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs) + return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs) @classmethod @with_replaced_quantizers @@ -2269,6 +2301,7 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( @@ -2276,7 +2309,7 @@ def from_pretrained( kv_offload=kv_offload, continuous_batching=continuous_batching, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -2327,6 +2360,7 @@ def __init__( model: 
nn.Module, continuous_batching: bool = False, qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -2374,8 +2408,6 @@ def __init__( # Set use_cache=True to get KV values as output during ONNX export model.config.use_cache = True - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) - super().__init__(model, qaic_config=qaic_config, **kwargs) self.num_layers = model.config.num_hidden_layers self.continuous_batching = continuous_batching @@ -2384,6 +2416,7 @@ def __init__( self.is_tlm = transformed self.hash_params["qeff_auto_class"] = self.__class__.__name__ + self.ccl_enabled = ccl_enabled # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms @@ -2479,6 +2512,7 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kv_offload = kwargs.pop("kv_offload", None) + ccl_enabled = kwargs.pop("ccl_enabled", None) kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) @@ -2492,7 +2526,7 @@ def from_pretrained( model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) return cls( @@ -2500,6 +2534,7 @@ def from_pretrained( continuous_batching=continuous_batching, qaic_config=qaic_config, pretrained_model_name_or_path=pretrained_model_name_or_path, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -2828,6 +2863,8 @@ def compile( *, prefill_seq_len: int = 32, ctx_len: int = 128, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, kv_cache_batch_size: Optional[int] = None, @@ -2919,10 +2956,19 @@ def compile( """ + # if ccl_enabled is True read Compute-Context-Length lists + self.comp_ctx_lengths_prefill, 
self.comp_ctx_lengths_decode = None, None + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." + ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options and "comp_ctx_lengths_decode" in compiler_options: - comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: if isinstance(comp_ctx_lengths_prefill, str): import ast diff --git a/QEfficient/utils/check_ccl_specializations.py b/QEfficient/utils/check_ccl_specializations.py index 308c69554..0d6a078f6 100644 --- a/QEfficient/utils/check_ccl_specializations.py +++ b/QEfficient/utils/check_ccl_specializations.py @@ -6,14 +6,7 @@ # ----------------------------------------------------------------------------- -def process_ccl_specializations(qaic_config): - if qaic_config is None: - return None, None - ccl_prefill = qaic_config.pop("comp_ctx_lengths_prefill", None) - ccl_decode = qaic_config.pop("comp_ctx_lengths_decode", None) - ctx_len = qaic_config.pop("ctx_len", None) - prefill_seq_len = qaic_config.pop("prefill_seq_len", 128) - +def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len): if ccl_prefill is None or ccl_decode is None: return None, None diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index c31b1748a..14d9e59ca 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ 
b/examples/performance/compute_context_length/gemma3.py @@ -20,22 +20,26 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id) -# pass HF_TOKEN if gated model -# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 -comp_ctx_lengths_prefill = [3072] -comp_ctx_lengths_decode = [4096, ctx_len] +comp_ctx_lengths_prefill = [3072] # None # +comp_ctx_lengths_decode = [4096, ctx_len] # None # +# pass HF_TOKEN if gated model +# For running the model in single QPC approach use kv_offload=False. 
For Dual QPC approach use kv_offload=True ### qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, config=config, attn_implementation="eager", kv_offload=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) ### use skip_vision=True, if want to run only text, or false ### @@ -54,7 +58,9 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_27b.yaml", + node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_4b.yaml", + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -90,7 +96,9 @@ mxint8_kv_cache=False, aic_enable_depth_first=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_27b.yaml", + node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_4b.yaml", + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index b211ba914..3b3ce179d 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -11,26 +11,28 @@ model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. 
+## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 4096 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists. # Set the list of ccl during prefilling process -comp_ctx_lengths_prefill = [512, ctx_len] +comp_ctx_lengths_prefill = [512, ctx_len] #None # # Set the list of ccl during decoding process -comp_ctx_lengths_decode = [512, ctx_len] +comp_ctx_lengths_decode = [512, ctx_len] #None # qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - "prefill_seq_len": 1, # Passing prefill_seq_len is mandatory for CCL goal in moe models. Currently we can get best perf using PL=1. - }, + ccl_enabled=True, ) tokenizer = AutoTokenizer.from_pretrained(model_id) -onnx_model_path = qeff_model.export() qpc_path = qeff_model.compile( prefill_seq_len=1, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. 
ctx_len=ctx_len, @@ -41,6 +43,8 @@ mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) print(f"qpc path is {qpc_path}") streamer = TextStreamer(tokenizer) diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py index 39b139bad..507ba11a4 100644 --- a/examples/performance/compute_context_length/granite_vision.py +++ b/examples/performance/compute_context_length/granite_vision.py @@ -20,6 +20,7 @@ def run_model( kv_offload=False, prefill_seq_len=5500, ctx_len=6000, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -40,11 +41,7 @@ def run_model( model_name, token=token, kv_offload=kv_offload, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP - 2 Export & Compile the Model @@ -56,6 +53,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP - 3 Load and process the inputs for Inference @@ -96,6 +95,7 @@ def run_model( num_cores = 16 num_devices = 4 ctx_len = 8192 + ccl_enabled = True comp_ctx_lengths_prefill = [5500] comp_ctx_lengths_decode = [6144, ctx_len] @@ -106,6 +106,7 @@ def run_model( image_url=image_url, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, + ccl_enabled=ccl_enabled, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, generation_len=generation_len, diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py index 827d50c97..bea3b49d3 100644 --- a/examples/performance/compute_context_length/internvl.py 
+++ b/examples/performance/compute_context_length/internvl.py @@ -174,27 +174,21 @@ def run_intern_on_aic( prefill_seq_len=3840, num_devices=1, num_cores=16, + ctx_len=512, + ccl_enabled=False, + comp_ctx_lengths_prefill=None, + comp_ctx_lengths_decode=None, ): ## STEP 1 -- LOAD THE MODEL # The original Intern-VL model, despite being multimodal, is loaded using `AutoModelForCausalLM` in Huggingface. # To maintain compatibility, we load this model using `QEFFAutoModelForCausalLM`. - ctx_len = 8192 - comp_ctx_lengths_prefill = [4096] - comp_ctx_lengths_decode = [6144, ctx_len] - - # model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=kv_offload, trust_remote_code=True) - model = QEFFAutoModelForCausalLM.from_pretrained( model_name, kv_offload=kv_offload, trust_remote_code=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP 2 -- EXPORT & COMPILE THE MODEL @@ -205,6 +199,8 @@ def run_intern_on_aic( ctx_len=ctx_len, prefill_seq_len=prefill_seq_len, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP 3 -- SETUP THE PROCESSOR @@ -263,6 +259,11 @@ def run_intern_on_aic( num_devices = 4 num_cores = 16 + ctx_len = 8192 + ccl_enabled = True + comp_ctx_lengths_prefill = [4096] + comp_ctx_lengths_decode = [6144, ctx_len] + run_intern_on_aic( model_name=model_name, prompt=prompt, @@ -273,6 +274,10 @@ def run_intern_on_aic( prefill_seq_len=prefill_seq_len, num_devices=num_devices, num_cores=num_cores, + ctx_len=ctx_len, + ccl_enabled=ccl_enabled, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py index 534be8f96..cb0cb1939 100644 --- 
a/examples/performance/compute_context_length/llama4.py +++ b/examples/performance/compute_context_length/llama4.py @@ -17,6 +17,14 @@ config.text_config.num_hidden_layers = 4 config.vision_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. 
+ ctx_len = 8192 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [3072] @@ -27,12 +35,8 @@ model_id, attn_implementation="eager", kv_offload=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, config=config, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -54,6 +58,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -95,6 +101,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### @@ -121,7 +129,7 @@ ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100) + output = qeff_model.generate(inputs=inputs, device_ids=[8, 9, 10, 11], generation_len=100) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py index ea7c09d69..a5d675c26 100644 --- a/examples/performance/compute_context_length/llama4_cb.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -19,6 +19,14 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. 
If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 4096 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [3072] @@ -33,11 +41,7 @@ kv_offload=True, config=config, continuous_batching=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) qeff_model.compile( @@ -53,6 +57,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) else: qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( @@ -60,11 +66,7 @@ attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) qeff_model.compile( @@ -79,6 +81,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) image_urls = [ diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py index 
d7c403e5f..c839a08a2 100644 --- a/examples/performance/compute_context_length/llama4_multi_image.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -17,6 +17,14 @@ config.text_config.num_hidden_layers = 4 config.vision_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. 
+ ctx_len = 8192 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [5376] @@ -28,11 +36,7 @@ attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -49,6 +53,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### Multi_image Prompt ### diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py index 96ed519f5..664e3c1c1 100644 --- a/examples/performance/compute_context_length/mistral3.py +++ b/examples/performance/compute_context_length/mistral3.py @@ -19,6 +19,7 @@ def run_model( kv_offload=False, prefill_seq_len=128, ctx_len=4096, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -37,16 +38,15 @@ def run_model( config = AutoConfig.from_pretrained(model_name) config.vision_config._attn_implementation = "eager" + # For Testing Purpose Only + config.text_config.num_hidden_layers = 4 + config.vision_config.num_hidden_layers = 2 model = QEFFAutoModelForImageTextToText.from_pretrained( model_name, kv_offload=kv_offload, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP - 2 Export & Compile the Model @@ -58,6 +58,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP - 3 Load and process the inputs for Inference @@ 
-96,6 +98,7 @@ def run_model( generation_len = 128 num_cores = 16 num_devices = 4 + ccl_enabled = True comp_ctx_lengths_prefill = [4096] comp_ctx_lengths_decode = [6144, ctx_len] @@ -106,6 +109,7 @@ def run_model( image_url=image_url, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, + ccl_enabled=ccl_enabled, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, generation_len=generation_len, diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py index f68481631..7434c62b2 100644 --- a/examples/performance/compute_context_length/molmo.py +++ b/examples/performance/compute_context_length/molmo.py @@ -18,21 +18,25 @@ # config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. 
+ # load the model ctx_len = 8192 -comp_ctx_lengths_prefill = [3072] -comp_ctx_lengths_decode = [4096, 8192] +comp_ctx_lengths_prefill = [3072] # None # +comp_ctx_lengths_decode = [4096, 8192] # None # qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, kv_offload=True, trust_remote_code=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) @@ -51,6 +55,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) inputs = processor.process(text="Tell me about yourself") @@ -74,6 +80,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py index 00f43a73f..3266be634 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl.py +++ b/examples/performance/compute_context_length/qwen2_5_vl.py @@ -21,20 +21,24 @@ config = AutoConfig.from_pretrained(model_id) # config.text_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. 
+## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 -comp_ctx_lengths_prefill = [4096] -comp_ctx_lengths_decode = [6144, ctx_len] +comp_ctx_lengths_prefill = [4096] # None # +comp_ctx_lengths_decode = [6144, ctx_len] # None # qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -59,6 +63,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -103,6 +109,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index 6954d356f..bc88c327f 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -16,7 +16,15 @@ ## For AWQ model update pytorch version to 2.8.* model_id = "Qwen/Qwen2.5-VL-32B-Instruct" config = 
AutoConfig.from_pretrained(model_id) -# config.text_config.num_hidden_layers = 2 +config.text_config.num_hidden_layers = 4 + +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. 
ctx_len = 8192 comp_ctx_lengths_prefill = [4096] @@ -28,11 +36,7 @@ kv_offload=True, config=config, continuous_batching=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -52,6 +56,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) image_urls = [ @@ -75,6 +81,7 @@ processor=processor, images=image_urls, generation_len=100, + device_ids=[28,29,30,31], ) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) diff --git a/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py b/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py index d2fa208df..9fb4c4d43 100644 --- a/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py +++ b/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py @@ -16,21 +16,25 @@ # We will use prompt_len=1 for compilation for both cb and non-cb inference """ +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. 
+## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 1024 prefill_seq_len = 1 -# In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same specializations. -comp_ctx_lengths_prefill = [256, 512, ctx_len] -comp_ctx_lengths_decode = [256, 512, ctx_len] +# In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations. +comp_ctx_lengths_prefill = [256, 512, ctx_len] # None # +comp_ctx_lengths_decode = [256, 512, ctx_len] # None # model = QEFFAutoModelForCausalLM.from_pretrained( model_name, continuous_batching=False, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - "prefill_seq_len": prefill_seq_len, - }, + ccl_enabled=True, + num_hidden_layers=4, ) model.compile( @@ -42,6 +46,8 @@ mxfp6_matmul=True, mxint8_kv_cache=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) # mos=1, tokenizer = AutoTokenizer.from_pretrained(model_name) From da186593860a3240c829dd0dfc81596e92b00d37 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Wed, 19 Nov 2025 16:17:21 -0800 Subject: [PATCH 02/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- examples/performance/compute_context_length/gpt_oss.py | 6 +++--- examples/performance/compute_context_length/llama4_cb.py | 2 +- .../compute_context_length/llama4_multi_image.py | 2 +- 
.../performance/compute_context_length/qwen2_5_vl_cb.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index 3b3ce179d..ff5334d4e 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -15,16 +15,16 @@ ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. -## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. ctx_len = 4096 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists. 
# Set the list of ccl during prefilling process -comp_ctx_lengths_prefill = [512, ctx_len] #None # +comp_ctx_lengths_prefill = [512, ctx_len] # None # # Set the list of ccl during decoding process -comp_ctx_lengths_decode = [512, ctx_len] #None # +comp_ctx_lengths_decode = [512, ctx_len] # None # qeff_model = QEFFAutoModelForCausalLM.from_pretrained( diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py index a5d675c26..98653080c 100644 --- a/examples/performance/compute_context_length/llama4_cb.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -23,7 +23,7 @@ ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. -## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. 
diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py index c839a08a2..0fe8ffb78 100644 --- a/examples/performance/compute_context_length/llama4_multi_image.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -21,7 +21,7 @@ ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. -## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index bc88c327f..75fece6db 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -22,7 +22,7 @@ ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. 
## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. -## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. @@ -81,7 +81,7 @@ processor=processor, images=image_urls, generation_len=100, - device_ids=[28,29,30,31], + device_ids=[28, 29, 30, 31], ) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) From 70203fdbd5cfa3fb3d2e53d8d1fdb74b860586b5 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Wed, 19 Nov 2025 22:06:06 -0800 Subject: [PATCH 03/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- QEfficient/transformers/models/modeling_auto.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 8a4578737..dfa64270d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -941,6 +941,7 @@ def __init__( self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs) self.continuous_batching = continuous_batching self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.input_shapes, self.output_names = None, None @property @@ -1183,7 +1184,6 @@ def compile( output_names = self.model.get_output_names(kv_offload=True) # if ccl_enabled is True read 
Compute-Context-Length lists - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.ccl_enabled: if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: logger.warning( @@ -1662,6 +1662,7 @@ def __init__( self.model.config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.model.qaic_config is not None and self.model.qaic_config.get("num_kv_blocks", None) is not None: BlockedKVAttentionTransform.apply(self.model, num_kv_blocks=self.model.qaic_config.get("num_kv_blocks")) @@ -1828,7 +1829,6 @@ def compile( output_names = self.model.get_output_names() # if ccl_enabled is True read Compute-Context-Length lists - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.ccl_enabled: if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: logger.warning( @@ -2417,6 +2417,7 @@ def __init__( self.hash_params["qeff_auto_class"] = self.__class__.__name__ self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms @@ -2957,7 +2958,6 @@ def compile( """ # if ccl_enabled is True read Compute-Context-Length lists - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.ccl_enabled: if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: logger.warning( From 67181e5582dee63f47b3bd5d60cb36acbfadd288 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Sun, 23 Nov 2025 17:49:03 -0800 Subject: [PATCH 04/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- examples/performance/README.md | 50 ++ .../compute_context_length/README.md | 5 +- .../compute_context_length/basic_inference.py | 14 +- 
.../compute_context_length/gemma3.py | 4 +- .../gemma3/fp32_nodes_gemma3_27b.yaml | 685 +++++++++++++++++ .../gemma3/fp32_nodes_gemma3_4b.yaml | 698 ++++++++++++++++++ .../compute_context_length/qwen3moe.py | 54 ++ .../compute_context_length/vlm_inference.py | 15 +- 8 files changed, 1512 insertions(+), 13 deletions(-) create mode 100755 examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml create mode 100755 examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml create mode 100644 examples/performance/compute_context_length/qwen3moe.py diff --git a/examples/performance/README.md b/examples/performance/README.md index 48d34d972..9308ce6db 100644 --- a/examples/performance/README.md +++ b/examples/performance/README.md @@ -95,6 +95,56 @@ python on_device_sampling.py \ --top-p 0.89 ``` +### Compute-Context-Length + +Compute-Context-Length (CCL) computes the context length dynamically during inference to get the best performance within each context-length window. + +#### compute_context_length/basic_inference.py +Configure the CCL parameters: 1) `--ccl-enabled`: activates the CCL feature, 2) `--comp-ctx-lengths-prefill`: list of context lengths to use during prefilling, and 3) `--comp-ctx-lengths-decode`: list of context lengths to use during decoding. 
+ +**Usage for Text-only models:** +```bash +python compute_context_length/basic_inference.py \ + --model-name meta-llama/Llama-3.1-8B \ + --num-cores 16 \ + --prefill-seq-len 32 \ + --ctx-len 1024 \ + --ccl-enabled \ + --comp-ctx-lengths-prefill 500,1000 \ + --comp-ctx-lengths-decode 512,1024 +``` + +**Usage for VLM models such as mllama and llava:** +```bash +python compute_context_length/vlm_inference.py \ + --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \ + --hf-token "" \ + --num-cores 16 \ + --prefill-seq-len 32 \ + --ctx-len 8192 \ + --img-size 560 \ + --ccl-enabled \ + --comp-ctx-lengths-prefill 4096 \ + --comp-ctx-lengths-decode 6144,8192 +``` + +**Usage with other MoE and Multimodal models:** +For the other models in the compute_context_length directory (gemma3, gpt_oss, granite_vision, internvl, llama4_cb, llama4_multi_image, llama4, mistral3, molmo, qwen2_5_vl, qwen2_5_vl_cb, and qwen3moe), use the corresponding inference script and change only the model name and CCL configuration inside that script. Each script is run as follows: +```bash +python compute_context_length/gemma3.py +python compute_context_length/gpt_oss.py +python compute_context_length/granite_vision.py +python compute_context_length/internvl.py +python compute_context_length/llama4_cb.py +python compute_context_length/llama4_multi_image.py +python compute_context_length/llama4.py +python compute_context_length/mistral3.py +python compute_context_length/molmo.py +python compute_context_length/qwen2_5_vl.py +python compute_context_length/qwen2_5_vl_cb.py +python compute_context_length/qwen3moe.py +``` + ## Performance Tips 1. 
**Speculative Decoding**: Best for long-form generation where draft model is much faster than target diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md index bbc240645..9f1d29b9a 100644 --- a/examples/performance/compute_context_length/README.md +++ b/examples/performance/compute_context_length/README.md @@ -68,7 +68,7 @@ python vlm_inference.py \ Basic CCL usage with text-only language models. **Supported Models:** -- Llama (3.2, 3.3) +- Llama (3.2, 3.3, swiftkv) - Gemma/Gemma-2 - Mistral - Phi/Phi-3 @@ -77,6 +77,9 @@ Basic CCL usage with text-only language models. - GPT-2, GPT-J - CodeGen - OLMo-2 +- Mistral/Mixtral +- Qwen2 +- Falcon **Command-Line Arguments:** - `--model-name`: HuggingFace model ID (default: meta-llama/Llama-3.2-1B) diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py index a4407b05a..425d038d9 100644 --- a/examples/performance/compute_context_length/basic_inference.py +++ b/examples/performance/compute_context_length/basic_inference.py @@ -46,6 +46,11 @@ def main(): default=1024, help="Maximum context length", ) + parser.add_argument( + "--ccl-enabled", + action="store_true", + help="Enable compute-context-length (CCL) feature", + ) parser.add_argument( "--comp-ctx-lengths-prefill", type=lambda x: [int(i) for i in x.split(",")], @@ -112,11 +117,7 @@ def main(): model = QEFFAutoModelForCausalLM.from_pretrained( args.model_name, continuous_batching=args.continuous_batching, - qaic_config={ - "comp_ctx_lengths_prefill": args.comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": args.comp_ctx_lengths_decode, - "ctx_len": args.ctx_len, # Required for CCL validation - }, + ccl_enabled=args.ccl_enabled, ) # Compile the model @@ -132,6 +133,9 @@ def main(): if args.continuous_batching: compile_kwargs["full_batch_size"] = args.full_batch_size + if args.ccl_enabled: + 
compile_kwargs["comp_ctx_lengths_prefill"] = args.comp_ctx_lengths_prefill + compile_kwargs["comp_ctx_lengths_decode"] = args.comp_ctx_lengths_decode qpc_path = model.compile(**compile_kwargs) print(f"Model compiled successfully to: {qpc_path}") diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index 14d9e59ca..830e2ce43 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -58,7 +58,7 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_4b.yaml", + node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) @@ -96,7 +96,7 @@ mxint8_kv_cache=False, aic_enable_depth_first=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_4b.yaml", + node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) diff --git a/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml new file mode 100755 index 000000000..d2a4bf164 --- /dev/null +++ b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml @@ -0,0 +1,685 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.0/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - 
/language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/Add_3_output_0 + - /language_model/layers.3/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.5/Add_2_output_0 + - /language_model/layers.5/Add_3_output_0 + - /language_model/layers.5/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.11/Add_2_output_0 + - /language_model/layers.11/Add_3_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/Add_3_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/Add_2_output_0 + - 
/language_model/layers.13/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - /language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.17/Add_2_output_0 + - /language_model/layers.17/Add_3_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/Add_2_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.23/Add_2_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.24/Add_output_0 + 
- /language_model/layers.25/Add_1_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/Add_2_output_0 + - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.29/Add_2_output_0 + - /language_model/layers.29/Add_3_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.34/Add_1_output_0 + - /language_model/layers.34/Add_2_output_0 + - /language_model/layers.34/Add_3_output_0 + - /language_model/layers.34/Add_output_0 + - /language_model/layers.35/Add_1_output_0 + - /language_model/layers.35/Add_2_output_0 + - /language_model/layers.35/Add_3_output_0 + - /language_model/layers.35/Add_output_0 + - 
/language_model/layers.36/Add_1_output_0 + - /language_model/layers.36/Add_2_output_0 + - /language_model/layers.36/Add_3_output_0 + - /language_model/layers.36/Add_output_0 + - /language_model/layers.37/Add_1_output_0 + - /language_model/layers.37/Add_2_output_0 + - /language_model/layers.37/Add_3_output_0 + - /language_model/layers.37/Add_output_0 + - /language_model/layers.38/Add_1_output_0 + - /language_model/layers.38/Add_2_output_0 + - /language_model/layers.38/Add_3_output_0 + - /language_model/layers.38/Add_output_0 + - /language_model/layers.39/Add_1_output_0 + - /language_model/layers.39/Add_2_output_0 + - /language_model/layers.39/Add_3_output_0 + - /language_model/layers.39/Add_output_0 + - /language_model/layers.40/Add_1_output_0 + - /language_model/layers.40/Add_2_output_0 + - /language_model/layers.40/Add_3_output_0 + - /language_model/layers.40/Add_output_0 + - /language_model/layers.41/Add_1_output_0 + - /language_model/layers.41/Add_2_output_0 + - /language_model/layers.41/Add_3_output_0 + - /language_model/layers.41/Add_output_0 + - /language_model/layers.42/Add_1_output_0 + - /language_model/layers.42/Add_2_output_0 + - /language_model/layers.42/Add_3_output_0 + - /language_model/layers.42/Add_output_0 + - /language_model/layers.43/Add_1_output_0 + - /language_model/layers.43/Add_2_output_0 + - /language_model/layers.43/Add_3_output_0 + - /language_model/layers.43/Add_output_0 + - /language_model/layers.44/Add_1_output_0 + - /language_model/layers.44/Add_2_output_0 + - /language_model/layers.44/Add_3_output_0 + - /language_model/layers.44/Add_output_0 + - /language_model/layers.45/Add_1_output_0 + - /language_model/layers.45/Add_2_output_0 + - /language_model/layers.45/Add_3_output_0 + - /language_model/layers.45/Add_output_0 + - /language_model/layers.46/Add_1_output_0 + - /language_model/layers.46/Add_2_output_0 + - /language_model/layers.46/Add_3_output_0 + - /language_model/layers.46/Add_output_0 + - /language_model/layers.47/Add_1_output_0 
+ - /language_model/layers.47/Add_2_output_0 + - /language_model/layers.47/Add_3_output_0 + - /language_model/layers.47/Add_output_0 + - /language_model/layers.48/Add_1_output_0 + - /language_model/layers.48/Add_2_output_0 + - /language_model/layers.48/Add_3_output_0 + - /language_model/layers.48/Add_output_0 + - /language_model/layers.49/Add_1_output_0 + - /language_model/layers.49/Add_2_output_0 + - /language_model/layers.49/Add_3_output_0 + - /language_model/layers.49/Add_output_0 + - /language_model/layers.50/Add_1_output_0 + - /language_model/layers.50/Add_2_output_0 + - /language_model/layers.50/Add_3_output_0 + - /language_model/layers.50/Add_output_0 + - /language_model/layers.51/Add_1_output_0 + - /language_model/layers.51/Add_2_output_0 + - /language_model/layers.51/Add_3_output_0 + - /language_model/layers.51/Add_output_0 + - /language_model/layers.52/Add_1_output_0 + - /language_model/layers.52/Add_2_output_0 + - /language_model/layers.52/Add_3_output_0 + - /language_model/layers.52/Add_output_0 + - /language_model/layers.53/Add_1_output_0 + - /language_model/layers.53/Add_2_output_0 + - /language_model/layers.53/Add_3_output_0 + - /language_model/layers.53/Add_output_0 + - /language_model/layers.54/Add_1_output_0 + - /language_model/layers.54/Add_2_output_0 + - /language_model/layers.54/Add_3_output_0 + - /language_model/layers.54/Add_output_0 + - /language_model/layers.55/Add_1_output_0 + - /language_model/layers.55/Add_2_output_0 + - /language_model/layers.55/Add_3_output_0 + - /language_model/layers.55/Add_output_0 + - /language_model/layers.56/Add_1_output_0 + - /language_model/layers.56/Add_2_output_0 + - /language_model/layers.56/Add_3_output_0 + - /language_model/layers.56/Add_output_0 + - /language_model/layers.57/Add_1_output_0 + - /language_model/layers.57/Add_2_output_0 + - /language_model/layers.57/Add_3_output_0 + - /language_model/layers.57/Add_output_0 + - /language_model/layers.58/Add_1_output_0 + - 
/language_model/layers.58/Add_2_output_0 + - /language_model/layers.58/Add_3_output_0 + - /language_model/layers.58/Add_output_0 + - /language_model/layers.59/Add_1_output_0 + - /language_model/layers.59/Add_2_output_0 + - /language_model/layers.59/Add_3_output_0 + - /language_model/layers.59/Add_output_0 + - /language_model/layers.60/Add_1_output_0 + - /language_model/layers.60/Add_2_output_0 + - /language_model/layers.60/Add_3_output_0 + - /language_model/layers.60/Add_output_0 + - /language_model/layers.61/Add_1_output_0 + - /language_model/layers.61/Add_2_output_0 + - /language_model/layers.61/Add_3_output_0 + - /language_model/layers.61/Add_output_0 + - /language_model/norm/Add_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - 
/language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - /language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.34/self_attn/Mul_output_0 + - /language_model/layers.35/self_attn/Mul_output_0 + - /language_model/layers.36/self_attn/Mul_output_0 + - /language_model/layers.37/self_attn/Mul_output_0 + - /language_model/layers.38/self_attn/Mul_output_0 + - /language_model/layers.39/self_attn/Mul_output_0 + - /language_model/layers.40/self_attn/Mul_output_0 + - /language_model/layers.41/self_attn/Mul_output_0 + - /language_model/layers.42/self_attn/Mul_output_0 + - /language_model/layers.43/self_attn/Mul_output_0 + - /language_model/layers.44/self_attn/Mul_output_0 + - /language_model/layers.45/self_attn/Mul_output_0 + - /language_model/layers.46/self_attn/Mul_output_0 + - /language_model/layers.47/self_attn/Mul_output_0 + - /language_model/layers.48/self_attn/Mul_output_0 + - /language_model/layers.49/self_attn/Mul_output_0 + - /language_model/layers.50/self_attn/Mul_output_0 + - /language_model/layers.51/self_attn/Mul_output_0 + - /language_model/layers.52/self_attn/Mul_output_0 + - /language_model/layers.53/self_attn/Mul_output_0 + - /language_model/layers.54/self_attn/Mul_output_0 + - /language_model/layers.55/self_attn/Mul_output_0 + - /language_model/layers.56/self_attn/Mul_output_0 + - /language_model/layers.57/self_attn/Mul_output_0 + - /language_model/layers.58/self_attn/Mul_output_0 + - /language_model/layers.59/self_attn/Mul_output_0 + - /language_model/layers.60/self_attn/Mul_output_0 + - /language_model/layers.61/self_attn/Mul_output_0 + - 
/language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 
+ - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.35/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.39/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.44/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.45/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.48/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/q_norm/CustomRMSNorm_output_0 
+ - /language_model/layers.53/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.53/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.57/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.61/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + diff --git a/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml new file mode 100755 index 000000000..1c8aa1c41 --- /dev/null +++ b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml @@ -0,0 +1,698 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_output_0 + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.3/Add_output_0 + - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_output_0 + - 
/language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.8/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_2_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - /language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_2_output_0 + - 
/language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.24/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.25/Add_1_output_0 + - 
/language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_2_output_0 + - /language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_3_output_0 + - /language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.0/self_attn/Mul_1_output_0 + - /language_model/layers.0/self_attn/Mul_2_output_0 + - /language_model/layers.0/self_attn/Mul_3_output_0 + - /language_model/layers.0/self_attn/Mul_4_output_0 + - /language_model/layers.0/self_attn/Mul_5_output_0 + - /language_model/layers.0/self_attn/Mul_6_output_0 + - /language_model/layers.0/self_attn/Mul_7_output_0 + - 
/language_model/layers.0/self_attn/Mul_8_output_0 + - /language_model/layers.1/self_attn/Mul_9_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_1_output_0 + - /language_model/layers.2/self_attn/Mul_2_output_0 + - /language_model/layers.2/self_attn/Mul_3_output_0 + - /language_model/layers.2/self_attn/Mul_4_output_0 + - /language_model/layers.2/self_attn/Mul_5_output_0 + - /language_model/layers.2/self_attn/Mul_6_output_0 + - /language_model/layers.2/self_attn/Mul_7_output_0 + - /language_model/layers.2/self_attn/Mul_8_output_0 + - /language_model/layers.2/self_attn/Mul_9_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_1_output_0 + - /language_model/layers.3/self_attn/Mul_2_output_0 + - /language_model/layers.3/self_attn/Mul_3_output_0 + - /language_model/layers.3/self_attn/Mul_4_output_0 + - /language_model/layers.3/self_attn/Mul_5_output_0 + - /language_model/layers.3/self_attn/Mul_6_output_0 + - /language_model/layers.3/self_attn/Mul_7_output_0 + - /language_model/layers.3/self_attn/Mul_8_output_0 + - /language_model/layers.3/self_attn/Mul_9_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_1_output_0 + - /language_model/layers.4/self_attn/Mul_2_output_0 + - /language_model/layers.4/self_attn/Mul_3_output_0 + - /language_model/layers.4/self_attn/Mul_4_output_0 + - /language_model/layers.4/self_attn/Mul_5_output_0 + - /language_model/layers.4/self_attn/Mul_6_output_0 + - /language_model/layers.4/self_attn/Mul_7_output_0 + - /language_model/layers.4/self_attn/Mul_8_output_0 + - /language_model/layers.4/self_attn/Mul_9_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_1_output_0 + - /language_model/layers.5/self_attn/Mul_2_output_0 + - /language_model/layers.5/self_attn/Mul_3_output_0 + - /language_model/layers.5/self_attn/Mul_4_output_0 + - 
/language_model/layers.5/self_attn/Mul_5_output_0 + - /language_model/layers.5/self_attn/Mul_6_output_0 + - /language_model/layers.5/self_attn/Mul_7_output_0 + - /language_model/layers.5/self_attn/Mul_8_output_0 + - /language_model/layers.5/self_attn/Mul_9_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_1_output_0 + - /language_model/layers.6/self_attn/Mul_2_output_0 + - /language_model/layers.6/self_attn/Mul_3_output_0 + - /language_model/layers.6/self_attn/Mul_4_output_0 + - /language_model/layers.6/self_attn/Mul_5_output_0 + - /language_model/layers.6/self_attn/Mul_6_output_0 + - /language_model/layers.6/self_attn/Mul_7_output_0 + - /language_model/layers.6/self_attn/Mul_8_output_0 + - /language_model/layers.6/self_attn/Mul_9_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_1_output_0 + - /language_model/layers.7/self_attn/Mul_2_output_0 + - /language_model/layers.7/self_attn/Mul_3_output_0 + - /language_model/layers.7/self_attn/Mul_4_output_0 + - /language_model/layers.7/self_attn/Mul_5_output_0 + - /language_model/layers.7/self_attn/Mul_6_output_0 + - /language_model/layers.7/self_attn/Mul_7_output_0 + - /language_model/layers.7/self_attn/Mul_8_output_0 + - /language_model/layers.7/self_attn/Mul_9_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_1_output_0 + - /language_model/layers.8/self_attn/Mul_2_output_0 + - /language_model/layers.8/self_attn/Mul_3_output_0 + - /language_model/layers.8/self_attn/Mul_4_output_0 + - /language_model/layers.8/self_attn/Mul_5_output_0 + - /language_model/layers.8/self_attn/Mul_6_output_0 + - /language_model/layers.8/self_attn/Mul_7_output_0 + - /language_model/layers.8/self_attn/Mul_8_output_0 + - /language_model/layers.8/self_attn/Mul_9_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_1_output_0 + - 
/language_model/layers.9/self_attn/Mul_2_output_0 + - /language_model/layers.9/self_attn/Mul_3_output_0 + - /language_model/layers.9/self_attn/Mul_4_output_0 + - /language_model/layers.9/self_attn/Mul_5_output_0 + - /language_model/layers.9/self_attn/Mul_6_output_0 + - /language_model/layers.9/self_attn/Mul_7_output_0 + - /language_model/layers.9/self_attn/Mul_8_output_0 + - /language_model/layers.9/self_attn/Mul_9_output_0 + - /language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_1_output_0 + - /language_model/layers.10/self_attn/Mul_2_output_0 + - /language_model/layers.10/self_attn/Mul_3_output_0 + - /language_model/layers.10/self_attn/Mul_4_output_0 + - /language_model/layers.10/self_attn/Mul_5_output_0 + - /language_model/layers.10/self_attn/Mul_6_output_0 + - /language_model/layers.10/self_attn/Mul_7_output_0 + - /language_model/layers.10/self_attn/Mul_8_output_0 + - /language_model/layers.10/self_attn/Mul_9_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_1_output_0 + - /language_model/layers.11/self_attn/Mul_2_output_0 + - /language_model/layers.11/self_attn/Mul_3_output_0 + - /language_model/layers.11/self_attn/Mul_4_output_0 + - /language_model/layers.11/self_attn/Mul_5_output_0 + - /language_model/layers.11/self_attn/Mul_6_output_0 + - /language_model/layers.11/self_attn/Mul_7_output_0 + - /language_model/layers.11/self_attn/Mul_8_output_0 + - /language_model/layers.11/self_attn/Mul_9_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_1_output_0 + - /language_model/layers.12/self_attn/Mul_2_output_0 + - /language_model/layers.12/self_attn/Mul_3_output_0 + - /language_model/layers.12/self_attn/Mul_4_output_0 + - /language_model/layers.12/self_attn/Mul_5_output_0 + - /language_model/layers.12/self_attn/Mul_6_output_0 + - /language_model/layers.12/self_attn/Mul_7_output_0 + - 
/language_model/layers.12/self_attn/Mul_8_output_0 + - /language_model/layers.12/self_attn/Mul_9_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_1_output_0 + - /language_model/layers.13/self_attn/Mul_2_output_0 + - /language_model/layers.13/self_attn/Mul_3_output_0 + - /language_model/layers.13/self_attn/Mul_4_output_0 + - /language_model/layers.13/self_attn/Mul_5_output_0 + - /language_model/layers.13/self_attn/Mul_6_output_0 + - /language_model/layers.13/self_attn/Mul_7_output_0 + - /language_model/layers.13/self_attn/Mul_8_output_0 + - /language_model/layers.13/self_attn/Mul_9_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_1_output_0 + - /language_model/layers.14/self_attn/Mul_2_output_0 + - /language_model/layers.14/self_attn/Mul_3_output_0 + - /language_model/layers.14/self_attn/Mul_4_output_0 + - /language_model/layers.14/self_attn/Mul_5_output_0 + - /language_model/layers.14/self_attn/Mul_6_output_0 + - /language_model/layers.14/self_attn/Mul_7_output_0 + - /language_model/layers.14/self_attn/Mul_8_output_0 + - /language_model/layers.14/self_attn/Mul_9_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_1_output_0 + - /language_model/layers.15/self_attn/Mul_2_output_0 + - /language_model/layers.15/self_attn/Mul_3_output_0 + - /language_model/layers.15/self_attn/Mul_4_output_0 + - /language_model/layers.15/self_attn/Mul_5_output_0 + - /language_model/layers.15/self_attn/Mul_6_output_0 + - /language_model/layers.15/self_attn/Mul_7_output_0 + - /language_model/layers.15/self_attn/Mul_8_output_0 + - /language_model/layers.15/self_attn/Mul_9_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_1_output_0 + - /language_model/layers.16/self_attn/Mul_2_output_0 + - /language_model/layers.16/self_attn/Mul_3_output_0 + - 
/language_model/layers.16/self_attn/Mul_4_output_0 + - /language_model/layers.16/self_attn/Mul_5_output_0 + - /language_model/layers.16/self_attn/Mul_6_output_0 + - /language_model/layers.16/self_attn/Mul_7_output_0 + - /language_model/layers.16/self_attn/Mul_8_output_0 + - /language_model/layers.16/self_attn/Mul_9_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_1_output_0 + - /language_model/layers.17/self_attn/Mul_2_output_0 + - /language_model/layers.17/self_attn/Mul_3_output_0 + - /language_model/layers.17/self_attn/Mul_4_output_0 + - /language_model/layers.17/self_attn/Mul_5_output_0 + - /language_model/layers.17/self_attn/Mul_6_output_0 + - /language_model/layers.17/self_attn/Mul_7_output_0 + - /language_model/layers.17/self_attn/Mul_8_output_0 + - /language_model/layers.17/self_attn/Mul_9_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_1_output_0 + - /language_model/layers.18/self_attn/Mul_2_output_0 + - /language_model/layers.18/self_attn/Mul_3_output_0 + - /language_model/layers.18/self_attn/Mul_4_output_0 + - /language_model/layers.18/self_attn/Mul_5_output_0 + - /language_model/layers.18/self_attn/Mul_6_output_0 + - /language_model/layers.18/self_attn/Mul_7_output_0 + - /language_model/layers.18/self_attn/Mul_8_output_0 + - /language_model/layers.18/self_attn/Mul_9_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_1_output_0 + - /language_model/layers.19/self_attn/Mul_2_output_0 + - /language_model/layers.19/self_attn/Mul_3_output_0 + - /language_model/layers.19/self_attn/Mul_4_output_0 + - /language_model/layers.19/self_attn/Mul_5_output_0 + - /language_model/layers.19/self_attn/Mul_6_output_0 + - /language_model/layers.19/self_attn/Mul_7_output_0 + - /language_model/layers.19/self_attn/Mul_8_output_0 + - /language_model/layers.19/self_attn/Mul_9_output_0 + - 
/language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_1_output_0 + - /language_model/layers.20/self_attn/Mul_2_output_0 + - /language_model/layers.20/self_attn/Mul_3_output_0 + - /language_model/layers.20/self_attn/Mul_4_output_0 + - /language_model/layers.20/self_attn/Mul_5_output_0 + - /language_model/layers.20/self_attn/Mul_6_output_0 + - /language_model/layers.20/self_attn/Mul_7_output_0 + - /language_model/layers.20/self_attn/Mul_8_output_0 + - /language_model/layers.20/self_attn/Mul_9_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_1_output_0 + - /language_model/layers.21/self_attn/Mul_2_output_0 + - /language_model/layers.21/self_attn/Mul_3_output_0 + - /language_model/layers.21/self_attn/Mul_4_output_0 + - /language_model/layers.21/self_attn/Mul_5_output_0 + - /language_model/layers.21/self_attn/Mul_6_output_0 + - /language_model/layers.21/self_attn/Mul_7_output_0 + - /language_model/layers.21/self_attn/Mul_8_output_0 + - /language_model/layers.21/self_attn/Mul_9_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_1_output_0 + - /language_model/layers.22/self_attn/Mul_2_output_0 + - /language_model/layers.22/self_attn/Mul_3_output_0 + - /language_model/layers.22/self_attn/Mul_4_output_0 + - /language_model/layers.22/self_attn/Mul_5_output_0 + - /language_model/layers.22/self_attn/Mul_6_output_0 + - /language_model/layers.22/self_attn/Mul_7_output_0 + - /language_model/layers.22/self_attn/Mul_8_output_0 + - /language_model/layers.22/self_attn/Mul_9_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_1_output_0 + - /language_model/layers.23/self_attn/Mul_2_output_0 + - /language_model/layers.23/self_attn/Mul_3_output_0 + - /language_model/layers.23/self_attn/Mul_4_output_0 + - /language_model/layers.23/self_attn/Mul_5_output_0 + - 
/language_model/layers.23/self_attn/Mul_6_output_0 + - /language_model/layers.23/self_attn/Mul_7_output_0 + - /language_model/layers.23/self_attn/Mul_8_output_0 + - /language_model/layers.23/self_attn/Mul_9_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_1_output_0 + - /language_model/layers.24/self_attn/Mul_2_output_0 + - /language_model/layers.24/self_attn/Mul_3_output_0 + - /language_model/layers.24/self_attn/Mul_4_output_0 + - /language_model/layers.24/self_attn/Mul_5_output_0 + - /language_model/layers.24/self_attn/Mul_6_output_0 + - /language_model/layers.24/self_attn/Mul_7_output_0 + - /language_model/layers.24/self_attn/Mul_8_output_0 + - /language_model/layers.24/self_attn/Mul_9_output_0 + - /language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.25/self_attn/Mul_1_output_0 + - /language_model/layers.25/self_attn/Mul_2_output_0 + - /language_model/layers.25/self_attn/Mul_3_output_0 + - /language_model/layers.25/self_attn/Mul_4_output_0 + - /language_model/layers.25/self_attn/Mul_5_output_0 + - /language_model/layers.25/self_attn/Mul_6_output_0 + - /language_model/layers.25/self_attn/Mul_7_output_0 + - /language_model/layers.25/self_attn/Mul_8_output_0 + - /language_model/layers.25/self_attn/Mul_9_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_1_output_0 + - /language_model/layers.26/self_attn/Mul_2_output_0 + - /language_model/layers.26/self_attn/Mul_3_output_0 + - /language_model/layers.26/self_attn/Mul_4_output_0 + - /language_model/layers.26/self_attn/Mul_5_output_0 + - /language_model/layers.26/self_attn/Mul_6_output_0 + - /language_model/layers.26/self_attn/Mul_7_output_0 + - /language_model/layers.26/self_attn/Mul_8_output_0 + - /language_model/layers.26/self_attn/Mul_9_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_1_output_0 + - 
/language_model/layers.27/self_attn/Mul_2_output_0 + - /language_model/layers.27/self_attn/Mul_3_output_0 + - /language_model/layers.27/self_attn/Mul_4_output_0 + - /language_model/layers.27/self_attn/Mul_5_output_0 + - /language_model/layers.27/self_attn/Mul_6_output_0 + - /language_model/layers.27/self_attn/Mul_7_output_0 + - /language_model/layers.27/self_attn/Mul_8_output_0 + - /language_model/layers.27/self_attn/Mul_9_output_0 + - /language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_1_output_0 + - /language_model/layers.28/self_attn/Mul_2_output_0 + - /language_model/layers.28/self_attn/Mul_3_output_0 + - /language_model/layers.28/self_attn/Mul_4_output_0 + - /language_model/layers.28/self_attn/Mul_5_output_0 + - /language_model/layers.28/self_attn/Mul_6_output_0 + - /language_model/layers.28/self_attn/Mul_7_output_0 + - /language_model/layers.28/self_attn/Mul_8_output_0 + - /language_model/layers.28/self_attn/Mul_9_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_1_output_0 + - /language_model/layers.29/self_attn/Mul_2_output_0 + - /language_model/layers.29/self_attn/Mul_3_output_0 + - /language_model/layers.29/self_attn/Mul_4_output_0 + - /language_model/layers.29/self_attn/Mul_5_output_0 + - /language_model/layers.29/self_attn/Mul_6_output_0 + - /language_model/layers.29/self_attn/Mul_7_output_0 + - /language_model/layers.29/self_attn/Mul_8_output_0 + - /language_model/layers.29/self_attn/Mul_9_output_0 + - /language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.30/self_attn/Mul_1_output_0 + - /language_model/layers.30/self_attn/Mul_2_output_0 + - /language_model/layers.30/self_attn/Mul_3_output_0 + - /language_model/layers.30/self_attn/Mul_4_output_0 + - /language_model/layers.30/self_attn/Mul_5_output_0 + - /language_model/layers.30/self_attn/Mul_6_output_0 + - /language_model/layers.30/self_attn/Mul_7_output_0 + - 
/language_model/layers.30/self_attn/Mul_8_output_0 + - /language_model/layers.30/self_attn/Mul_9_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_1_output_0 + - /language_model/layers.31/self_attn/Mul_2_output_0 + - /language_model/layers.31/self_attn/Mul_3_output_0 + - /language_model/layers.31/self_attn/Mul_4_output_0 + - /language_model/layers.31/self_attn/Mul_5_output_0 + - /language_model/layers.31/self_attn/Mul_6_output_0 + - /language_model/layers.31/self_attn/Mul_7_output_0 + - /language_model/layers.31/self_attn/Mul_8_output_0 + - /language_model/layers.31/self_attn/Mul_9_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_1_output_0 + - /language_model/layers.32/self_attn/Mul_2_output_0 + - /language_model/layers.32/self_attn/Mul_3_output_0 + - /language_model/layers.32/self_attn/Mul_4_output_0 + - /language_model/layers.32/self_attn/Mul_5_output_0 + - /language_model/layers.32/self_attn/Mul_6_output_0 + - /language_model/layers.32/self_attn/Mul_7_output_0 + - /language_model/layers.32/self_attn/Mul_8_output_0 + - /language_model/layers.32/self_attn/Mul_9_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_1_output_0 + - /language_model/layers.33/self_attn/Mul_2_output_0 + - /language_model/layers.33/self_attn/Mul_3_output_0 + - /language_model/layers.33/self_attn/Mul_4_output_0 + - /language_model/layers.33/self_attn/Mul_5_output_0 + - /language_model/layers.33/self_attn/Mul_6_output_0 + - /language_model/layers.33/self_attn/Mul_7_output_0 + - /language_model/layers.33/self_attn/Mul_8_output_0 + - /language_model/layers.33/self_attn/Mul_9_output_0 + - /language_model/layers.0/self_attn/Softmax_output_0 + - /language_model/layers.1/self_attn/Softmax_output_0 + - /language_model/layers.2/self_attn/Softmax_output_0 + - /language_model/layers.3/self_attn/Softmax_output_0 + - 
/language_model/layers.4/self_attn/Softmax_output_0 + - /language_model/layers.5/self_attn/Softmax_output_0 + - /language_model/layers.6/self_attn/Softmax_output_0 + - /language_model/layers.7/self_attn/Softmax_output_0 + - /language_model/layers.8/self_attn/Softmax_output_0 + - /language_model/layers.9/self_attn/Softmax_output_0 + - /language_model/layers.10/self_attn/Softmax_output_0 + - /language_model/layers.11/self_attn/Softmax_output_0 + - /language_model/layers.12/self_attn/Softmax_output_0 + - /language_model/layers.13/self_attn/Softmax_output_0 + - /language_model/layers.14/self_attn/Softmax_output_0 + - /language_model/layers.15/self_attn/Softmax_output_0 + - /language_model/layers.16/self_attn/Softmax_output_0 + - /language_model/layers.17/self_attn/Softmax_output_0 + - /language_model/layers.18/self_attn/Softmax_output_0 + - /language_model/layers.19/self_attn/Softmax_output_0 + - /language_model/layers.20/self_attn/Softmax_output_0 + - /language_model/layers.21/self_attn/Softmax_output_0 + - /language_model/layers.22/self_attn/Softmax_output_0 + - /language_model/layers.23/self_attn/Softmax_output_0 + - /language_model/layers.24/self_attn/Softmax_output_0 + - /language_model/layers.25/self_attn/Softmax_output_0 + - /language_model/layers.26/self_attn/Softmax_output_0 + - /language_model/layers.27/self_attn/Softmax_output_0 + - /language_model/layers.28/self_attn/Softmax_output_0 + - /language_model/layers.29/self_attn/Softmax_output_0 + - /language_model/layers.30/self_attn/Softmax_output_0 + - /language_model/layers.31/self_attn/Softmax_output_0 + - /language_model/layers.32/self_attn/Softmax_output_0 + - /language_model/layers.33/self_attn/Softmax_output_0 + diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py new file mode 100644 index 000000000..9fb4c4d43 --- /dev/null +++ b/examples/performance/compute_context_length/qwen3moe.py @@ -0,0 +1,54 @@ +# 
----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.constants import Constants + +model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507" +""" +# For CB inference, set continuous_batching to True and add the full_batch_size, mxfp6 and mxint8 arguments in the compile function +# We will use prompt_len=1 for compilation for both CB and non-CB inference +""" + +## Activate the Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Pass the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode arguments to compile() to provide two lists of context lengths for the prefilling and decoding processes. If they are left as None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + +ctx_len = 1024 +prefill_seq_len = 1 +# In MoE models, when compiling with prefill_seq_len=1 in non-continuous-batching mode, prefill and decode will share the same CCL specializations. 
+comp_ctx_lengths_prefill = [256, 512, ctx_len] # None # +comp_ctx_lengths_decode = [256, 512, ctx_len] # None # + +model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + continuous_batching=False, + ccl_enabled=True, + num_hidden_layers=4, +) + +model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + batch_size=1, + num_cores=16, + num_devices=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, +) +# mos=1, +tokenizer = AutoTokenizer.from_pretrained(model_name) +exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer) diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py index 0920ddf30..f9577bd2c 100644 --- a/examples/performance/compute_context_length/vlm_inference.py +++ b/examples/performance/compute_context_length/vlm_inference.py @@ -30,6 +30,7 @@ def run_model( kv_offload=True, prefill_seq_len=32, ctx_len=8192, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -76,11 +77,7 @@ def run_model( token=hf_token, attn_implementation="eager", kv_offload=kv_offload, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP 2: Export & Compile the Model @@ -93,6 +90,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) print(f"Model compiled successfully to: {qpc_path}") @@ -177,6 +176,11 @@ def main(): default=8192, help="Maximum context length", ) + parser.add_argument( + "--ccl-enabled", + action="store_true", + help="Enable compute-context-length (CCL) feature", + ) parser.add_argument( 
"--comp-ctx-lengths-prefill", type=lambda x: [int(i) for i in x.split(",")], @@ -223,6 +227,7 @@ def main(): kv_offload=args.kv_offload, prefill_seq_len=args.prefill_seq_len, ctx_len=args.ctx_len, + ccl_enabled=args.ccl_enabled, comp_ctx_lengths_prefill=args.comp_ctx_lengths_prefill, comp_ctx_lengths_decode=args.comp_ctx_lengths_decode, generation_len=args.generation_len, From d8c98ab05f651ece58b2244619eb9aeeb6edfcfc Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Sun, 23 Nov 2025 17:52:11 -0800 Subject: [PATCH 05/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- .../fp32_nodes_gemma3_27b.yaml | 685 +++++++++++++++++ .../fp32_nodes_gemma3_4b.yaml | 698 ++++++++++++++++++ 2 files changed, 1383 insertions(+) create mode 100755 examples/performance/compute_context_length/fp32_nodes_gemma3_27b.yaml create mode 100755 examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml diff --git a/examples/performance/compute_context_length/fp32_nodes_gemma3_27b.yaml b/examples/performance/compute_context_length/fp32_nodes_gemma3_27b.yaml new file mode 100755 index 000000000..d2a4bf164 --- /dev/null +++ b/examples/performance/compute_context_length/fp32_nodes_gemma3_27b.yaml @@ -0,0 +1,685 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.0/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - /language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/Add_3_output_0 + - 
/language_model/layers.3/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.5/Add_2_output_0 + - /language_model/layers.5/Add_3_output_0 + - /language_model/layers.5/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.11/Add_2_output_0 + - /language_model/layers.11/Add_3_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/Add_3_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/Add_2_output_0 + - /language_model/layers.13/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - 
/language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.17/Add_2_output_0 + - /language_model/layers.17/Add_3_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/Add_2_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.23/Add_2_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.24/Add_output_0 + - /language_model/layers.25/Add_1_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/Add_2_output_0 
+ - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.29/Add_2_output_0 + - /language_model/layers.29/Add_3_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.34/Add_1_output_0 + - /language_model/layers.34/Add_2_output_0 + - /language_model/layers.34/Add_3_output_0 + - /language_model/layers.34/Add_output_0 + - /language_model/layers.35/Add_1_output_0 + - /language_model/layers.35/Add_2_output_0 + - /language_model/layers.35/Add_3_output_0 + - /language_model/layers.35/Add_output_0 + - /language_model/layers.36/Add_1_output_0 + - /language_model/layers.36/Add_2_output_0 + - /language_model/layers.36/Add_3_output_0 + - /language_model/layers.36/Add_output_0 + - /language_model/layers.37/Add_1_output_0 + - /language_model/layers.37/Add_2_output_0 + - 
/language_model/layers.37/Add_3_output_0 + - /language_model/layers.37/Add_output_0 + - /language_model/layers.38/Add_1_output_0 + - /language_model/layers.38/Add_2_output_0 + - /language_model/layers.38/Add_3_output_0 + - /language_model/layers.38/Add_output_0 + - /language_model/layers.39/Add_1_output_0 + - /language_model/layers.39/Add_2_output_0 + - /language_model/layers.39/Add_3_output_0 + - /language_model/layers.39/Add_output_0 + - /language_model/layers.40/Add_1_output_0 + - /language_model/layers.40/Add_2_output_0 + - /language_model/layers.40/Add_3_output_0 + - /language_model/layers.40/Add_output_0 + - /language_model/layers.41/Add_1_output_0 + - /language_model/layers.41/Add_2_output_0 + - /language_model/layers.41/Add_3_output_0 + - /language_model/layers.41/Add_output_0 + - /language_model/layers.42/Add_1_output_0 + - /language_model/layers.42/Add_2_output_0 + - /language_model/layers.42/Add_3_output_0 + - /language_model/layers.42/Add_output_0 + - /language_model/layers.43/Add_1_output_0 + - /language_model/layers.43/Add_2_output_0 + - /language_model/layers.43/Add_3_output_0 + - /language_model/layers.43/Add_output_0 + - /language_model/layers.44/Add_1_output_0 + - /language_model/layers.44/Add_2_output_0 + - /language_model/layers.44/Add_3_output_0 + - /language_model/layers.44/Add_output_0 + - /language_model/layers.45/Add_1_output_0 + - /language_model/layers.45/Add_2_output_0 + - /language_model/layers.45/Add_3_output_0 + - /language_model/layers.45/Add_output_0 + - /language_model/layers.46/Add_1_output_0 + - /language_model/layers.46/Add_2_output_0 + - /language_model/layers.46/Add_3_output_0 + - /language_model/layers.46/Add_output_0 + - /language_model/layers.47/Add_1_output_0 + - /language_model/layers.47/Add_2_output_0 + - /language_model/layers.47/Add_3_output_0 + - /language_model/layers.47/Add_output_0 + - /language_model/layers.48/Add_1_output_0 + - /language_model/layers.48/Add_2_output_0 + - /language_model/layers.48/Add_3_output_0 
+ - /language_model/layers.48/Add_output_0 + - /language_model/layers.49/Add_1_output_0 + - /language_model/layers.49/Add_2_output_0 + - /language_model/layers.49/Add_3_output_0 + - /language_model/layers.49/Add_output_0 + - /language_model/layers.50/Add_1_output_0 + - /language_model/layers.50/Add_2_output_0 + - /language_model/layers.50/Add_3_output_0 + - /language_model/layers.50/Add_output_0 + - /language_model/layers.51/Add_1_output_0 + - /language_model/layers.51/Add_2_output_0 + - /language_model/layers.51/Add_3_output_0 + - /language_model/layers.51/Add_output_0 + - /language_model/layers.52/Add_1_output_0 + - /language_model/layers.52/Add_2_output_0 + - /language_model/layers.52/Add_3_output_0 + - /language_model/layers.52/Add_output_0 + - /language_model/layers.53/Add_1_output_0 + - /language_model/layers.53/Add_2_output_0 + - /language_model/layers.53/Add_3_output_0 + - /language_model/layers.53/Add_output_0 + - /language_model/layers.54/Add_1_output_0 + - /language_model/layers.54/Add_2_output_0 + - /language_model/layers.54/Add_3_output_0 + - /language_model/layers.54/Add_output_0 + - /language_model/layers.55/Add_1_output_0 + - /language_model/layers.55/Add_2_output_0 + - /language_model/layers.55/Add_3_output_0 + - /language_model/layers.55/Add_output_0 + - /language_model/layers.56/Add_1_output_0 + - /language_model/layers.56/Add_2_output_0 + - /language_model/layers.56/Add_3_output_0 + - /language_model/layers.56/Add_output_0 + - /language_model/layers.57/Add_1_output_0 + - /language_model/layers.57/Add_2_output_0 + - /language_model/layers.57/Add_3_output_0 + - /language_model/layers.57/Add_output_0 + - /language_model/layers.58/Add_1_output_0 + - /language_model/layers.58/Add_2_output_0 + - /language_model/layers.58/Add_3_output_0 + - /language_model/layers.58/Add_output_0 + - /language_model/layers.59/Add_1_output_0 + - /language_model/layers.59/Add_2_output_0 + - /language_model/layers.59/Add_3_output_0 + - 
/language_model/layers.59/Add_output_0 + - /language_model/layers.60/Add_1_output_0 + - /language_model/layers.60/Add_2_output_0 + - /language_model/layers.60/Add_3_output_0 + - /language_model/layers.60/Add_output_0 + - /language_model/layers.61/Add_1_output_0 + - /language_model/layers.61/Add_2_output_0 + - /language_model/layers.61/Add_3_output_0 + - /language_model/layers.61/Add_output_0 + - /language_model/norm/Add_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - /language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - 
/language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.34/self_attn/Mul_output_0 + - /language_model/layers.35/self_attn/Mul_output_0 + - /language_model/layers.36/self_attn/Mul_output_0 + - /language_model/layers.37/self_attn/Mul_output_0 + - /language_model/layers.38/self_attn/Mul_output_0 + - /language_model/layers.39/self_attn/Mul_output_0 + - /language_model/layers.40/self_attn/Mul_output_0 + - /language_model/layers.41/self_attn/Mul_output_0 + - /language_model/layers.42/self_attn/Mul_output_0 + - /language_model/layers.43/self_attn/Mul_output_0 + - /language_model/layers.44/self_attn/Mul_output_0 + - /language_model/layers.45/self_attn/Mul_output_0 + - /language_model/layers.46/self_attn/Mul_output_0 + - /language_model/layers.47/self_attn/Mul_output_0 + - /language_model/layers.48/self_attn/Mul_output_0 + - /language_model/layers.49/self_attn/Mul_output_0 + - /language_model/layers.50/self_attn/Mul_output_0 + - /language_model/layers.51/self_attn/Mul_output_0 + - /language_model/layers.52/self_attn/Mul_output_0 + - /language_model/layers.53/self_attn/Mul_output_0 + - /language_model/layers.54/self_attn/Mul_output_0 + - /language_model/layers.55/self_attn/Mul_output_0 + - /language_model/layers.56/self_attn/Mul_output_0 + - /language_model/layers.57/self_attn/Mul_output_0 + - /language_model/layers.58/self_attn/Mul_output_0 + - /language_model/layers.59/self_attn/Mul_output_0 + - /language_model/layers.60/self_attn/Mul_output_0 + - /language_model/layers.61/self_attn/Mul_output_0 + - /language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 
+ - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.36/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.40/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/q_norm/CustomRMSNorm_output_0 
+ - /language_model/layers.45/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.49/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.53/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.53/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.53/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.58/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + diff --git a/examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml b/examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml new file mode 100755 index 000000000..1c8aa1c41 --- /dev/null +++ 
b/examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml @@ -0,0 +1,698 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_output_0 + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.3/Add_output_0 
+ - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_output_0 + - /language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.12/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_2_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - 
/language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_2_output_0 + - /language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - 
/language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.24/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.25/Add_1_output_0 + - /language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_2_output_0 + - /language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_3_output_0 + - 
/language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - 
/language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.0/self_attn/Mul_1_output_0 + - /language_model/layers.0/self_attn/Mul_2_output_0 + - /language_model/layers.0/self_attn/Mul_3_output_0 + - /language_model/layers.0/self_attn/Mul_4_output_0 + - /language_model/layers.0/self_attn/Mul_5_output_0 + - /language_model/layers.0/self_attn/Mul_6_output_0 + - /language_model/layers.0/self_attn/Mul_7_output_0 + - /language_model/layers.0/self_attn/Mul_8_output_0 + - /language_model/layers.1/self_attn/Mul_9_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_1_output_0 + - /language_model/layers.2/self_attn/Mul_2_output_0 + - /language_model/layers.2/self_attn/Mul_3_output_0 + - /language_model/layers.2/self_attn/Mul_4_output_0 + - /language_model/layers.2/self_attn/Mul_5_output_0 + - 
/language_model/layers.2/self_attn/Mul_6_output_0 + - /language_model/layers.2/self_attn/Mul_7_output_0 + - /language_model/layers.2/self_attn/Mul_8_output_0 + - /language_model/layers.2/self_attn/Mul_9_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_1_output_0 + - /language_model/layers.3/self_attn/Mul_2_output_0 + - /language_model/layers.3/self_attn/Mul_3_output_0 + - /language_model/layers.3/self_attn/Mul_4_output_0 + - /language_model/layers.3/self_attn/Mul_5_output_0 + - /language_model/layers.3/self_attn/Mul_6_output_0 + - /language_model/layers.3/self_attn/Mul_7_output_0 + - /language_model/layers.3/self_attn/Mul_8_output_0 + - /language_model/layers.3/self_attn/Mul_9_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_1_output_0 + - /language_model/layers.4/self_attn/Mul_2_output_0 + - /language_model/layers.4/self_attn/Mul_3_output_0 + - /language_model/layers.4/self_attn/Mul_4_output_0 + - /language_model/layers.4/self_attn/Mul_5_output_0 + - /language_model/layers.4/self_attn/Mul_6_output_0 + - /language_model/layers.4/self_attn/Mul_7_output_0 + - /language_model/layers.4/self_attn/Mul_8_output_0 + - /language_model/layers.4/self_attn/Mul_9_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_1_output_0 + - /language_model/layers.5/self_attn/Mul_2_output_0 + - /language_model/layers.5/self_attn/Mul_3_output_0 + - /language_model/layers.5/self_attn/Mul_4_output_0 + - /language_model/layers.5/self_attn/Mul_5_output_0 + - /language_model/layers.5/self_attn/Mul_6_output_0 + - /language_model/layers.5/self_attn/Mul_7_output_0 + - /language_model/layers.5/self_attn/Mul_8_output_0 + - /language_model/layers.5/self_attn/Mul_9_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_1_output_0 + - /language_model/layers.6/self_attn/Mul_2_output_0 + - 
/language_model/layers.6/self_attn/Mul_3_output_0 + - /language_model/layers.6/self_attn/Mul_4_output_0 + - /language_model/layers.6/self_attn/Mul_5_output_0 + - /language_model/layers.6/self_attn/Mul_6_output_0 + - /language_model/layers.6/self_attn/Mul_7_output_0 + - /language_model/layers.6/self_attn/Mul_8_output_0 + - /language_model/layers.6/self_attn/Mul_9_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_1_output_0 + - /language_model/layers.7/self_attn/Mul_2_output_0 + - /language_model/layers.7/self_attn/Mul_3_output_0 + - /language_model/layers.7/self_attn/Mul_4_output_0 + - /language_model/layers.7/self_attn/Mul_5_output_0 + - /language_model/layers.7/self_attn/Mul_6_output_0 + - /language_model/layers.7/self_attn/Mul_7_output_0 + - /language_model/layers.7/self_attn/Mul_8_output_0 + - /language_model/layers.7/self_attn/Mul_9_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_1_output_0 + - /language_model/layers.8/self_attn/Mul_2_output_0 + - /language_model/layers.8/self_attn/Mul_3_output_0 + - /language_model/layers.8/self_attn/Mul_4_output_0 + - /language_model/layers.8/self_attn/Mul_5_output_0 + - /language_model/layers.8/self_attn/Mul_6_output_0 + - /language_model/layers.8/self_attn/Mul_7_output_0 + - /language_model/layers.8/self_attn/Mul_8_output_0 + - /language_model/layers.8/self_attn/Mul_9_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_1_output_0 + - /language_model/layers.9/self_attn/Mul_2_output_0 + - /language_model/layers.9/self_attn/Mul_3_output_0 + - /language_model/layers.9/self_attn/Mul_4_output_0 + - /language_model/layers.9/self_attn/Mul_5_output_0 + - /language_model/layers.9/self_attn/Mul_6_output_0 + - /language_model/layers.9/self_attn/Mul_7_output_0 + - /language_model/layers.9/self_attn/Mul_8_output_0 + - /language_model/layers.9/self_attn/Mul_9_output_0 + - 
/language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_1_output_0 + - /language_model/layers.10/self_attn/Mul_2_output_0 + - /language_model/layers.10/self_attn/Mul_3_output_0 + - /language_model/layers.10/self_attn/Mul_4_output_0 + - /language_model/layers.10/self_attn/Mul_5_output_0 + - /language_model/layers.10/self_attn/Mul_6_output_0 + - /language_model/layers.10/self_attn/Mul_7_output_0 + - /language_model/layers.10/self_attn/Mul_8_output_0 + - /language_model/layers.10/self_attn/Mul_9_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_1_output_0 + - /language_model/layers.11/self_attn/Mul_2_output_0 + - /language_model/layers.11/self_attn/Mul_3_output_0 + - /language_model/layers.11/self_attn/Mul_4_output_0 + - /language_model/layers.11/self_attn/Mul_5_output_0 + - /language_model/layers.11/self_attn/Mul_6_output_0 + - /language_model/layers.11/self_attn/Mul_7_output_0 + - /language_model/layers.11/self_attn/Mul_8_output_0 + - /language_model/layers.11/self_attn/Mul_9_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_1_output_0 + - /language_model/layers.12/self_attn/Mul_2_output_0 + - /language_model/layers.12/self_attn/Mul_3_output_0 + - /language_model/layers.12/self_attn/Mul_4_output_0 + - /language_model/layers.12/self_attn/Mul_5_output_0 + - /language_model/layers.12/self_attn/Mul_6_output_0 + - /language_model/layers.12/self_attn/Mul_7_output_0 + - /language_model/layers.12/self_attn/Mul_8_output_0 + - /language_model/layers.12/self_attn/Mul_9_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_1_output_0 + - /language_model/layers.13/self_attn/Mul_2_output_0 + - /language_model/layers.13/self_attn/Mul_3_output_0 + - /language_model/layers.13/self_attn/Mul_4_output_0 + - /language_model/layers.13/self_attn/Mul_5_output_0 + - 
/language_model/layers.13/self_attn/Mul_6_output_0 + - /language_model/layers.13/self_attn/Mul_7_output_0 + - /language_model/layers.13/self_attn/Mul_8_output_0 + - /language_model/layers.13/self_attn/Mul_9_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_1_output_0 + - /language_model/layers.14/self_attn/Mul_2_output_0 + - /language_model/layers.14/self_attn/Mul_3_output_0 + - /language_model/layers.14/self_attn/Mul_4_output_0 + - /language_model/layers.14/self_attn/Mul_5_output_0 + - /language_model/layers.14/self_attn/Mul_6_output_0 + - /language_model/layers.14/self_attn/Mul_7_output_0 + - /language_model/layers.14/self_attn/Mul_8_output_0 + - /language_model/layers.14/self_attn/Mul_9_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_1_output_0 + - /language_model/layers.15/self_attn/Mul_2_output_0 + - /language_model/layers.15/self_attn/Mul_3_output_0 + - /language_model/layers.15/self_attn/Mul_4_output_0 + - /language_model/layers.15/self_attn/Mul_5_output_0 + - /language_model/layers.15/self_attn/Mul_6_output_0 + - /language_model/layers.15/self_attn/Mul_7_output_0 + - /language_model/layers.15/self_attn/Mul_8_output_0 + - /language_model/layers.15/self_attn/Mul_9_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_1_output_0 + - /language_model/layers.16/self_attn/Mul_2_output_0 + - /language_model/layers.16/self_attn/Mul_3_output_0 + - /language_model/layers.16/self_attn/Mul_4_output_0 + - /language_model/layers.16/self_attn/Mul_5_output_0 + - /language_model/layers.16/self_attn/Mul_6_output_0 + - /language_model/layers.16/self_attn/Mul_7_output_0 + - /language_model/layers.16/self_attn/Mul_8_output_0 + - /language_model/layers.16/self_attn/Mul_9_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_1_output_0 + - 
/language_model/layers.17/self_attn/Mul_2_output_0 + - /language_model/layers.17/self_attn/Mul_3_output_0 + - /language_model/layers.17/self_attn/Mul_4_output_0 + - /language_model/layers.17/self_attn/Mul_5_output_0 + - /language_model/layers.17/self_attn/Mul_6_output_0 + - /language_model/layers.17/self_attn/Mul_7_output_0 + - /language_model/layers.17/self_attn/Mul_8_output_0 + - /language_model/layers.17/self_attn/Mul_9_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_1_output_0 + - /language_model/layers.18/self_attn/Mul_2_output_0 + - /language_model/layers.18/self_attn/Mul_3_output_0 + - /language_model/layers.18/self_attn/Mul_4_output_0 + - /language_model/layers.18/self_attn/Mul_5_output_0 + - /language_model/layers.18/self_attn/Mul_6_output_0 + - /language_model/layers.18/self_attn/Mul_7_output_0 + - /language_model/layers.18/self_attn/Mul_8_output_0 + - /language_model/layers.18/self_attn/Mul_9_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_1_output_0 + - /language_model/layers.19/self_attn/Mul_2_output_0 + - /language_model/layers.19/self_attn/Mul_3_output_0 + - /language_model/layers.19/self_attn/Mul_4_output_0 + - /language_model/layers.19/self_attn/Mul_5_output_0 + - /language_model/layers.19/self_attn/Mul_6_output_0 + - /language_model/layers.19/self_attn/Mul_7_output_0 + - /language_model/layers.19/self_attn/Mul_8_output_0 + - /language_model/layers.19/self_attn/Mul_9_output_0 + - /language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_1_output_0 + - /language_model/layers.20/self_attn/Mul_2_output_0 + - /language_model/layers.20/self_attn/Mul_3_output_0 + - /language_model/layers.20/self_attn/Mul_4_output_0 + - /language_model/layers.20/self_attn/Mul_5_output_0 + - /language_model/layers.20/self_attn/Mul_6_output_0 + - /language_model/layers.20/self_attn/Mul_7_output_0 + - 
/language_model/layers.20/self_attn/Mul_8_output_0 + - /language_model/layers.20/self_attn/Mul_9_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_1_output_0 + - /language_model/layers.21/self_attn/Mul_2_output_0 + - /language_model/layers.21/self_attn/Mul_3_output_0 + - /language_model/layers.21/self_attn/Mul_4_output_0 + - /language_model/layers.21/self_attn/Mul_5_output_0 + - /language_model/layers.21/self_attn/Mul_6_output_0 + - /language_model/layers.21/self_attn/Mul_7_output_0 + - /language_model/layers.21/self_attn/Mul_8_output_0 + - /language_model/layers.21/self_attn/Mul_9_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_1_output_0 + - /language_model/layers.22/self_attn/Mul_2_output_0 + - /language_model/layers.22/self_attn/Mul_3_output_0 + - /language_model/layers.22/self_attn/Mul_4_output_0 + - /language_model/layers.22/self_attn/Mul_5_output_0 + - /language_model/layers.22/self_attn/Mul_6_output_0 + - /language_model/layers.22/self_attn/Mul_7_output_0 + - /language_model/layers.22/self_attn/Mul_8_output_0 + - /language_model/layers.22/self_attn/Mul_9_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_1_output_0 + - /language_model/layers.23/self_attn/Mul_2_output_0 + - /language_model/layers.23/self_attn/Mul_3_output_0 + - /language_model/layers.23/self_attn/Mul_4_output_0 + - /language_model/layers.23/self_attn/Mul_5_output_0 + - /language_model/layers.23/self_attn/Mul_6_output_0 + - /language_model/layers.23/self_attn/Mul_7_output_0 + - /language_model/layers.23/self_attn/Mul_8_output_0 + - /language_model/layers.23/self_attn/Mul_9_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_1_output_0 + - /language_model/layers.24/self_attn/Mul_2_output_0 + - /language_model/layers.24/self_attn/Mul_3_output_0 + - 
/language_model/layers.24/self_attn/Mul_4_output_0 + - /language_model/layers.24/self_attn/Mul_5_output_0 + - /language_model/layers.24/self_attn/Mul_6_output_0 + - /language_model/layers.24/self_attn/Mul_7_output_0 + - /language_model/layers.24/self_attn/Mul_8_output_0 + - /language_model/layers.24/self_attn/Mul_9_output_0 + - /language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.25/self_attn/Mul_1_output_0 + - /language_model/layers.25/self_attn/Mul_2_output_0 + - /language_model/layers.25/self_attn/Mul_3_output_0 + - /language_model/layers.25/self_attn/Mul_4_output_0 + - /language_model/layers.25/self_attn/Mul_5_output_0 + - /language_model/layers.25/self_attn/Mul_6_output_0 + - /language_model/layers.25/self_attn/Mul_7_output_0 + - /language_model/layers.25/self_attn/Mul_8_output_0 + - /language_model/layers.25/self_attn/Mul_9_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_1_output_0 + - /language_model/layers.26/self_attn/Mul_2_output_0 + - /language_model/layers.26/self_attn/Mul_3_output_0 + - /language_model/layers.26/self_attn/Mul_4_output_0 + - /language_model/layers.26/self_attn/Mul_5_output_0 + - /language_model/layers.26/self_attn/Mul_6_output_0 + - /language_model/layers.26/self_attn/Mul_7_output_0 + - /language_model/layers.26/self_attn/Mul_8_output_0 + - /language_model/layers.26/self_attn/Mul_9_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_1_output_0 + - /language_model/layers.27/self_attn/Mul_2_output_0 + - /language_model/layers.27/self_attn/Mul_3_output_0 + - /language_model/layers.27/self_attn/Mul_4_output_0 + - /language_model/layers.27/self_attn/Mul_5_output_0 + - /language_model/layers.27/self_attn/Mul_6_output_0 + - /language_model/layers.27/self_attn/Mul_7_output_0 + - /language_model/layers.27/self_attn/Mul_8_output_0 + - /language_model/layers.27/self_attn/Mul_9_output_0 + - 
/language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_1_output_0 + - /language_model/layers.28/self_attn/Mul_2_output_0 + - /language_model/layers.28/self_attn/Mul_3_output_0 + - /language_model/layers.28/self_attn/Mul_4_output_0 + - /language_model/layers.28/self_attn/Mul_5_output_0 + - /language_model/layers.28/self_attn/Mul_6_output_0 + - /language_model/layers.28/self_attn/Mul_7_output_0 + - /language_model/layers.28/self_attn/Mul_8_output_0 + - /language_model/layers.28/self_attn/Mul_9_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_1_output_0 + - /language_model/layers.29/self_attn/Mul_2_output_0 + - /language_model/layers.29/self_attn/Mul_3_output_0 + - /language_model/layers.29/self_attn/Mul_4_output_0 + - /language_model/layers.29/self_attn/Mul_5_output_0 + - /language_model/layers.29/self_attn/Mul_6_output_0 + - /language_model/layers.29/self_attn/Mul_7_output_0 + - /language_model/layers.29/self_attn/Mul_8_output_0 + - /language_model/layers.29/self_attn/Mul_9_output_0 + - /language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.30/self_attn/Mul_1_output_0 + - /language_model/layers.30/self_attn/Mul_2_output_0 + - /language_model/layers.30/self_attn/Mul_3_output_0 + - /language_model/layers.30/self_attn/Mul_4_output_0 + - /language_model/layers.30/self_attn/Mul_5_output_0 + - /language_model/layers.30/self_attn/Mul_6_output_0 + - /language_model/layers.30/self_attn/Mul_7_output_0 + - /language_model/layers.30/self_attn/Mul_8_output_0 + - /language_model/layers.30/self_attn/Mul_9_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_1_output_0 + - /language_model/layers.31/self_attn/Mul_2_output_0 + - /language_model/layers.31/self_attn/Mul_3_output_0 + - /language_model/layers.31/self_attn/Mul_4_output_0 + - /language_model/layers.31/self_attn/Mul_5_output_0 + - 
/language_model/layers.31/self_attn/Mul_6_output_0 + - /language_model/layers.31/self_attn/Mul_7_output_0 + - /language_model/layers.31/self_attn/Mul_8_output_0 + - /language_model/layers.31/self_attn/Mul_9_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_1_output_0 + - /language_model/layers.32/self_attn/Mul_2_output_0 + - /language_model/layers.32/self_attn/Mul_3_output_0 + - /language_model/layers.32/self_attn/Mul_4_output_0 + - /language_model/layers.32/self_attn/Mul_5_output_0 + - /language_model/layers.32/self_attn/Mul_6_output_0 + - /language_model/layers.32/self_attn/Mul_7_output_0 + - /language_model/layers.32/self_attn/Mul_8_output_0 + - /language_model/layers.32/self_attn/Mul_9_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_1_output_0 + - /language_model/layers.33/self_attn/Mul_2_output_0 + - /language_model/layers.33/self_attn/Mul_3_output_0 + - /language_model/layers.33/self_attn/Mul_4_output_0 + - /language_model/layers.33/self_attn/Mul_5_output_0 + - /language_model/layers.33/self_attn/Mul_6_output_0 + - /language_model/layers.33/self_attn/Mul_7_output_0 + - /language_model/layers.33/self_attn/Mul_8_output_0 + - /language_model/layers.33/self_attn/Mul_9_output_0 + - /language_model/layers.0/self_attn/Softmax_output_0 + - /language_model/layers.1/self_attn/Softmax_output_0 + - /language_model/layers.2/self_attn/Softmax_output_0 + - /language_model/layers.3/self_attn/Softmax_output_0 + - /language_model/layers.4/self_attn/Softmax_output_0 + - /language_model/layers.5/self_attn/Softmax_output_0 + - /language_model/layers.6/self_attn/Softmax_output_0 + - /language_model/layers.7/self_attn/Softmax_output_0 + - /language_model/layers.8/self_attn/Softmax_output_0 + - /language_model/layers.9/self_attn/Softmax_output_0 + - /language_model/layers.10/self_attn/Softmax_output_0 + - /language_model/layers.11/self_attn/Softmax_output_0 + - 
/language_model/layers.12/self_attn/Softmax_output_0 + - /language_model/layers.13/self_attn/Softmax_output_0 + - /language_model/layers.14/self_attn/Softmax_output_0 + - /language_model/layers.15/self_attn/Softmax_output_0 + - /language_model/layers.16/self_attn/Softmax_output_0 + - /language_model/layers.17/self_attn/Softmax_output_0 + - /language_model/layers.18/self_attn/Softmax_output_0 + - /language_model/layers.19/self_attn/Softmax_output_0 + - /language_model/layers.20/self_attn/Softmax_output_0 + - /language_model/layers.21/self_attn/Softmax_output_0 + - /language_model/layers.22/self_attn/Softmax_output_0 + - /language_model/layers.23/self_attn/Softmax_output_0 + - /language_model/layers.24/self_attn/Softmax_output_0 + - /language_model/layers.25/self_attn/Softmax_output_0 + - /language_model/layers.26/self_attn/Softmax_output_0 + - /language_model/layers.27/self_attn/Softmax_output_0 + - /language_model/layers.28/self_attn/Softmax_output_0 + - /language_model/layers.29/self_attn/Softmax_output_0 + - /language_model/layers.30/self_attn/Softmax_output_0 + - /language_model/layers.31/self_attn/Softmax_output_0 + - /language_model/layers.32/self_attn/Softmax_output_0 + - /language_model/layers.33/self_attn/Softmax_output_0 + From 0520c2d7f6f9c10363a56206497691b58160bde8 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Mon, 1 Dec 2025 16:42:42 -0800 Subject: [PATCH 06/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- examples/performance/compute_context_length/gemma3.py | 4 ++-- examples/performance/compute_context_length/gpt_oss.py | 4 ++-- examples/performance/compute_context_length/llama4.py | 8 ++++---- .../performance/compute_context_length/qwen2_5_vl_cb.py | 2 +- examples/performance/compute_context_length/qwen3moe.py | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/performance/compute_context_length/gemma3.py 
b/examples/performance/compute_context_length/gemma3.py index 830e2ce43..9127afad1 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -29,8 +29,8 @@ ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. ctx_len = 8192 -comp_ctx_lengths_prefill = [3072] # None # -comp_ctx_lengths_decode = [4096, ctx_len] # None # +comp_ctx_lengths_prefill = [3072] +comp_ctx_lengths_decode = [4096, ctx_len] # pass HF_TOKEN if gated model # For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index ff5334d4e..dbbec22d2 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -22,9 +22,9 @@ ctx_len = 4096 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists. 
# Set the list of ccl during prefilling process -comp_ctx_lengths_prefill = [512, ctx_len] # None # +comp_ctx_lengths_prefill = [512, ctx_len] # Set the list of ccl during decoding process -comp_ctx_lengths_decode = [512, ctx_len] # None # +comp_ctx_lengths_decode = [512, ctx_len] qeff_model = QEFFAutoModelForCausalLM.from_pretrained( diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py index cb0cb1939..c2cc5a84a 100644 --- a/examples/performance/compute_context_length/llama4.py +++ b/examples/performance/compute_context_length/llama4.py @@ -51,7 +51,7 @@ ctx_len=ctx_len, img_size=336, num_cores=16, - num_devices=4, + num_devices=8, max_num_tiles=17, mxfp6_matmul=True, mxint8_kv_cache=True, @@ -83,7 +83,7 @@ ) streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100) + output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) @@ -95,7 +95,7 @@ ctx_len=ctx_len, img_size=336, num_cores=16, - num_devices=4, + num_devices=8, max_num_tiles=17, mxfp6_matmul=True, mxint8_kv_cache=True, @@ -129,7 +129,7 @@ ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[8, 9, 10, 11], generation_len=100) + output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index 75fece6db..990cf95be 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -81,7 
+81,7 @@ processor=processor, images=image_urls, generation_len=100, - device_ids=[28, 29, 30, 31], + device_ids=[0,1,2,3], ) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py index 9fb4c4d43..2daea7db2 100644 --- a/examples/performance/compute_context_length/qwen3moe.py +++ b/examples/performance/compute_context_length/qwen3moe.py @@ -12,7 +12,7 @@ model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507" """ -# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function +# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mxint8 argument in compile function # We will use prompt_len=1 for compilation for both cb and non-cb inference """ @@ -27,8 +27,8 @@ ctx_len = 1024 prefill_seq_len = 1 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations. 
-comp_ctx_lengths_prefill = [256, 512, ctx_len] # None # -comp_ctx_lengths_decode = [256, 512, ctx_len] # None # +comp_ctx_lengths_prefill = [256, 512, ctx_len] +comp_ctx_lengths_decode = [256, 512, ctx_len] model = QEFFAutoModelForCausalLM.from_pretrained( model_name, @@ -49,6 +49,6 @@ comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) -# mos=1, + tokenizer = AutoTokenizer.from_pretrained(model_name) exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer) From a97115aea47426b3a8cd5bf43d7fe924becb332d Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Tue, 2 Dec 2025 14:46:12 -0800 Subject: [PATCH 07/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- .../transformers/models/modeling_auto.py | 46 +++++++++---------- .../compute_context_length/basic_inference.py | 4 +- .../compute_context_length/gemma3.py | 10 ++-- .../compute_context_length/gpt_oss.py | 11 ++--- .../compute_context_length/granite_vision.py | 4 +- .../compute_context_length/internvl.py | 4 +- .../compute_context_length/llama4.py | 4 +- .../compute_context_length/llama4_cb.py | 8 +++- .../llama4_multi_image.py | 4 +- .../compute_context_length/mistral3.py | 4 +- .../compute_context_length/molmo.py | 8 ++-- .../compute_context_length/qwen2_5_vl.py | 7 ++- .../compute_context_length/qwen2_5_vl_cb.py | 5 +- .../compute_context_length/qwen3moe.py | 8 ++-- .../compute_context_length/vlm_inference.py | 6 ++- 15 files changed, 80 insertions(+), 53 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index dfa64270d..ce49b26aa 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -914,7 +914,7 @@ def __init__( self, model: nn.Module, continuous_batching: bool = False, - ccl_enabled: bool = False, + qaic_config: Optional[dict] = 
None, **kwargs, ): """ @@ -940,7 +940,7 @@ def __init__( self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs) self.continuous_batching = continuous_batching - self.ccl_enabled = ccl_enabled + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.input_shapes, self.output_names = None, None @@ -960,7 +960,7 @@ def model_name(self) -> str: return mname @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs): """ Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path. @@ -985,13 +985,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - ccl_enabled = kwargs.pop("ccl_enabled", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - ccl_enabled=ccl_enabled, + qaic_config=qaic_config, **kwargs, ) @@ -1195,8 +1194,9 @@ def compile( # For supporting VLLM and Disaggregated with CCL if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: - self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill - self.comp_ctx_lengths_decode = comp_ctx_lengths_decode + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -1619,7 +1619,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, 
Multimodal def __init__( self, model: nn.Module, - ccl_enabled: bool = False, + qaic_config: Optional[dict] = None, **kwargs, ): """ @@ -1661,7 +1661,7 @@ def __init__( else: self.model.config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ - self.ccl_enabled = ccl_enabled + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.model.qaic_config is not None and self.model.qaic_config.get("num_kv_blocks", None) is not None: @@ -1671,6 +1671,7 @@ def __init__( def from_pretrained( cls, pretrained_model_name_or_path, + qaic_config: Optional[dict] = None, *args, **kwargs, ): @@ -1701,7 +1702,6 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - ccl_enabled = kwargs.pop("ccl_enabled", None) from transformers import AutoConfig @@ -1713,7 +1713,7 @@ def from_pretrained( return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - ccl_enabled=ccl_enabled, + qaic_config=qaic_config, **kwargs, ) @@ -1840,8 +1840,9 @@ def compile( # For supporting VLLM and Disaggregated with CCL if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: - self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill - self.comp_ctx_lengths_decode = comp_ctx_lengths_decode + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) # Get specializations from modelling file # TODO: expose this via the auto class as well @@ -2224,7 +2225,7 @@ def __new__( model: nn.Module, kv_offload: Optional[bool] = True, continuous_batching: bool = False, - ccl_enabled: bool = False, + qaic_config: Optional[dict] = None, **kwargs, ): """ @@ -2248,10 +2249,10 @@ def __new__( """ if kv_offload: return _QEffAutoModelForImageTextToTextDualQPC( - model, 
continuous_batching, ccl_enabled=ccl_enabled, **kwargs + model, continuous_batching, qaic_config=qaic_config, **kwargs ) else: - return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs) + return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs) @classmethod @with_replaced_quantizers @@ -2301,7 +2302,6 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - ccl_enabled = kwargs.pop("ccl_enabled", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( @@ -2309,7 +2309,7 @@ def from_pretrained( kv_offload=kv_offload, continuous_batching=continuous_batching, pretrained_model_name_or_path=pretrained_model_name_or_path, - ccl_enabled=ccl_enabled, + qaic_config=qaic_config, **kwargs, ) @@ -2360,7 +2360,6 @@ def __init__( model: nn.Module, continuous_batching: bool = False, qaic_config: Optional[dict] = None, - ccl_enabled: bool = False, **kwargs, ): """ @@ -2416,7 +2415,7 @@ def __init__( self.is_tlm = transformed self.hash_params["qeff_auto_class"] = self.__class__.__name__ - self.ccl_enabled = ccl_enabled + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None # ---Sampling--- @@ -2513,7 +2512,6 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kv_offload = kwargs.pop("kv_offload", None) - ccl_enabled = kwargs.pop("ccl_enabled", None) kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) @@ -2527,7 +2525,7 @@ def from_pretrained( model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, - ccl_enabled=ccl_enabled, + qaic_config=qaic_config, **kwargs, ) return cls( @@ -2535,7 +2533,6 @@ def from_pretrained( 
continuous_batching=continuous_batching, qaic_config=qaic_config, pretrained_model_name_or_path=pretrained_model_name_or_path, - ccl_enabled=ccl_enabled, **kwargs, ) @@ -2983,6 +2980,9 @@ def compile( self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill self.comp_ctx_lengths_decode = comp_ctx_lengths_decode + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) # --- Validation --- if prefill_only is not None and not isinstance(prefill_only, bool): raise TypeError("`prefill_only` must be a boolean.") diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py index 425d038d9..9615f0b30 100644 --- a/examples/performance/compute_context_length/basic_inference.py +++ b/examples/performance/compute_context_length/basic_inference.py @@ -117,7 +117,9 @@ def main(): model = QEFFAutoModelForCausalLM.from_pretrained( args.model_name, continuous_batching=args.continuous_batching, - ccl_enabled=args.ccl_enabled, + qaic_config={ + "ccl_enabled":args.ccl_enabled, + }, ) # Compile the model diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index 9127afad1..3d9647a6e 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -38,8 +38,10 @@ model_id, config=config, attn_implementation="eager", - kv_offload=True, - ccl_enabled=True, + kv_offload=False, + qaic_config={ + "ccl_enabled":True, + }, ) ### use skip_vision=True, if want to run only text, or false ### @@ -58,7 +60,7 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, - node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", + node_precision_info="examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml", 
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) @@ -96,7 +98,7 @@ mxint8_kv_cache=False, aic_enable_depth_first=True, mos=1, - node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", + node_precision_info="examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml", comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index dbbec22d2..c2f8398c1 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -21,15 +21,14 @@ ctx_len = 4096 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists. -# Set the list of ccl during prefilling process -comp_ctx_lengths_prefill = [512, ctx_len] -# Set the list of ccl during decoding process -comp_ctx_lengths_decode = [512, ctx_len] - +# Set the list of ccl during prefilling and decoding processes +comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [1024, ctx_len] qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py index 507ba11a4..9cb0afb04 100644 --- a/examples/performance/compute_context_length/granite_vision.py +++ b/examples/performance/compute_context_length/granite_vision.py @@ -41,7 +41,9 @@ def run_model( model_name, token=token, kv_offload=kv_offload, - ccl_enabled=ccl_enabled, + qaic_config={ + "ccl_enabled":ccl_enabled, + }, ) ## STEP - 2 Export & Compile the Model diff --git 
a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py index bea3b49d3..86a73fbbb 100644 --- a/examples/performance/compute_context_length/internvl.py +++ b/examples/performance/compute_context_length/internvl.py @@ -188,7 +188,9 @@ def run_intern_on_aic( model_name, kv_offload=kv_offload, trust_remote_code=True, - ccl_enabled=ccl_enabled, + qaic_config={ + "ccl_enabled":ccl_enabled, + }, ) ## STEP 2 -- EXPORT & COMPILE THE MODEL diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py index c2cc5a84a..774f2003f 100644 --- a/examples/performance/compute_context_length/llama4.py +++ b/examples/performance/compute_context_length/llama4.py @@ -36,7 +36,9 @@ attn_implementation="eager", kv_offload=True, config=config, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py index 98653080c..119fb8256 100644 --- a/examples/performance/compute_context_length/llama4_cb.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -41,7 +41,9 @@ kv_offload=True, config=config, continuous_batching=True, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) qeff_model.compile( @@ -66,7 +68,9 @@ attn_implementation="eager", kv_offload=True, config=config, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) qeff_model.compile( diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py index 0fe8ffb78..8764de9ce 100644 --- a/examples/performance/compute_context_length/llama4_multi_image.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -36,7 +36,9 @@ 
attn_implementation="eager", kv_offload=True, config=config, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py index 664e3c1c1..ae6d4b072 100644 --- a/examples/performance/compute_context_length/mistral3.py +++ b/examples/performance/compute_context_length/mistral3.py @@ -46,7 +46,9 @@ def run_model( model_name, kv_offload=kv_offload, config=config, - ccl_enabled=ccl_enabled, + qaic_config={ + "ccl_enabled":ccl_enabled, + }, ) ## STEP - 2 Export & Compile the Model diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py index 7434c62b2..6875068d2 100644 --- a/examples/performance/compute_context_length/molmo.py +++ b/examples/performance/compute_context_length/molmo.py @@ -15,7 +15,7 @@ model_id = "allenai/Molmo-7B-D-0924" config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) - +# For Testing Purpose Only # config.num_hidden_layers = 2 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). 
@@ -33,10 +33,12 @@ qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, - kv_offload=True, + kv_offload=False, trust_remote_code=True, config=config, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py index 3266be634..86da61a5f 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl.py +++ b/examples/performance/compute_context_length/qwen2_5_vl.py @@ -19,7 +19,8 @@ ## For AWQ model update pytorch version to 2.8.* model_id = "Qwen/Qwen2.5-VL-32B-Instruct" config = AutoConfig.from_pretrained(model_id) -# config.text_config.num_hidden_layers = 2 +# For Testing Purpose Only +config.text_config.num_hidden_layers = 2 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. 
@@ -38,7 +39,9 @@ attn_implementation="eager", kv_offload=True, config=config, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index 990cf95be..485a36694 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -16,6 +16,7 @@ ## For AWQ model update pytorch version to 2.8.* model_id = "Qwen/Qwen2.5-VL-32B-Instruct" config = AutoConfig.from_pretrained(model_id) +# For Testing Purpose Only config.text_config.num_hidden_layers = 4 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). @@ -36,7 +37,9 @@ kv_offload=True, config=config, continuous_batching=True, - ccl_enabled=True, + qaic_config={ + "ccl_enabled":True, + }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py index 2daea7db2..fd70f4684 100644 --- a/examples/performance/compute_context_length/qwen3moe.py +++ b/examples/performance/compute_context_length/qwen3moe.py @@ -27,14 +27,14 @@ ctx_len = 1024 prefill_seq_len = 1 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations. 
-comp_ctx_lengths_prefill = [256, 512, ctx_len] -comp_ctx_lengths_decode = [256, 512, ctx_len] +comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [256, 512, ctx_len] model = QEFFAutoModelForCausalLM.from_pretrained( model_name, continuous_batching=False, - ccl_enabled=True, - num_hidden_layers=4, + qaic_config={ + "ccl_enabled":True, + }, ) model.compile( diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py index f9577bd2c..6ec132a17 100644 --- a/examples/performance/compute_context_length/vlm_inference.py +++ b/examples/performance/compute_context_length/vlm_inference.py @@ -77,7 +77,9 @@ def run_model( token=hf_token, attn_implementation="eager", kv_offload=kv_offload, - ccl_enabled=ccl_enabled, + qaic_config={ + "ccl_enabled":ccl_enabled, + }, ) ## STEP 2: Export & Compile the Model @@ -202,7 +204,7 @@ def main(): parser.add_argument( "--img-size", type=int, - default=336, + default=560, help="Image size for processing", ) parser.add_argument( From b9613c422e0901028d1cacced705736f48973bbb Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Tue, 2 Dec 2025 14:50:40 -0800 Subject: [PATCH 08/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- examples/performance/compute_context_length/gemma3.py | 2 +- examples/performance/compute_context_length/molmo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index 3d9647a6e..752640ef7 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -38,7 +38,7 @@ model_id, config=config, attn_implementation="eager", - kv_offload=False, + kv_offload=True, qaic_config={ "ccl_enabled":True, }, diff --git a/examples/performance/compute_context_length/molmo.py 
b/examples/performance/compute_context_length/molmo.py index 6875068d2..3d3bae058 100644 --- a/examples/performance/compute_context_length/molmo.py +++ b/examples/performance/compute_context_length/molmo.py @@ -33,7 +33,7 @@ qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, - kv_offload=False, + kv_offload=True, trust_remote_code=True, config=config, qaic_config={ From e4579b6e03b1b0eb1951396e525ac195bf070a3f Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Wed, 3 Dec 2025 16:52:14 -0800 Subject: [PATCH 09/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- QEfficient/transformers/models/modeling_auto.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ce49b26aa..3f6f191a5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -940,7 +940,9 @@ def __init__( self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs) self.continuous_batching = continuous_batching - self.ccl_enabled = qaic_config.get("ccl_enabled", False) + self.ccl_enabled = False + if qaic_config: + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.input_shapes, self.output_names = None, None @@ -1661,7 +1663,9 @@ def __init__( else: self.model.config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ - self.ccl_enabled = qaic_config.get("ccl_enabled", False) + self.ccl_enabled = False + if qaic_config: + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None if self.model.qaic_config is not None and 
self.model.qaic_config.get("num_kv_blocks", None) is not None: @@ -2415,7 +2419,9 @@ def __init__( self.is_tlm = transformed self.hash_params["qeff_auto_class"] = self.__class__.__name__ - self.ccl_enabled = qaic_config.get("ccl_enabled", False) + self.ccl_enabled = False + if qaic_config: + self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None # ---Sampling--- From 70938296c047542d3070bf2531dfd64137153cde Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Wed, 3 Dec 2025 23:15:33 -0800 Subject: [PATCH 10/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- .../performance/compute_context_length/basic_inference.py | 2 +- examples/performance/compute_context_length/gemma3.py | 4 ++-- examples/performance/compute_context_length/gpt_oss.py | 2 +- examples/performance/compute_context_length/granite_vision.py | 2 +- examples/performance/compute_context_length/internvl.py | 2 +- examples/performance/compute_context_length/llama4.py | 2 +- examples/performance/compute_context_length/llama4_cb.py | 4 ++-- .../performance/compute_context_length/llama4_multi_image.py | 2 +- examples/performance/compute_context_length/mistral3.py | 2 +- examples/performance/compute_context_length/molmo.py | 2 +- examples/performance/compute_context_length/qwen2_5_vl.py | 2 +- examples/performance/compute_context_length/qwen2_5_vl_cb.py | 4 ++-- examples/performance/compute_context_length/qwen3moe.py | 2 +- examples/performance/compute_context_length/vlm_inference.py | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py index 9615f0b30..4533c47e8 100644 --- a/examples/performance/compute_context_length/basic_inference.py +++ b/examples/performance/compute_context_length/basic_inference.py @@ -118,7 
+118,7 @@ def main(): args.model_name, continuous_batching=args.continuous_batching, qaic_config={ - "ccl_enabled":args.ccl_enabled, + "ccl_enabled": args.ccl_enabled, }, ) diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index 752640ef7..d9672b9e3 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -30,7 +30,7 @@ ctx_len = 8192 comp_ctx_lengths_prefill = [3072] -comp_ctx_lengths_decode = [4096, ctx_len] +comp_ctx_lengths_decode = [4096, ctx_len] # pass HF_TOKEN if gated model # For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### @@ -40,7 +40,7 @@ attn_implementation="eager", kv_offload=True, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index c2f8398c1..39a5d48ed 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -27,7 +27,7 @@ qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py index 9cb0afb04..6dd38395c 100644 --- a/examples/performance/compute_context_length/granite_vision.py +++ b/examples/performance/compute_context_length/granite_vision.py @@ -42,7 +42,7 @@ def run_model( token=token, kv_offload=kv_offload, qaic_config={ - "ccl_enabled":ccl_enabled, + "ccl_enabled": ccl_enabled, }, ) diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py index 86a73fbbb..19bcf4bc1 100644 --- 
a/examples/performance/compute_context_length/internvl.py +++ b/examples/performance/compute_context_length/internvl.py @@ -189,7 +189,7 @@ def run_intern_on_aic( kv_offload=kv_offload, trust_remote_code=True, qaic_config={ - "ccl_enabled":ccl_enabled, + "ccl_enabled": ccl_enabled, }, ) diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py index 774f2003f..8cdbd70a1 100644 --- a/examples/performance/compute_context_length/llama4.py +++ b/examples/performance/compute_context_length/llama4.py @@ -37,7 +37,7 @@ kv_offload=True, config=config, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py index 119fb8256..ffbbff67f 100644 --- a/examples/performance/compute_context_length/llama4_cb.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -42,7 +42,7 @@ config=config, continuous_batching=True, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) @@ -69,7 +69,7 @@ kv_offload=True, config=config, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py index 8764de9ce..fd513fe45 100644 --- a/examples/performance/compute_context_length/llama4_multi_image.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -37,7 +37,7 @@ kv_offload=True, config=config, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py index ae6d4b072..3763fbcde 100644 --- a/examples/performance/compute_context_length/mistral3.py 
+++ b/examples/performance/compute_context_length/mistral3.py @@ -47,7 +47,7 @@ def run_model( kv_offload=kv_offload, config=config, qaic_config={ - "ccl_enabled":ccl_enabled, + "ccl_enabled": ccl_enabled, }, ) diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py index 3d3bae058..b5f1f50e6 100644 --- a/examples/performance/compute_context_length/molmo.py +++ b/examples/performance/compute_context_length/molmo.py @@ -37,7 +37,7 @@ trust_remote_code=True, config=config, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py index 86da61a5f..20960b6a9 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl.py +++ b/examples/performance/compute_context_length/qwen2_5_vl.py @@ -40,7 +40,7 @@ kv_offload=True, config=config, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index 485a36694..fc330e14e 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -38,7 +38,7 @@ config=config, continuous_batching=True, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) @@ -84,7 +84,7 @@ processor=processor, images=image_urls, generation_len=100, - device_ids=[0,1,2,3], + device_ids=[0, 1, 2, 3], ) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py 
index fd70f4684..b53a28362 100644 --- a/examples/performance/compute_context_length/qwen3moe.py +++ b/examples/performance/compute_context_length/qwen3moe.py @@ -33,7 +33,7 @@ model_name, continuous_batching=False, qaic_config={ - "ccl_enabled":True, + "ccl_enabled": True, }, ) diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py index 6ec132a17..876daa3e6 100644 --- a/examples/performance/compute_context_length/vlm_inference.py +++ b/examples/performance/compute_context_length/vlm_inference.py @@ -78,7 +78,7 @@ def run_model( attn_implementation="eager", kv_offload=kv_offload, qaic_config={ - "ccl_enabled":ccl_enabled, + "ccl_enabled": ccl_enabled, }, ) From 75607498c6dc84e1edbb57dd108045da326e29e7 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Thu, 4 Dec 2025 16:03:01 -0800 Subject: [PATCH 11/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- QEfficient/transformers/models/modeling_auto.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3f6f191a5..39ad30b80 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1646,12 +1646,9 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) -<<<<<<< HEAD self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) self.model.qaic_config = qaic_config -======= ->>>>>>> 29b555b (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process) # to handle internvl models if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): self.model.config.llm_config.use_cache = True From 
399fdbd6bb81fcbd504eb659a352600eb36f50e2 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Fri, 5 Dec 2025 03:24:24 -0800 Subject: [PATCH 12/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- QEfficient/transformers/models/modeling_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 39ad30b80..99ed21c90 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1646,7 +1646,6 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) self.model.qaic_config = qaic_config # to handle internvl models From c7774b8787aee4311ebd257864f1d84864cb19d8 Mon Sep 17 00:00:00 2001 From: Vahid Janfaza Date: Fri, 5 Dec 2025 13:31:23 -0800 Subject: [PATCH 13/13] Adding ccl_enabled flag during model loading and passing CCL lists during compilation process Signed-off-by: Vahid Janfaza --- QEfficient/transformers/spd/spd_transform_forward.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/QEfficient/transformers/spd/spd_transform_forward.py b/QEfficient/transformers/spd/spd_transform_forward.py index e82bf4cdf..4703cb18d 100644 --- a/QEfficient/transformers/spd/spd_transform_forward.py +++ b/QEfficient/transformers/spd/spd_transform_forward.py @@ -76,6 +76,7 @@ def tlm_forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + comp_ctx_lengths: Optional[torch.LongTensor] = None, batch_index: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, @@ 
-123,6 +124,7 @@ def tlm_forward( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, + comp_ctx_lengths=comp_ctx_lengths, batch_index=batch_index, inputs_embeds=inputs_embeds, use_cache=use_cache,