From 1ba2224746f0192fed6c861db37f2e4c68efc182 Mon Sep 17 00:00:00 2001 From: titaiwangms Date: Fri, 8 May 2026 17:24:05 +0000 Subject: [PATCH 1/6] feat: add components_to_skip to OnnxBlockWiseRtnQuantization Add optional `components_to_skip` parameter (list of component names) to OnnxBlockWiseRtnQuantization pass. When a composite model component's name appears in this list, its model files are copied to the output path unchanged instead of being quantized. Non-listed components are quantized as usual. When not set (default None), all components are quantized (backward compatible). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/passes/onnx/rtn_quantization.py | 64 ++++++++++++- test/passes/onnx/test_rtn_quantization.py | 107 ++++++++++++++++++++++ 2 files changed, 170 insertions(+), 1 deletion(-) diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py index e66cec112..a27419c4c 100644 --- a/olive/passes/onnx/rtn_quantization.py +++ b/olive/passes/onnx/rtn_quantization.py @@ -3,8 +3,9 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging +import shutil from pathlib import Path -from typing import Optional +from typing import List, Optional import numpy as np import numpy.typing as npt @@ -69,9 +70,70 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon default_value=None, description="List of node names to include in quantization.", ), + "components_to_skip": PassConfigParam( + type_=list, + required=False, + default_value=None, + description=( + "Optional list of component names to skip quantization for " + "(e.g. ['embedding'] to pass the embedding model through unchanged). " + "When a composite model component's name matches an entry in this list, " + "its files are copied to the output path without modification. " + "When not set, all components are quantized (default, backward compatible). 
" + "Has no effect on single-component (non-composite) models." + ), + ), **get_external_data_config(), } + def run(self, model, output_model_path: str): + """Run quantization, skipping components listed in components_to_skip. + + Overrides the base Pass.run() to intercept CompositeModelHandler processing. + Components whose names appear in config.components_to_skip are copied to the + output path unchanged instead of being quantized. + """ + from olive.model import CompositeModelHandler + from olive.model.handler.onnx import ONNXModelHandler as OnnxHandler + + components_to_skip: set[str] = set(self.config.components_to_skip or []) + if not components_to_skip or not isinstance(model, CompositeModelHandler): + return super().run(model, output_model_path) + + model_dir = Path(output_model_path).with_suffix("") + model_dir.mkdir(parents=True, exist_ok=True) + + components = [] + component_names = [] + for component_name, component_model in model.get_model_components(): + component_output_path = model_dir / component_name + if component_name in components_to_skip: + logger.info( + "OnnxBlockWiseRtnQuantization: skipping quantization for component '%s'.", + component_name, + ) + src = Path(component_model.model_path) + # model_path may point to the .onnx file rather than its parent dir + src_dir = src.parent if src.is_file() else src + if src_dir != component_output_path: + if component_output_path.exists(): + shutil.rmtree(str(component_output_path)) + shutil.copytree(str(src_dir), str(component_output_path)) + output_component = OnnxHandler( + model_path=str(component_output_path), + onnx_file_name=component_model.onnx_file_name, + model_attributes=component_model.model_attributes, + ) + else: + output_component = self.run(component_model, str(component_output_path)) + components.append(output_component) + component_names.append(component_name) + + output_model = CompositeModelHandler(components, component_names, model_path=model_dir) + output_model.model_attributes 
= output_model.model_attributes or model.model_attributes + Pass._carry_forward_additional_files(model, output_model) + return output_model + def _run_for_config( self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: str ) -> ONNXModelHandler: diff --git a/test/passes/onnx/test_rtn_quantization.py b/test/passes/onnx/test_rtn_quantization.py index edec80ec0..0b8e2d725 100644 --- a/test/passes/onnx/test_rtn_quantization.py +++ b/test/passes/onnx/test_rtn_quantization.py @@ -427,3 +427,110 @@ def test_rtn_quantization_removes_unused_initializers(self, matmul_model_path, t assert "weight" not in init_names, ( f"Original FP32 'weight' initializer should have been removed, found: {init_names}" ) + + +class TestRTNQuantizationComponentsToSkip: + """Tests for the components_to_skip parameter on OnnxBlockWiseRtnQuantization.""" + + @staticmethod + def _make_matmul_model(tmp_path, name: str) -> ONNXModelHandler: + """Create a tiny MatMul ONNX model and return an ONNXModelHandler.""" + weight = np.random.randn(64, 128).astype(np.float32) + inp = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 64]) + out = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 128]) + weight_init = onnx.helper.make_tensor(name="weight", data_type=onnx.TensorProto.FLOAT, dims=[64, 128], vals=weight.flatten().tolist()) + node = onnx.helper.make_node("MatMul", ["input", "weight"], ["output"], name="MatMul_Node") + graph = onnx.helper.make_graph([node], "g", [inp], [out], initializer=[weight_init]) + model_def = onnx.helper.make_model(graph, producer_name="test") + model_def.opset_import[0].version = 13 + + model_dir = tmp_path / name + model_dir.mkdir(parents=True, exist_ok=True) + onnx.save(model_def, str(model_dir / "model.onnx")) + return ONNXModelHandler(model_path=str(model_dir), onnx_file_name="model.onnx") + + @staticmethod + def _make_pass(components_to_skip=None) -> OnnxBlockWiseRtnQuantization: + from 
olive.hardware.accelerator import AcceleratorSpec + accelerator_spec = AcceleratorSpec(accelerator_type="CPU", execution_provider="CPUExecutionProvider") + config = {"bits": 4, "block_size": 128, "axis": 0, "is_symmetric": True} + if components_to_skip is not None: + config["components_to_skip"] = components_to_skip + return create_pass_from_dict(OnnxBlockWiseRtnQuantization, config, disable_search=True, accelerator_spec=accelerator_spec) + + def test_components_to_skip_passes_component_through_unchanged(self, tmp_path): + """Skipped component's model files are copied without quantization.""" + from olive.model.handler.composite import CompositeModelHandler + + decoder = self._make_matmul_model(tmp_path / "src", "decoder") + embedding = self._make_matmul_model(tmp_path / "src", "embedding") + + composite = CompositeModelHandler( + model_components=[decoder, embedding], + model_component_names=["decoder", "embedding"], + model_path=str(tmp_path / "src"), + ) + + p = self._make_pass(components_to_skip=["embedding"]) + result = p.run(composite, str(tmp_path / "out")) + + assert isinstance(result, CompositeModelHandler) + assert result.model_component_names == ["decoder", "embedding"] + + # decoder should be quantized (MatMulNBits present) + decoder_out = next(m for name, m in result.get_model_components() if name == "decoder") + decoder_ir = ir.load(decoder_out.model_path) + assert any(n.op_type == str(OpType.MatMulNBits) for n in decoder_ir.graph.all_nodes()), ( + "decoder should be quantized (MatMulNBits expected)" + ) + + # embedding should be unchanged (original MatMul still present) + emb_out = next(m for name, m in result.get_model_components() if name == "embedding") + emb_ir = ir.load(emb_out.model_path) + has_matmul = any(n.op_type == str(OpType.MatMul) for n in emb_ir.graph.all_nodes()) + has_nbits = any(n.op_type == str(OpType.MatMulNBits) for n in emb_ir.graph.all_nodes()) + assert has_matmul and not has_nbits, "embedding should be passed through unchanged 
(no MatMulNBits)" + + def test_components_to_skip_none_quantizes_all(self, tmp_path): + """When components_to_skip is not set, all composite components are quantized.""" + from olive.model.handler.composite import CompositeModelHandler + + decoder = self._make_matmul_model(tmp_path / "src", "decoder") + embedding = self._make_matmul_model(tmp_path / "src", "embedding") + + composite = CompositeModelHandler( + model_components=[decoder, embedding], + model_component_names=["decoder", "embedding"], + model_path=str(tmp_path / "src"), + ) + + p = self._make_pass(components_to_skip=None) + result = p.run(composite, str(tmp_path / "out")) + + assert isinstance(result, CompositeModelHandler) + + for name, component in result.get_model_components(): + component_ir = ir.load(component.model_path) + assert any(n.op_type == str(OpType.MatMulNBits) for n in component_ir.graph.all_nodes()), ( + f"component '{name}' should be quantized when components_to_skip is None" + ) + + def test_components_to_skip_does_not_affect_single_model(self, tmp_path): + """components_to_skip has no effect on non-composite (single) models.""" + model = self._make_matmul_model(tmp_path, "single") + p = self._make_pass(components_to_skip=["single"]) + result = p.run(model, str(tmp_path / "out")) + + # Single model should still be quantized despite its path matching the skip list + result_ir = ir.load(result.model_path) + assert any(n.op_type == str(OpType.MatMulNBits) for n in result_ir.graph.all_nodes()), ( + "Single-component model should be quantized even when components_to_skip is set" + ) + + def test_components_to_skip_in_default_config(self): + """components_to_skip must appear in _default_config with None as default.""" + accelerator_spec = AcceleratorSpec(accelerator_type="CPU", execution_provider="CPUExecutionProvider") + config = OnnxBlockWiseRtnQuantization._default_config(accelerator_spec) # pylint: disable=protected-access + assert "components_to_skip" in config + assert 
config["components_to_skip"].default_value is None + assert config["components_to_skip"].required is False From 89af8e7ebf565cbfd3d4094bac7ca1f4e9229efb Mon Sep 17 00:00:00 2001 From: titaiwangms Date: Fri, 8 May 2026 17:27:17 +0000 Subject: [PATCH 2/6] fix: remove unused List import and redundant required=False - Remove unused List from typing import (only Optional is needed) - Remove redundant required=False (implied by default_value=None) to match conventions of neighboring PassConfigParam declarations Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/passes/onnx/rtn_quantization.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py index a27419c4c..2a6d094d0 100644 --- a/olive/passes/onnx/rtn_quantization.py +++ b/olive/passes/onnx/rtn_quantization.py @@ -5,7 +5,7 @@ import logging import shutil from pathlib import Path -from typing import List, Optional +from typing import Optional import numpy as np import numpy.typing as npt @@ -72,7 +72,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon ), "components_to_skip": PassConfigParam( type_=list, - required=False, default_value=None, description=( "Optional list of component names to skip quantization for " From 326dbfbc5ec7d871ffd7e67f28289fb07af04caf Mon Sep 17 00:00:00 2001 From: titaiwangms Date: Fri, 8 May 2026 17:29:39 +0000 Subject: [PATCH 3/6] fix: mirror _initialize() guard from base Pass.run() in composite override The base Pass.run() gates on self._initialized before calling _initialize(). When components_to_skip is set and the model is a CompositeModelHandler, the override bypasses super().run() entirely, so the init guard must be replicated explicitly. This ensures consistent lifecycle semantics if a subclass ever overrides _initialize(). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/passes/onnx/rtn_quantization.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py index 2a6d094d0..5b1603e44 100644 --- a/olive/passes/onnx/rtn_quantization.py +++ b/olive/passes/onnx/rtn_quantization.py @@ -99,6 +99,11 @@ def run(self, model, output_model_path: str): if not components_to_skip or not isinstance(model, CompositeModelHandler): return super().run(model, output_model_path) + # Mirror the initialization guard from the base class run(). + if not self._initialized: + self._initialize() + self._initialized = True + model_dir = Path(output_model_path).with_suffix("") model_dir.mkdir(parents=True, exist_ok=True) From 9c21a2360ea05a188be36dc92ffee57faddfb3a9 Mon Sep 17 00:00:00 2001 From: titaiwangms Date: Fri, 8 May 2026 18:20:22 +0000 Subject: [PATCH 4/6] fix: address PR review comments for components_to_skip - Call _carry_forward_additional_files per-skipped component to mirror what base Pass.run() does for each individual component - Remove redundant AcceleratorSpec reimport in test _make_pass - Wrap long lines in test helpers to stay under 120 chars (Black) - Split compound assertion into two separate assertions for clarity Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/passes/onnx/rtn_quantization.py | 5 +++++ test/passes/onnx/test_rtn_quantization.py | 15 +++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py index 5b1603e44..eca0abc74 100644 --- a/olive/passes/onnx/rtn_quantization.py +++ b/olive/passes/onnx/rtn_quantization.py @@ -128,6 +128,11 @@ def run(self, model, output_model_path: str): onnx_file_name=component_model.onnx_file_name, model_attributes=component_model.model_attributes, ) + # Mirror what the base run() does for each individual component. 
+ output_component.model_attributes = ( + output_component.model_attributes or component_model.model_attributes + ) + Pass._carry_forward_additional_files(component_model, output_component) else: output_component = self.run(component_model, str(component_output_path)) components.append(output_component) diff --git a/test/passes/onnx/test_rtn_quantization.py b/test/passes/onnx/test_rtn_quantization.py index 0b8e2d725..f011cc99c 100644 --- a/test/passes/onnx/test_rtn_quantization.py +++ b/test/passes/onnx/test_rtn_quantization.py @@ -438,7 +438,12 @@ def _make_matmul_model(tmp_path, name: str) -> ONNXModelHandler: weight = np.random.randn(64, 128).astype(np.float32) inp = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 64]) out = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 128]) - weight_init = onnx.helper.make_tensor(name="weight", data_type=onnx.TensorProto.FLOAT, dims=[64, 128], vals=weight.flatten().tolist()) + weight_init = onnx.helper.make_tensor( + name="weight", + data_type=onnx.TensorProto.FLOAT, + dims=[64, 128], + vals=weight.flatten().tolist(), + ) node = onnx.helper.make_node("MatMul", ["input", "weight"], ["output"], name="MatMul_Node") graph = onnx.helper.make_graph([node], "g", [inp], [out], initializer=[weight_init]) model_def = onnx.helper.make_model(graph, producer_name="test") @@ -451,12 +456,13 @@ def _make_matmul_model(tmp_path, name: str) -> ONNXModelHandler: @staticmethod def _make_pass(components_to_skip=None) -> OnnxBlockWiseRtnQuantization: - from olive.hardware.accelerator import AcceleratorSpec accelerator_spec = AcceleratorSpec(accelerator_type="CPU", execution_provider="CPUExecutionProvider") config = {"bits": 4, "block_size": 128, "axis": 0, "is_symmetric": True} if components_to_skip is not None: config["components_to_skip"] = components_to_skip - return create_pass_from_dict(OnnxBlockWiseRtnQuantization, config, disable_search=True, accelerator_spec=accelerator_spec) + return 
create_pass_from_dict( + OnnxBlockWiseRtnQuantization, config, disable_search=True, accelerator_spec=accelerator_spec + ) def test_components_to_skip_passes_component_through_unchanged(self, tmp_path): """Skipped component's model files are copied without quantization.""" @@ -489,7 +495,8 @@ def test_components_to_skip_passes_component_through_unchanged(self, tmp_path): emb_ir = ir.load(emb_out.model_path) has_matmul = any(n.op_type == str(OpType.MatMul) for n in emb_ir.graph.all_nodes()) has_nbits = any(n.op_type == str(OpType.MatMulNBits) for n in emb_ir.graph.all_nodes()) - assert has_matmul and not has_nbits, "embedding should be passed through unchanged (no MatMulNBits)" + assert has_matmul, "embedding should still contain the original MatMul op" + assert not has_nbits, "embedding should not be quantized (no MatMulNBits expected)" def test_components_to_skip_none_quantizes_all(self, tmp_path): """When components_to_skip is not set, all composite components are quantized.""" From d937391981f9e653af305617f726d235dac6692f Mon Sep 17 00:00:00 2001 From: titaiwangms Date: Fri, 8 May 2026 18:58:05 +0000 Subject: [PATCH 5/6] fix: address PR review issues in OnnxBlockWiseRtnQuantization - Use ONNXModelHandler directly (remove OnnxHandler alias imported from internal module path) - Use list[str] type annotation for components_to_skip (matches mobius_model_builder.py convention from feat/mobius-builder-components-filter) - Add warning when components_to_skip names a component not found in the composite model (misspellings were silently ignored before) - Add comment explaining the _initialized guard in run() override: it mirrors Pass.run() behavior and is required because we bypass super().run() for composite models - Remove dead code: model_attributes was set in the constructor and then redundantly reassigned on the next line for skipped components Also adds test_components_to_skip_unknown_name_warns to cover the warning case. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/passes/onnx/rtn_quantization.py | 26 ++++++++++++------ test/passes/onnx/test_rtn_quantization.py | 32 +++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py index eca0abc74..a2daf959d 100644 --- a/olive/passes/onnx/rtn_quantization.py +++ b/olive/passes/onnx/rtn_quantization.py @@ -71,7 +71,7 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon description="List of node names to include in quantization.", ), "components_to_skip": PassConfigParam( - type_=list, + type_=list[str], default_value=None, description=( "Optional list of component names to skip quantization for " @@ -93,13 +93,27 @@ def run(self, model, output_model_path: str): output path unchanged instead of being quantized. """ from olive.model import CompositeModelHandler - from olive.model.handler.onnx import ONNXModelHandler as OnnxHandler components_to_skip: set[str] = set(self.config.components_to_skip or []) if not components_to_skip or not isinstance(model, CompositeModelHandler): return super().run(model, output_model_path) - # Mirror the initialization guard from the base class run(). + # Warn about component names that won't match anything — misspellings are + # silently ignored otherwise since skipping is non-fatal. + all_component_names = {name for name, _ in model.get_model_components()} + unknown_skips = components_to_skip - all_component_names + if unknown_skips: + logger.warning( + "OnnxBlockWiseRtnQuantization: components_to_skip contains name(s) not found " + "in this composite model: %s. Available components: %s", + sorted(unknown_skips), + sorted(all_component_names), + ) + + # Mirror the _initialized guard from the base Pass.run() implementation. 
+ # Pass.run() checks and sets self._initialized before calling _run_for_config; + # since we bypass super().run() for composite models, we must replicate it here + # so lazy initialization (e.g. loading config, setting up hardware state) still runs. if not self._initialized: self._initialize() self._initialized = True @@ -123,15 +137,11 @@ def run(self, model, output_model_path: str): if component_output_path.exists(): shutil.rmtree(str(component_output_path)) shutil.copytree(str(src_dir), str(component_output_path)) - output_component = OnnxHandler( + output_component = ONNXModelHandler( model_path=str(component_output_path), onnx_file_name=component_model.onnx_file_name, model_attributes=component_model.model_attributes, ) - # Mirror what the base run() does for each individual component. - output_component.model_attributes = ( - output_component.model_attributes or component_model.model_attributes - ) Pass._carry_forward_additional_files(component_model, output_component) else: output_component = self.run(component_model, str(component_output_path)) diff --git a/test/passes/onnx/test_rtn_quantization.py b/test/passes/onnx/test_rtn_quantization.py index f011cc99c..460cf9142 100644 --- a/test/passes/onnx/test_rtn_quantization.py +++ b/test/passes/onnx/test_rtn_quantization.py @@ -541,3 +541,35 @@ def test_components_to_skip_in_default_config(self): assert "components_to_skip" in config assert config["components_to_skip"].default_value is None assert config["components_to_skip"].required is False + + def test_components_to_skip_unknown_name_warns(self, tmp_path): + """Misspelled or missing component names in components_to_skip must log a warning.""" + from olive.model.handler.composite import CompositeModelHandler + + decoder = self._make_matmul_model(tmp_path / "src", "decoder") + vision = self._make_matmul_model(tmp_path / "src", "vision") + composite = CompositeModelHandler( + model_components=[decoder, vision], + model_component_names=["decoder", "vision"], + 
+        )
+
+        p = self._make_pass(components_to_skip=["typo_component"])
+
+        import logging
+
+        records = []
+
+        class _Handler(logging.Handler):
+            def emit(self, record):
+                records.append(record.getMessage())
+
+        rtn_logger = logging.getLogger("olive.passes.onnx.rtn_quantization")
+        rtn_logger.addHandler(_Handler())
+        try:
+            p.run(composite, str(tmp_path / "out"))
+        finally:
+            rtn_logger.handlers = [h for h in rtn_logger.handlers if not isinstance(h, _Handler)]
+
+        assert any("typo_component" in msg for msg in records), (
+            f"Expected warning about unknown component name 'typo_component', got: {records}"
+        )

From 4da744628a3346d09c111cac43c953071ee7f952 Mon Sep 17 00:00:00 2001
From: titaiwangms
Date: Fri, 8 May 2026 23:11:18 +0000
Subject: [PATCH 6/6] fix: add onnx_file_name fallback when skipping component
 in OnnxBlockWiseRtnQuantization

When copying a skipped component, onnx_file_name may be None if the
ONNXModelHandler was constructed without an explicit file name. Fall
back to the name of the file that model_path points to, and only then
to 'model.onnx' (the standard Olive convention), so the output
ONNXModelHandler never receives None or a file name that is absent
from the copied directory.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/passes/onnx/rtn_quantization.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/olive/passes/onnx/rtn_quantization.py b/olive/passes/onnx/rtn_quantization.py
index a2daf959d..9148903f4 100644
--- a/olive/passes/onnx/rtn_quantization.py
+++ b/olive/passes/onnx/rtn_quantization.py
@@ -137,9 +137,13 @@ def run(self, model, output_model_path: str):
                     if component_output_path.exists():
                         shutil.rmtree(str(component_output_path))
                     shutil.copytree(str(src_dir), str(component_output_path))
+                # onnx_file_name may be None if the handler was created without an explicit name;
+                # prefer the file model_path points to, else 'model.onnx' (standard Olive convention).
+                fallback_name = src.name if src.is_file() else "model.onnx"
+                onnx_file_name = getattr(component_model, "onnx_file_name", None) or fallback_name
                 output_component = ONNXModelHandler(
                     model_path=str(component_output_path),
-                    onnx_file_name=component_model.onnx_file_name,
+                    onnx_file_name=onnx_file_name,
                     model_attributes=component_model.model_attributes,
                 )
                 Pass._carry_forward_additional_files(component_model, output_component)