
Commit 1cb85ef

Authored by martinlsm (Martin Lindström) and oscarandersson8218
Arm backend: Test partial quantization of models (#16249)
Run tests of partial quantization on MobileNetV2 and Llama using the INT+FP profile. The tests quantize only some layer types and run the rest in floating point.

cc @freddan80 @per @zingo @oscarandersson8218 @digantdesai

Signed-off-by: Martin Lindström <Martin.Lindstroem@arm.com>
Co-authored-by: Martin Lindström <Martin.Lindstroem@arm.com>
Co-authored-by: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
1 parent 0ac5213 commit 1cb85ef
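
Both files add the same pattern, which is worth sketching on its own: drop the quantizer's global config so nothing is quantized by default, then opt individual module types back in. A minimal sketch, assuming a pipeline object that exposes the same quantizer as the tests below — the helper name quantize_only and the pipeline variable are illustrative, not part of the commit, while the API calls themselves (set_global, set_module_type, get_symmetric_quantization_config) are exactly the ones the diffs use:

import torch
from executorch.backends.arm.quantizer import get_symmetric_quantization_config

def quantize_only(pipeline, *module_types):
    """Quantize only the given module types; all other layers stay in FP."""
    # With no global config, layers are untouched unless explicitly opted in.
    pipeline.quantizer.set_global(None)
    cfg = get_symmetric_quantization_config()
    for module_type in module_types:
        pipeline.quantizer.set_module_type(module_type, cfg)

# e.g. quantize_only(pipeline, torch.nn.Linear) matches the Llama test below;
# quantize_only(pipeline, torch.nn.Conv2d, torch.nn.ReLU6) matches the MV2 test.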

File tree (2 files changed: +50, −0 lines)

backends/arm/test/models/test_llama.py
backends/arm/test/models/test_mobilenet_v2_arm.py

backends/arm/test/models/test_llama.py

Lines changed: 27 additions & 0 deletions
@@ -16,6 +16,7 @@
 import pytest
 import torch
 from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config

 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -99,6 +100,14 @@ def prepare_model(self):
         return llama_model, llama_inputs, llama_meta


+def _use_partial_quantizer(pipeline):
+    """Set the pipeline's quantizer to only include Linear layers"""
+    pipeline.quantizer.set_global(None)
+    pipeline.quantizer.set_module_type(
+        torch.nn.Linear, get_symmetric_quantization_config()
+    )
+
+
 def test_llama_tosa_FP():
     llama_model, llama_inputs, llama_meta = TestLlama().prepare_model()

@@ -179,3 +188,21 @@ def test_llama_vgf_quant():
         quantize=True,
     )
     pipeline.run()
+
+
+def test_llama_partial_quant_tosa_INT_FP():
+    llama_model, llama_inputs, llama_meta = TestLlama().prepare_model()
+
+    if llama_model is None or llama_inputs is None:
+        pytest.skip("Missing model and/or input files")
+
+    with torch.no_grad():
+        pipeline = TosaPipelineINT[input_t](
+            llama_model,
+            llama_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_extensions=["FP"],
+        )
+        _use_partial_quantizer(pipeline)
+        pipeline.run()

backends/arm/test/models/test_mobilenet_v2_arm.py

Lines changed: 23 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest

 import torch
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineINT,
@@ -39,6 +40,14 @@
 }


+def _use_partial_quantizer(pipeline):
+    """Set the pipeline's quantizer to only include Conv2d and ReLU6"""
+    quant_cfg = get_symmetric_quantization_config()
+    pipeline.quantizer.set_global(None)
+    pipeline.quantizer.set_module_type(torch.nn.Conv2d, quant_cfg)
+    pipeline.quantizer.set_module_type(torch.nn.ReLU6, quant_cfg)
+
+
 def test_mv2_tosa_FP():
     pipeline = TosaPipelineFP[input_t](
         mv2, model_inputs, aten_op=[], exir_op=[], use_to_edge_transform_and_lower=True
@@ -140,3 +149,17 @@ def test_mv2_vgf_no_quant():
         quantize=False,
     )
     pipeline.run()
+
+
+def test_mv2_partial_quant_tosa_INT_FP():
+    pipeline = TosaPipelineINT[input_t](
+        mv2,
+        model_inputs,
+        aten_op=[],
+        exir_op=[],
+        tosa_extensions=["FP"],
+        use_to_edge_transform_and_lower=True,
+        atol=0.20,
+    )
+    _use_partial_quantizer(pipeline)
+    pipeline.run()
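
To exercise just the new tests, pytest's -k keyword filter matches both via the partial_quant substring in their names; a small sketch using pytest's Python entry point (the selection expression is an assumption based on the test names above, not something the commit ships):

import pytest

# Run only the partial-quantization tests added in this commit.
pytest.main([
    "backends/arm/test/models/test_llama.py",
    "backends/arm/test/models/test_mobilenet_v2_arm.py",
    "-k", "partial_quant",
])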
