From 9d3a78c6a907f7bd9921903c1580e472c03478de Mon Sep 17 00:00:00 2001
From: Gunes Bayir <gunes.bayir@arm.com>
Date: Mon, 8 Dec 2025 11:43:50 +0000
Subject: [PATCH] test: Adjust the tolerance in bf16 CpuGemmAssemblyDispatch
 tests

This patch adjusts the tolerances in some of the CpuGemmAssemblyDispatch tests. The new values have been validated empirically with ~1K iterations of the RunLarge test suite.

Resolves: COMPMID-8465
Change-Id: I64db7d6d8e767410cc99cf4e24b6f3ac5333710c
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
---
 .../low_level/CpuGemmAssemblyDispatch.cpp            | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
index 031f7abd8b..f0123fd975 100644
--- a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
+++ b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
@@ -57,9 +57,17 @@ constexpr float tolerance_num = 0.07f; /**< Tolerance number for FP16 data types
 #endif                                 /* ARM_COMPUTE_ENABLE_FP16 */
 #ifdef ARM_COMPUTE_ENABLE_BF16
 const AbsoluteTolerance<float> abs_tolerance_bf16(
-    0.02f); /**< Absolute tolerance value for comparing reference's output against implementation's output for BF16 data types */
+    0.02f); /**< Absolute tolerance value for comparing reference's output against implementation's output for BF16 data types.
+                We use a large absolute error tolerance for bf16 because, even though the reference implementation
+                computes with bf16 precision, the actual implementation might still choose an fp32 implementation for
+                performance reasons. This can happen particularly for small shapes, where converting the fp32 input to
+                bf16 isn't worth it. We don't apply this large absolute tolerance to tests with actual bf16 inputs
+                because the reference implementation also does the calculation in bf16 arithmetic. Therefore, we do
+                not expect large differences between reference and optimized runs.
+            */
 const RelativeTolerance<float> rel_tolerance_bf16(
     0.02f); /**< Relative tolerance value for comparing reference's output against implementation's output for BF16 data types */
+constexpr float tolerance_num_bf16 = 1e-5f;
 #endif /* ARM_COMPUTE_ENABLE_BF16 */
 /** CNN data types */
 const auto CNNDataTypes = make("DataType",
@@ -552,7 +560,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     if(CPUInfo::get().has_bf16())
     {
     // Validate output
-        validate(Accessor(_target), _reference, rel_tolerance_bf16);
+        validate(Accessor(_target), _reference, rel_tolerance_bf16, tolerance_num_bf16);
     }
     else
     {