From ef180f7b59f89805349aa125cd75475eb36db7db Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 01:49:21 -0800
Subject: [PATCH 01/30] fix(simd): unmasked AVX2 load

---
 include/svs/core/distance/simd_utils.h | 38 ++++++++++++--------------
 1 file changed, 17 insertions(+), 21 deletions(-)
diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index f883abcaf..f7467c636 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -19,6 +19,7 @@
 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -332,11 +333,10 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // AVX2 doesn't have native masked load, so we load and then blend
-        auto data = _mm256_loadu_ps(ptr);
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Full width load with blending may case out-of-bounds read (SEGV)
+        // Therefore we use _mm256_maskload_ps which safely handles masked loads
+        auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
+        return _mm256_maskload_ps(ptr, mask_vec);
     }
 
     // from float16
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const Float16* ptr) {
-        auto data = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        __m128i buffer = _mm_setzero_si128();
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(Float16));
+        return _mm256_cvtph_ps(buffer);
     }
 
     // from uint8
@@ -359,12 +359,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const uint8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(uint8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // from int8
@@ -375,12 +373,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const int8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(int8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // We do not need to treat the left or right-hand differently.

From c1705f5062865303bfc3920ee46b55503a33e06c Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 06:45:17 -0800
Subject: [PATCH 02/30] remove L2Impl specific test

---
 tests/svs/core/distance.cpp | 58 +++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 3b65c2b91..91d363a06 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -20,6 +20,9 @@
 // catch 2
 #include "catch2/catch_test_macros.hpp"
 
+#include <numeric>
+#include <vector>
+
 namespace {
 
 std::string_view test_table = R"(
@@ -94,3 +97,58 @@ CATCH_TEST_CASE("Distance Utils", "[core][distance][distance_type]") {
         }
     }
 }
+
+CATCH_TEST_CASE("Distance asan L2", "[distance][simd][asan][l2]") {
+    // Try various sizes to hit the case where vector capacity == size
+    // and the SIMD load reads past the end into the redzone.
+    // We test sizes that are not multiples of 8 (AVX2 width) or 16 (AVX512 width).
+    for (size_t size = 1; size < 128; ++size) {
+        std::vector<float> a(size);
+        std::vector<float> b(size);
+
+        std::iota(a.begin(), a.end(), 0.0f);
+        std::iota(b.begin(), b.end(), 1.0f);
+
+        // Ensure no spare capacity
+        a.shrink_to_fit();
+        b.shrink_to_fit();
+
+        auto dist = svs::distance::L2::compute(a.data(), b.data(), size);
+        CATCH_REQUIRE(dist >= 0);
+    }
+}
+
+CATCH_TEST_CASE("Distance asan Cosine", "[distance][simd][asan][cosine]") {
+    for (size_t size = 1; size < 128; ++size) {
+        std::vector<float> a(size);
+        std::vector<float> b(size);
+
+        std::iota(a.begin(), a.end(), 0.0f);
+        std::iota(b.begin(), b.end(), 1.0f);
+
+        // Ensure no spare capacity
+        a.shrink_to_fit();
+        b.shrink_to_fit();
+
+        auto dist =
+            svs::distance::CosineSimilarity::compute(a.data(), b.data(), 1.0f, size);
+        CATCH_REQUIRE(dist >= 0);
+    }
+}
+
+CATCH_TEST_CASE("Distance asan IP", "[distance][simd][asan][ip]") {
+    for (size_t size = 1; size < 128; ++size) {
+        std::vector<float> a(size);
+        std::vector<float> b(size);
+
+        std::iota(a.begin(), a.end(), 0.0f);
+        std::iota(b.begin(), b.end(), 1.0f);
+
+        // Ensure no spare capacity
+        a.shrink_to_fit();
+        b.shrink_to_fit();
+
+        auto dist = svs::distance::IP::compute(a.data(), b.data(), size);
+        CATCH_REQUIRE(dist >= 0);
+    }
+}

From 05dce8faa151b79eb0d4cf03bc9a2a7128167d94 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 07:04:19 -0800
Subject: [PATCH 03/30] add asan yml

---
 .github/workflows/asan.yml | 54 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 .github/workflows/asan.yml

diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
new file mode 100644
index 000000000..504c8bcbe
--- /dev/null
+++ b/.github/workflows/asan.yml
@@ -0,0 +1,54 @@
+name: Linux ASan Test
+run-name: ${{ github.event.inputs.run_name || github.event.pull_request.title }}
+
+on:
+  workflow_dispatch:
+  pull_request:
+
+concurrency:
+  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  asan-test:
+    name: Clang ASan
+    runs-on: [self-hosted, Linux, ubuntu-22.04]
+    env:
+      CXX: clang++-15
+      CC: clang-15
+
+    steps:
+      - name: "Cleanup build folder"
+        run: |
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Configure build
+        working-directory: ${{ runner.temp }}
+        env:
+          TEMP_WORKSPACE: ${{ runner.temp }}
+        run: |
+          cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
+                -DCMAKE_BUILD_TYPE=Debug \
+                -DCMAKE_CXX_FLAGS="-fsanitize=address -fno-omit-frame-pointer -g" \
+                -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address" \
+                -DSVS_BUILD_TESTS=YES \
+                -DSVS_BUILD_BINARIES=NO \
+                -DSVS_BUILD_EXAMPLES=NO
+
+      - name: Build tests
+        working-directory: ${{ runner.temp }}/build
+        run: make tests -j10
+
+      - name: Run tests
+        env:
+          CTEST_OUTPUT_ON_FAILURE: 1
+          ASAN_OPTIONS: detect_leaks=0
+        working-directory: ${{ runner.temp }}/build/tests
+        run: ./tests "[distance][simd][asan]"

From ecf5b2861052d604cbf43560731489d40e11572b Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 07:07:50 -0800
Subject: [PATCH 04/30] fix naming and clang version

---
 .github/workflows/asan.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
index 504c8bcbe..d2b59cfe5 100644
--- a/.github/workflows/asan.yml
+++ b/.github/workflows/asan.yml
@@ -1,4 +1,4 @@
-name: Linux ASan Test
+name: Linux Build and Test
 run-name: ${{ github.event.inputs.run_name || github.event.pull_request.title }}
 
 on:
@@ -11,11 +11,11 @@ concurrency:
 
 jobs:
   asan-test:
-    name: Clang ASan
+    name: clang-18 fsanitize=address
     runs-on: [self-hosted, Linux, ubuntu-22.04]
     env:
-      CXX: clang++-15
-      CC: clang-15
+      CXX: clang++-18
+      CC: clang-18
 
     steps:
       - name: "Cleanup build folder"
@@ -44,11 +44,11 @@ jobs:
 
       - name: Build tests
         working-directory: ${{ runner.temp }}/build
-        run: make tests -j10
+        run: make tests -j
 
       - name: Run tests
         env:
           CTEST_OUTPUT_ON_FAILURE: 1
           ASAN_OPTIONS: detect_leaks=0
         working-directory: ${{ runner.temp }}/build/tests
-        run: ./tests "[distance][simd][asan]"
+        run: ./tests

From e5d46aabd4fef468903456674f18109a1ea86e70 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 07:08:45 -0800
Subject: [PATCH 05/30] copyright

---
 .github/workflows/asan.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
index d2b59cfe5..63200cd66 100644
--- a/.github/workflows/asan.yml
+++ b/.github/workflows/asan.yml
@@ -1,3 +1,16 @@
+# Copyright (C) 2025 Intel Corporation
+#
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+#
+# This software and the related documents are provided as is, with no
+# express or implied warranties, other than those that are expressly stated
+# in the License.
+
 name: Linux Build and Test
 run-name: ${{ github.event.inputs.run_name || github.event.pull_request.title }}
 

From f271dc97ed4e23b5c059014b90c27da20bbf017e Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 07:10:37 -0800
Subject: [PATCH 06/30] typo

---
 include/svs/core/distance/simd_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index f7467c636..bd9834da5 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -333,7 +333,7 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // Full width load with blending may case out-of-bounds read (SEGV)
+        // Full width load with blending may cause out-of-bounds read (SEGV)
         // Therefore we use _mm256_maskload_ps which safely handles masked loads
         auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
         return _mm256_maskload_ps(ptr, mask_vec);

From 2f8dd96f232d0093188c8de61565006271f472be Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Tue, 9 Dec 2025 10:10:55 -0800
Subject: [PATCH 07/30] comments on workflow file

---
 .github/workflows/asan.yml | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
index 63200cd66..b4d1c7ffa 100644
--- a/.github/workflows/asan.yml
+++ b/.github/workflows/asan.yml
@@ -1,17 +1,18 @@
-# Copyright (C) 2025 Intel Corporation
+# Copyright 2025 Intel Corporation
 #
-# This software and the related documents are Intel copyrighted materials,
-# and your use of them is governed by the express license under which they
-# were provided to you ("License"). Unless the License provides otherwise,
-# you may not use, modify, copy, publish, distribute, disclose or transmit
-# this software or the related documents without Intel's prior written
-# permission.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This software and the related documents are provided as is, with no
-# express or implied warranties, other than those that are expressly stated
-# in the License.
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-name: Linux Build and Test
+name: ASan Build and Test
 run-name: ${{ github.event.inputs.run_name || github.event.pull_request.title }}
 
 on:
@@ -24,7 +25,7 @@ concurrency:
 
 jobs:
   asan-test:
-    name: clang-18 fsanitize=address
+    name: clang++-18 -fsanitize=address
     runs-on: [self-hosted, Linux, ubuntu-22.04]
     env:
       CXX: clang++-18

From 4046e99831b9e21dad751b5ca0c6d2ed82c7a326 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 00:24:59 -0800
Subject: [PATCH 08/30] chore(ci): streamline asan & linux workflows

---
 .github/workflows/asan.yml        | 68 -----------------------
 .github/workflows/build-linux.yml | 92 ++++++++++++++++++-------------
 2 files changed, 54 insertions(+), 106 deletions(-)
 delete mode 100644 .github/workflows/asan.yml

diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
deleted file mode 100644
index b4d1c7ffa..000000000
--- a/.github/workflows/asan.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: ASan Build and Test
-run-name: ${{ github.event.inputs.run_name || github.event.pull_request.title }}
-
-on:
-  workflow_dispatch:
-  pull_request:
-
-concurrency:
-  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
-  cancel-in-progress: true
-
-jobs:
-  asan-test:
-    name: clang++-18 -fsanitize=address
-    runs-on: [self-hosted, Linux, ubuntu-22.04]
-    env:
-      CXX: clang++-18
-      CC: clang-18
-
-    steps:
-      - name: "Cleanup build folder"
-        run: |
-          ls -la ./
-          sudo rm -rf ./* || true
-          sudo rm -rf ./.??* || true
-          ls -la ./
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-
-      - name: Configure build
-        working-directory: ${{ runner.temp }}
-        env:
-          TEMP_WORKSPACE: ${{ runner.temp }}
-        run: |
-          cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
-                -DCMAKE_BUILD_TYPE=Debug \
-                -DCMAKE_CXX_FLAGS="-fsanitize=address -fno-omit-frame-pointer -g" \
-                -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address" \
-                -DSVS_BUILD_TESTS=YES \
-                -DSVS_BUILD_BINARIES=NO \
-                -DSVS_BUILD_EXAMPLES=NO
-
-      - name: Build tests
-        working-directory: ${{ runner.temp }}/build
-        run: make tests -j
-
-      - name: Run tests
-        env:
-          CTEST_OUTPUT_ON_FAILURE: 1
-          ASAN_OPTIONS: detect_leaks=0
-        working-directory: ${{ runner.temp }}/build/tests
-        run: ./tests
diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index af4adf4d6..618cd9d18 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -25,18 +25,20 @@ permissions:
 
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
+  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
   cancel-in-progress: true
 
 jobs:
   build:
-    name: ${{ matrix.cxx }}, ${{ matrix.build_type }}, ivf=${{ matrix.ivf }}
+    name: ${{ matrix.cxx }}, ${{ matrix.build_type }}, ivf=${{ matrix.ivf }}, asan=${{ matrix.asan }}
     runs-on: ubuntu-22.04
     strategy:
       matrix:
         build_type: [RelWithDebugInfo]
         ivf: [OFF, ON]
         cxx: [g++-11, g++-12, clang++-15]
+        asan: [OFF]
+        cmake_extra_args: ["-DSVS_BUILD_BINARIES=YES -DSVS_BUILD_EXAMPLES=YES"]
         include:
           - cxx: g++-11
             cc: gcc-11
@@ -44,50 +46,64 @@ jobs:
             cc: gcc-12
           - cxx: clang++-15
             cc: clang-15
+          - cxx: clang++-18
+            cc: clang-18
+            build_type: Debug
+            ivf: OFF
+            asan: ON
+            cmake_extra_args: "-DCMAKE_CXX_FLAGS='-fsanitize=address -fno-omit-frame-pointer -g' -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address' -DSVS_BUILD_BINARIES=NO -DSVS_BUILD_EXAMPLES=NO"
         exclude:
           - cxx: g++-12
             ivf: ON
 
     steps:
-    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-    - name: Install MKL
-      timeout-minutes: 5
-      run: |
-       .github/scripts/setup_apt_repo_linux.sh
-       sudo apt install intel-oneapi-mkl intel-oneapi-mkl-devel
-       # Setup environment variables for building against MKL.
-       # Persist the environment variables for use across multiple subsequent actions.
-       source /opt/intel/oneapi/setvars.sh
-       printenv >> $GITHUB_ENV
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      - name: Install MKL
+        timeout-minutes: 5
+        run: |
+          .github/scripts/setup_apt_repo_linux.sh
+          sudo apt install intel-oneapi-mkl intel-oneapi-mkl-devel
+          # Setup environment variables for building against MKL.
+          # Persist the environment variables for use across multiple subsequent actions.
+          source /opt/intel/oneapi/setvars.sh
+          printenv >> $GITHUB_ENV
 
-    - name: Configure build
-      working-directory: ${{ runner.temp }}
-      env:
-        CXX: ${{ matrix.cxx }}
-        CC: ${{ matrix.cc }}
-        TEMP_WORKSPACE: ${{ runner.temp }}
-      run: |
-        cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DSVS_BUILD_BINARIES=YES \
-              -DSVS_BUILD_TESTS=YES \
-              -DSVS_BUILD_EXAMPLES=YES \
-              -DSVS_EXPERIMENTAL_LEANVEC=YES \
-              -DSVS_NO_AVX512=NO \
-              -DSVS_EXPERIMENTAL_ENABLE_IVF=${{ matrix.ivf }}
+      - name: Install Clang 18
+        if: matrix.cxx == 'clang++-18'
+        run: |
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 18
 
-    - name: Build Tests and Utilities
-      working-directory: ${{ runner.temp }}/build
-      run: make -j$(nproc)
+      - name: Configure build
+        working-directory: ${{ runner.temp }}
+        env:
+          CXX: ${{ matrix.cxx }}
+          CC: ${{ matrix.cc }}
+          TEMP_WORKSPACE: ${{ runner.temp }}
+        run: |
+          cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+                -DSVS_BUILD_TESTS=YES \
+                -DSVS_EXPERIMENTAL_LEANVEC=YES \
+                -DSVS_NO_AVX512=NO \
+                -DSVS_EXPERIMENTAL_ENABLE_IVF=${{ matrix.ivf }} \
+                ${{ matrix.cmake_extra_args }}
 
-    - name: Run tests
-      env:
+      - name: Build Tests and Utilities
+        working-directory: ${{ runner.temp }}/build
+        run: make -j$(nproc)
+
+      - name: Run tests
+        env:
           CTEST_OUTPUT_ON_FAILURE: 1
-      working-directory: ${{ runner.temp }}/build/tests
-      run: ctest -C ${{ matrix.build_type }}
+          ASAN_OPTIONS: detect_leaks=0
+        working-directory: ${{ runner.temp }}/build/tests
+        run: ctest -C ${{ matrix.build_type }}
 
-    - name: Run Cpp Examples
-      env:
+      - name: Run Cpp Examples
+        if: matrix.asan != 'ON'
+        env:
           CTEST_OUTPUT_ON_FAILURE: 1
-      working-directory: ${{ runner.temp }}/build/examples/cpp
-      run: ctest -C RelWithDebugInfo
+        working-directory: ${{ runner.temp }}/build/examples/cpp
+        run: ctest -C ${{ matrix.build_type }}

From 9bc53132e8174c76f1c30777788291e5fdf02dad Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 00:28:21 -0800
Subject: [PATCH 09/30] fixup: undo format

---
 .github/workflows/build-linux.yml | 91 ++++++++++++++++---------------
 1 file changed, 46 insertions(+), 45 deletions(-)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index 618cd9d18..ffd450a2f 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -57,53 +57,54 @@ jobs:
             ivf: ON
 
     steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-      - name: Install MKL
-        timeout-minutes: 5
-        run: |
-          .github/scripts/setup_apt_repo_linux.sh
-          sudo apt install intel-oneapi-mkl intel-oneapi-mkl-devel
-          # Setup environment variables for building against MKL.
-          # Persist the environment variables for use across multiple subsequent actions.
-          source /opt/intel/oneapi/setvars.sh
-          printenv >> $GITHUB_ENV
+    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+    - name: Install MKL
+      timeout-minutes: 5
+      run: |
+       .github/scripts/setup_apt_repo_linux.sh
+       sudo apt install intel-oneapi-mkl intel-oneapi-mkl-devel
+       # Setup environment variables for building against MKL.
+       # Persist the environment variables for use across multiple subsequent actions.
+       source /opt/intel/oneapi/setvars.sh
+       printenv >> $GITHUB_ENV
 
-      - name: Install Clang 18
-        if: matrix.cxx == 'clang++-18'
-        run: |
-          wget https://apt.llvm.org/llvm.sh
-          chmod +x llvm.sh
-          sudo ./llvm.sh 18
+    - name: Install Clang 18
+      if: matrix.cxx == 'clang++-18'
+      run: |
+        wget https://apt.llvm.org/llvm.sh
+        chmod +x llvm.sh
+        sudo ./llvm.sh 18
 
-      - name: Configure build
-        working-directory: ${{ runner.temp }}
-        env:
-          CXX: ${{ matrix.cxx }}
-          CC: ${{ matrix.cc }}
-          TEMP_WORKSPACE: ${{ runner.temp }}
-        run: |
-          cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
-                -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-                -DSVS_BUILD_TESTS=YES \
-                -DSVS_EXPERIMENTAL_LEANVEC=YES \
-                -DSVS_NO_AVX512=NO \
-                -DSVS_EXPERIMENTAL_ENABLE_IVF=${{ matrix.ivf }} \
-                ${{ matrix.cmake_extra_args }}
 
-      - name: Build Tests and Utilities
-        working-directory: ${{ runner.temp }}/build
-        run: make -j$(nproc)
+    - name: Configure build
+      working-directory: ${{ runner.temp }}
+      env:
+        CXX: ${{ matrix.cxx }}
+        CC: ${{ matrix.cc }}
+        TEMP_WORKSPACE: ${{ runner.temp }}
+      run: |
+        cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DSVS_BUILD_TESTS=YES \
+              -DSVS_EXPERIMENTAL_LEANVEC=YES \
+              -DSVS_NO_AVX512=NO \
+              -DSVS_EXPERIMENTAL_ENABLE_IVF=${{ matrix.ivf }} \
+              ${{ matrix.cmake_extra_args }}
 
-      - name: Run tests
-        env:
-          CTEST_OUTPUT_ON_FAILURE: 1
-          ASAN_OPTIONS: detect_leaks=0
-        working-directory: ${{ runner.temp }}/build/tests
-        run: ctest -C ${{ matrix.build_type }}
+    - name: Build Tests and Utilities
+      working-directory: ${{ runner.temp }}/build
+      run: make -j$(nproc)
 
-      - name: Run Cpp Examples
-        if: matrix.asan != 'ON'
-        env:
-          CTEST_OUTPUT_ON_FAILURE: 1
-        working-directory: ${{ runner.temp }}/build/examples/cpp
-        run: ctest -C ${{ matrix.build_type }}
+    - name: Run tests
+      env:
+        CTEST_OUTPUT_ON_FAILURE: 1
+        ASAN_OPTIONS: detect_leaks=0
+      working-directory: ${{ runner.temp }}/build/tests
+      run: ctest -C ${{ matrix.build_type }}
+
+    - name: Run Cpp Examples
+      if: matrix.asan != 'ON'
+      env:
+        CTEST_OUTPUT_ON_FAILURE: 1
+      working-directory: ${{ runner.temp }}/build/examples/cpp
+      run: ctest -C ${{ matrix.build_type }}

From 2a43709022532660427b68c96fc02ff410a6713f Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 01:02:22 -0800
Subject: [PATCH 10/30] refactor: use CATCH_TEMPLATE_TEST_CASE

---
 tests/svs/core/distance.cpp | 52 ++++++++++---------------------------
 1 file changed, 13 insertions(+), 39 deletions(-)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 91d363a06..d32ee3f3e 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -18,6 +18,7 @@
 #include "svs/core/distance.h"
 
 // catch 2
+#include "catch2/catch_template_test_macros.hpp"
 #include "catch2/catch_test_macros.hpp"
 
 #include <numeric>
@@ -98,7 +99,15 @@ CATCH_TEST_CASE("Distance Utils", "[core][distance][distance_type]") {
     }
 }
 
-CATCH_TEST_CASE("Distance asan L2", "[distance][simd][asan][l2]") {
+CATCH_TEMPLATE_TEST_CASE(
+    "Distance ASan",
+    "[distance][simd][asan]",
+    svs::DistanceL2,
+    svs::DistanceIP,
+    svs::DistanceCosineSimilarity
+) {
+    using Distance = TestType;
+
     // Try various sizes to hit the case where vector capacity == size
     // and the SIMD load reads past the end into the redzone.
     // We test sizes that are not multiples of 8 (AVX2 width) or 16 (AVX512 width).
@@ -106,49 +115,14 @@ CATCH_TEST_CASE("Distance asan L2", "[distance][simd][asan][l2]") {
         std::vector<float> a(size);
         std::vector<float> b(size);
 
-        std::iota(a.begin(), a.end(), 0.0f);
-        std::iota(b.begin(), b.end(), 1.0f);
-
-        // Ensure no spare capacity
-        a.shrink_to_fit();
-        b.shrink_to_fit();
-
-        auto dist = svs::distance::L2::compute(a.data(), b.data(), size);
-        CATCH_REQUIRE(dist >= 0);
-    }
-}
-
-CATCH_TEST_CASE("Distance asan Cosine", "[distance][simd][asan][cosine]") {
-    for (size_t size = 1; size < 128; ++size) {
-        std::vector<float> a(size);
-        std::vector<float> b(size);
-
-        std::iota(a.begin(), a.end(), 0.0f);
-        std::iota(b.begin(), b.end(), 1.0f);
-
-        // Ensure no spare capacity
-        a.shrink_to_fit();
-        b.shrink_to_fit();
-
-        auto dist =
-            svs::distance::CosineSimilarity::compute(a.data(), b.data(), 1.0f, size);
-        CATCH_REQUIRE(dist >= 0);
-    }
-}
-
-CATCH_TEST_CASE("Distance asan IP", "[distance][simd][asan][ip]") {
-    for (size_t size = 1; size < 128; ++size) {
-        std::vector<float> a(size);
-        std::vector<float> b(size);
-
-        std::iota(a.begin(), a.end(), 0.0f);
-        std::iota(b.begin(), b.end(), 1.0f);
+        std::iota(a.begin(), a.end(), 1.0f);
+        std::iota(b.begin(), b.end(), 2.0f);
 
         // Ensure no spare capacity
         a.shrink_to_fit();
         b.shrink_to_fit();
 
-        auto dist = svs::distance::IP::compute(a.data(), b.data(), size);
+        auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
         CATCH_REQUIRE(dist >= 0);
     }
 }

From 1e74a554cd2c157013025200e383d9b581ad103b Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 01:04:04 -0800
Subject: [PATCH 11/30] fix(ci): remove unused cmake option

---
 .github/workflows/build-linux.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index ffd450a2f..ca2ffea49 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -86,7 +86,6 @@ jobs:
         cmake -B${TEMP_WORKSPACE}/build -S${GITHUB_WORKSPACE} \
               -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
               -DSVS_BUILD_TESTS=YES \
-              -DSVS_EXPERIMENTAL_LEANVEC=YES \
               -DSVS_NO_AVX512=NO \
               -DSVS_EXPERIMENTAL_ENABLE_IVF=${{ matrix.ivf }} \
               ${{ matrix.cmake_extra_args }}

From dbe4ae8c447ad7155b93e183e9bae777e0f8fd82 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 01:47:14 -0800
Subject: [PATCH 12/30] fix(ci): skip integration tests in debug asan build

---
 .github/workflows/build-linux.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index ca2ffea49..28a7b6226 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -39,6 +39,7 @@ jobs:
         cxx: [g++-11, g++-12, clang++-15]
         asan: [OFF]
         cmake_extra_args: ["-DSVS_BUILD_BINARIES=YES -DSVS_BUILD_EXAMPLES=YES"]
+        ctest_args: [""]
         include:
           - cxx: g++-11
             cc: gcc-11
@@ -51,7 +52,14 @@ jobs:
             build_type: Debug
             ivf: OFF
             asan: ON
-            cmake_extra_args: "-DCMAKE_CXX_FLAGS='-fsanitize=address -fno-omit-frame-pointer -g' -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address' -DSVS_BUILD_BINARIES=NO -DSVS_BUILD_EXAMPLES=NO"
+            # address sanitizer flags
+            cmake_extra_args: >-
+              -DCMAKE_CXX_FLAGS='-fsanitize=address -fno-omit-frame-pointer -g'
+              -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address'
+              -DSVS_BUILD_BINARIES=NO
+              -DSVS_BUILD_EXAMPLES=NO
+            # skip longer-running integration tests
+            ctest_args: "-LE integration"
         exclude:
           - cxx: g++-12
             ivf: ON
@@ -75,7 +83,6 @@ jobs:
         chmod +x llvm.sh
         sudo ./llvm.sh 18
 
-
     - name: Configure build
       working-directory: ${{ runner.temp }}
       env:
@@ -99,7 +106,7 @@ jobs:
         CTEST_OUTPUT_ON_FAILURE: 1
         ASAN_OPTIONS: detect_leaks=0
       working-directory: ${{ runner.temp }}/build/tests
-      run: ctest -C ${{ matrix.build_type }}
+      run: ctest -C ${{ matrix.build_type }} ${{ matrix.ctest_args }}
 
     - name: Run Cpp Examples
       if: matrix.asan != 'ON'

From 2b00b78abd8e275509e720183a905dca32c9fed5 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 01:50:21 -0800
Subject: [PATCH 13/30] feat(ci): add new tag 'long' that's skipped for asan

---
 .github/workflows/build-linux.yml | 4 ++--
 tests/svs/index/vamana/index.cpp  | 2 +-
 tests/svs/index/vamana/multi.cpp  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index 28a7b6226..04213c675 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -58,8 +58,8 @@ jobs:
               -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address'
               -DSVS_BUILD_BINARIES=NO
               -DSVS_BUILD_EXAMPLES=NO
-            # skip longer-running integration tests
-            ctest_args: "-LE integration"
+            # skip longer-running tests
+            ctest_args: "-LE long"
         exclude:
           - cxx: g++-12
             ivf: ON
diff --git a/tests/svs/index/vamana/index.cpp b/tests/svs/index/vamana/index.cpp
index 464b12349..b94b902b8 100644
--- a/tests/svs/index/vamana/index.cpp
+++ b/tests/svs/index/vamana/index.cpp
@@ -181,7 +181,7 @@ CATCH_TEST_CASE("Static VamanaIndex Per-Index Logging", "[logging]") {
     CATCH_REQUIRE(captured_logs[2].find("Batch Size:") != std::string::npos);
 }
 
-CATCH_TEST_CASE("Vamana Index Default Parameters", "[parameter][vamana]") {
+CATCH_TEST_CASE("Vamana Index Default Parameters", "[long][parameter][vamana]") {
     using Catch::Approx;
     std::filesystem::path data_path = test_dataset::data_svs_file();
 
diff --git a/tests/svs/index/vamana/multi.cpp b/tests/svs/index/vamana/multi.cpp
index af52864f6..63d450b3e 100644
--- a/tests/svs/index/vamana/multi.cpp
+++ b/tests/svs/index/vamana/multi.cpp
@@ -48,7 +48,7 @@ template <typename Distance> float pick_alpha(Distance SVS_UNUSED(dist)) {
 
 CATCH_TEMPLATE_TEST_CASE(
     "Multi-vector dynamic vamana index",
-    "[index][vamana][multi]",
+    "[long][index][vamana][multi]",
     svs::DistanceL2,
     svs::DistanceIP,
     svs::DistanceCosineSimilarity

From 6cf9ecd5a87cabc6d49346717850396485fe9394 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 05:53:25 -0800
Subject: [PATCH 14/30] update catch2 and use ADD_TAGS_AS_LABELS

---
 tests/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ad82db1c3..63c55a934 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -37,7 +37,7 @@ set(CMAKE_CXX_STANDARD ${SVS_CXX_STANDARD})
 FetchContent_Declare(
     Catch2
     GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-    GIT_TAG v3.4.0
+    GIT_TAG v3.11.0
 )
 
 FetchContent_MakeAvailable(Catch2)
@@ -230,5 +230,4 @@ target_include_directories(tests PRIVATE ${PROJECT_SOURCE_DIR})
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
 include(CTest)
 include(Catch)
-catch_discover_tests(tests)
-
+catch_discover_tests(tests ADD_TAGS_AS_LABELS SKIP_IS_FAILURE)

From d94e2a79bcd3b46f2abff370c90bcebdc869af04 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 05:57:16 -0800
Subject: [PATCH 15/30] add more [long] labels

---
 tests/svs/index/inverted/clustering.cpp   | 2 +-
 tests/svs/index/inverted/memory_based.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/svs/index/inverted/clustering.cpp b/tests/svs/index/inverted/clustering.cpp
index 29844f611..6ac896256 100644
--- a/tests/svs/index/inverted/clustering.cpp
+++ b/tests/svs/index/inverted/clustering.cpp
@@ -385,7 +385,7 @@ void test_end_to_end_clustering(
 
 } // namespace
 
-CATCH_TEST_CASE("Random Clustering - End to End", "[inverted][random_clustering]") {
+CATCH_TEST_CASE("Random Clustering - End to End", "[long][inverted][random_clustering]") {
     CATCH_SECTION("Uncompressed Data") {
         auto data = svs::data::SimpleData<float>::load(test_dataset::data_svs_file());
         test_end_to_end_clustering(data, svs::DistanceL2(), 1.2f);
diff --git a/tests/svs/index/inverted/memory_based.cpp b/tests/svs/index/inverted/memory_based.cpp
index 604791485..ad7d01b46 100644
--- a/tests/svs/index/inverted/memory_based.cpp
+++ b/tests/svs/index/inverted/memory_based.cpp
@@ -23,7 +23,7 @@
 #include "tests/utils/test_dataset.h"
 #include <filesystem>
 
-CATCH_TEST_CASE("InvertedIndex Logging Test", "[logging]") {
+CATCH_TEST_CASE("InvertedIndex Logging Test", "[long][logging]") {
     // Vector to store captured log messages
     std::vector<std::string> captured_logs;
     std::vector<std::string> global_captured_logs;

From 3ba1fd84180171f1d67a76b5d38f2576ae710276 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 06:00:32 -0800
Subject: [PATCH 16/30] revert simd_utils.h to trip new asan check in CI

---
 include/svs/core/distance/simd_utils.h | 38 ++++++++++++++------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index bd9834da5..f883abcaf 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -19,7 +19,6 @@
 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
-#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -333,10 +332,11 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // Full width load with blending may cause out-of-bounds read (SEGV)
-        // Therefore we use _mm256_maskload_ps which safely handles masked loads
-        auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
-        return _mm256_maskload_ps(ptr, mask_vec);
+        // AVX2 doesn't have native masked load, so we load and then blend
+        auto data = _mm256_loadu_ps(ptr);
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from float16
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const Float16* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        __m128i buffer = _mm_setzero_si128();
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(Float16));
-        return _mm256_cvtph_ps(buffer);
+        auto data = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from uint8
@@ -359,10 +359,12 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const uint8_t* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        int64_t buffer = 0;
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(uint8_t));
-        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(buffer)));
+        auto data = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
+        ));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from int8
@@ -373,10 +375,12 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const int8_t* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        int64_t buffer = 0;
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(int8_t));
-        return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(buffer)));
+        auto data = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(
+            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
+        ));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // We do not need to treat the left or right-hand differently.

From 528ff192e614b80779e283e99329984968b41333 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 07:00:15 -0800
Subject: [PATCH 17/30] add AVX2 L2 calculation back to trigger asan

---
 tests/svs/core/distance.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index d32ee3f3e..877294b47 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -124,5 +124,10 @@ CATCH_TEMPLATE_TEST_CASE(
 
         auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
         CATCH_REQUIRE(dist >= 0);
+
+        dist = svs::distance::
+            L2Impl<svs::Dynamic, float, float, svs::distance::AVX_AVAILABILITY::AVX2>::
+                compute(a.data(), b.data(), svs::lib::MaybeStatic(size));
+        CATCH_REQUIRE(dist >= 0);
     }
 }

From 99dbac307e599dd1bd182296313de99501feb185 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 07:01:00 -0800
Subject: [PATCH 18/30] Revert "add AVX2 L2 calculation back to trigger asan"

This reverts commit 528ff192e614b80779e283e99329984968b41333.
---
 tests/svs/core/distance.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 877294b47..d32ee3f3e 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -124,10 +124,5 @@ CATCH_TEMPLATE_TEST_CASE(
 
         auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
         CATCH_REQUIRE(dist >= 0);
-
-        dist = svs::distance::
-            L2Impl<svs::Dynamic, float, float, svs::distance::AVX_AVAILABILITY::AVX2>::
-                compute(a.data(), b.data(), svs::lib::MaybeStatic(size));
-        CATCH_REQUIRE(dist >= 0);
     }
 }

From 055214f37c7d569648af0ebf7ac6cfa7a6790153 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 07:01:12 -0800
Subject: [PATCH 19/30] Revert "revert simd_utils.h to trip new asan check in
 CI"

This reverts commit 3ba1fd84180171f1d67a76b5d38f2576ae710276.
---
 include/svs/core/distance/simd_utils.h | 38 ++++++++++++--------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index f883abcaf..bd9834da5 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -19,6 +19,7 @@
 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -332,11 +333,10 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // AVX2 doesn't have native masked load, so we load and then blend
-        auto data = _mm256_loadu_ps(ptr);
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Full width load with blending may cause out-of-bounds read (SEGV)
+        // Therefore we use _mm256_maskload_ps which safely handles masked loads
+        auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
+        return _mm256_maskload_ps(ptr, mask_vec);
     }
 
     // from float16
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const Float16* ptr) {
-        auto data = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        __m128i buffer = _mm_setzero_si128();
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(Float16));
+        return _mm256_cvtph_ps(buffer);
     }
 
     // from uint8
@@ -359,12 +359,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const uint8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(uint8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // from int8
@@ -375,12 +373,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const int8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(int8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // We do not need to treat the left or right-hand differently.

From 478c0dc6906740c3ec1b0341b1cd7f45f6acbd93 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 07:25:15 -0800
Subject: [PATCH 20/30] run all ISA paths in test

---
 include/svs/lib/avx_detection.h |  2 +-
 tests/svs/core/distance.cpp     | 65 ++++++++++++++++++++++++++-------
 tests/svs/lib/avx_detection.cpp |  7 ++++
 3 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/include/svs/lib/avx_detection.h b/include/svs/lib/avx_detection.h
index fc9b246f9..75a7b1190 100644
--- a/include/svs/lib/avx_detection.h
+++ b/include/svs/lib/avx_detection.h
@@ -54,6 +54,6 @@ struct AVXRuntimeFlags {
 };
 #endif
 
-inline const AVXRuntimeFlags avx_runtime_flags = {};
+inline AVXRuntimeFlags avx_runtime_flags = {};
 
 } // namespace svs::detail
diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index d32ee3f3e..479ebb1df 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -24,6 +24,8 @@
 #include <numeric>
 #include <vector>
 
+#include "svs/lib/avx_detection.h"
+
 namespace {
 
 std::string_view test_table = R"(
@@ -108,21 +110,58 @@ CATCH_TEMPLATE_TEST_CASE(
 ) {
     using Distance = TestType;
 
-    // Try various sizes to hit the case where vector capacity == size
-    // and the SIMD load reads past the end into the redzone.
-    // We test sizes that are not multiples of 8 (AVX2 width) or 16 (AVX512 width).
-    for (size_t size = 1; size < 128; ++size) {
-        std::vector<float> a(size);
-        std::vector<float> b(size);
+    auto run_test = []() {
+        // Try various sizes to hit the case where vector capacity == size
+        // and the SIMD load reads past the end into the redzone.
+        // We test sizes that are not multiples of 8 (AVX2 width) or 16 (AVX512 width).
+        for (size_t size = 1; size < 128; ++size) {
+            std::vector<float> a(size);
+            std::vector<float> b(size);
+
+            std::iota(a.begin(), a.end(), 1.0f);
+            std::iota(b.begin(), b.end(), 2.0f);
+
+            // Ensure no spare capacity
+            a.shrink_to_fit();
+            b.shrink_to_fit();
 
-        std::iota(a.begin(), a.end(), 1.0f);
-        std::iota(b.begin(), b.end(), 2.0f);
+            auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
+            CATCH_REQUIRE(dist >= 0);
+        }
+    };
 
-        // Ensure no spare capacity
-        a.shrink_to_fit();
-        b.shrink_to_fit();
+    CATCH_SECTION("Default") { run_test(); }
 
-        auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
-        CATCH_REQUIRE(dist >= 0);
+    CATCH_SECTION("No AVX512VNNI") {
+        if (!svs::detail::avx_runtime_flags.is_avx512vnni_supported()) {
+            CATCH_SKIP("AVX512VNNI not supported on this platform");
+        }
+        auto original = svs::detail::avx_runtime_flags;
+        svs::detail::avx_runtime_flags.avx512vnni = false;
+        run_test();
+        svs::detail::avx_runtime_flags = original;
+    }
+
+    CATCH_SECTION("No AVX512F") {
+        if (!svs::detail::avx_runtime_flags.is_avx512f_supported()) {
+            CATCH_SKIP("AVX512F not supported on this platform");
+        }
+        auto original = svs::detail::avx_runtime_flags;
+        svs::detail::avx_runtime_flags.avx512vnni = false;
+        svs::detail::avx_runtime_flags.avx512f = false;
+        run_test();
+        svs::detail::avx_runtime_flags = original;
+    }
+
+    CATCH_SECTION("No AVX2") {
+        if (!svs::detail::avx_runtime_flags.is_avx2_supported()) {
+            CATCH_SKIP("AVX2 not supported on this platform");
+        }
+        auto original = svs::detail::avx_runtime_flags;
+        svs::detail::avx_runtime_flags.avx512vnni = false;
+        svs::detail::avx_runtime_flags.avx512f = false;
+        svs::detail::avx_runtime_flags.avx2 = false;
+        run_test();
+        svs::detail::avx_runtime_flags = original;
     }
 }
diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index 02d5f9e36..00d373ba4 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -29,4 +29,11 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
               << svs::detail::avx_runtime_flags.is_avx512f_supported() << "\n";
     std::cout << "AVX512VNNI: " << std::boolalpha
               << svs::detail::avx_runtime_flags.is_avx512vnni_supported() << "\n";
+
+    CATCH_SECTION("Patching") {
+        auto original = svs::detail::avx_runtime_flags.avx512f;
+        svs::detail::avx_runtime_flags.avx512f = false;
+        CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
+        svs::detail::avx_runtime_flags.avx512f = original;
+    }
 }

From c205443e61917fc94fba1cff049fcb6cce0e9311 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 07:33:42 -0800
Subject: [PATCH 21/30] keep avx_runtime_flags const by using const_cast in
 tests

---
 include/svs/lib/avx_detection.h |  2 +-
 tests/svs/core/distance.cpp     | 30 ++++++++++++++++++------------
 tests/svs/lib/avx_detection.cpp |  8 +++++---
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/svs/lib/avx_detection.h b/include/svs/lib/avx_detection.h
index 75a7b1190..fc9b246f9 100644
--- a/include/svs/lib/avx_detection.h
+++ b/include/svs/lib/avx_detection.h
@@ -54,6 +54,6 @@ struct AVXRuntimeFlags {
 };
 #endif
 
-inline AVXRuntimeFlags avx_runtime_flags = {};
+inline const AVXRuntimeFlags avx_runtime_flags = {};
 
 } // namespace svs::detail
diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 479ebb1df..0c1713913 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -136,32 +136,38 @@ CATCH_TEMPLATE_TEST_CASE(
         if (!svs::detail::avx_runtime_flags.is_avx512vnni_supported()) {
             CATCH_SKIP("AVX512VNNI not supported on this platform");
         }
-        auto original = svs::detail::avx_runtime_flags;
-        svs::detail::avx_runtime_flags.avx512vnni = false;
+        auto& mutable_flags =
+            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+        auto original = mutable_flags;
+        mutable_flags.avx512vnni = false;
         run_test();
-        svs::detail::avx_runtime_flags = original;
+        mutable_flags = original;
     }
 
     CATCH_SECTION("No AVX512F") {
         if (!svs::detail::avx_runtime_flags.is_avx512f_supported()) {
             CATCH_SKIP("AVX512F not supported on this platform");
         }
-        auto original = svs::detail::avx_runtime_flags;
-        svs::detail::avx_runtime_flags.avx512vnni = false;
-        svs::detail::avx_runtime_flags.avx512f = false;
+        auto& mutable_flags =
+            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+        auto original = mutable_flags;
+        mutable_flags.avx512vnni = false;
+        mutable_flags.avx512f = false;
         run_test();
-        svs::detail::avx_runtime_flags = original;
+        mutable_flags = original;
     }
 
     CATCH_SECTION("No AVX2") {
         if (!svs::detail::avx_runtime_flags.is_avx2_supported()) {
             CATCH_SKIP("AVX2 not supported on this platform");
         }
-        auto original = svs::detail::avx_runtime_flags;
-        svs::detail::avx_runtime_flags.avx512vnni = false;
-        svs::detail::avx_runtime_flags.avx512f = false;
-        svs::detail::avx_runtime_flags.avx2 = false;
+        auto& mutable_flags =
+            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+        auto original = mutable_flags;
+        mutable_flags.avx512vnni = false;
+        mutable_flags.avx512f = false;
+        mutable_flags.avx2 = false;
         run_test();
-        svs::detail::avx_runtime_flags = original;
+        mutable_flags = original;
     }
 }
diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index 00d373ba4..306673180 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -31,9 +31,11 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
               << svs::detail::avx_runtime_flags.is_avx512vnni_supported() << "\n";
 
     CATCH_SECTION("Patching") {
-        auto original = svs::detail::avx_runtime_flags.avx512f;
-        svs::detail::avx_runtime_flags.avx512f = false;
+        auto& mutable_flags =
+            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+        auto original = mutable_flags.avx512f;
+        mutable_flags.avx512f = false;
         CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
-        svs::detail::avx_runtime_flags.avx512f = original;
+        mutable_flags.avx512f = original;
     }
 }

From de0bfac6b7b362f1c94aa46d359ff6c249b47df5 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 08:13:32 -0800
Subject: [PATCH 22/30] fix false positive failure for skipped tests

---
 tests/svs/core/distance.cpp | 57 ++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 0c1713913..f254d19e4 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -132,42 +132,39 @@ CATCH_TEMPLATE_TEST_CASE(
 
     CATCH_SECTION("Default") { run_test(); }
 
-    CATCH_SECTION("No AVX512VNNI") {
-        if (!svs::detail::avx_runtime_flags.is_avx512vnni_supported()) {
-            CATCH_SKIP("AVX512VNNI not supported on this platform");
+    if (svs::detail::avx_runtime_flags.is_avx512vnni_supported()) {
+        CATCH_SECTION("No AVX512VNNI") {
+            auto& mutable_flags =
+                const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+            auto original = mutable_flags;
+            mutable_flags.avx512vnni = false;
+            run_test();
+            mutable_flags = original;
         }
-        auto& mutable_flags =
-            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
-        auto original = mutable_flags;
-        mutable_flags.avx512vnni = false;
-        run_test();
-        mutable_flags = original;
     }
 
-    CATCH_SECTION("No AVX512F") {
-        if (!svs::detail::avx_runtime_flags.is_avx512f_supported()) {
-            CATCH_SKIP("AVX512F not supported on this platform");
+    if (svs::detail::avx_runtime_flags.is_avx512f_supported()) {
+        CATCH_SECTION("No AVX512F") {
+            auto& mutable_flags =
+                const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+            auto original = mutable_flags;
+            mutable_flags.avx512vnni = false;
+            mutable_flags.avx512f = false;
+            run_test();
+            mutable_flags = original;
         }
-        auto& mutable_flags =
-            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
-        auto original = mutable_flags;
-        mutable_flags.avx512vnni = false;
-        mutable_flags.avx512f = false;
-        run_test();
-        mutable_flags = original;
     }
 
-    CATCH_SECTION("No AVX2") {
-        if (!svs::detail::avx_runtime_flags.is_avx2_supported()) {
-            CATCH_SKIP("AVX2 not supported on this platform");
+    if (svs::detail::avx_runtime_flags.is_avx2_supported()) {
+        CATCH_SECTION("No AVX2") {
+            auto& mutable_flags =
+                const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
+            auto original = mutable_flags;
+            mutable_flags.avx512vnni = false;
+            mutable_flags.avx512f = false;
+            mutable_flags.avx2 = false;
+            run_test();
+            mutable_flags = original;
         }
-        auto& mutable_flags =
-            const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
-        auto original = mutable_flags;
-        mutable_flags.avx512vnni = false;
-        mutable_flags.avx512f = false;
-        mutable_flags.avx2 = false;
-        run_test();
-        mutable_flags = original;
     }
 }

From be64f341aa0c5c4796f80d8fd9d5ffd7beea38b9 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 08:18:17 -0800
Subject: [PATCH 23/30] fix: only modify isa dispatching on x86

---
 tests/svs/core/distance.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index f254d19e4..06ecb3acb 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -132,6 +132,7 @@ CATCH_TEMPLATE_TEST_CASE(
 
     CATCH_SECTION("Default") { run_test(); }
 
+#ifdef __x86_64__
     if (svs::detail::avx_runtime_flags.is_avx512vnni_supported()) {
         CATCH_SECTION("No AVX512VNNI") {
             auto& mutable_flags =
@@ -167,4 +168,5 @@ CATCH_TEMPLATE_TEST_CASE(
             mutable_flags = original;
         }
     }
+#endif // __x86_64__
 }

From f856a965523a461e9e438bc01389e522450ce998 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 08:22:46 -0800
Subject: [PATCH 24/30] fixup

---
 include/svs/core/distance/simd_utils.h | 38 ++++++++++++++------------
 tests/svs/lib/avx_detection.cpp        |  2 ++
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index bd9834da5..f883abcaf 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -19,7 +19,6 @@
 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
-#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -333,10 +332,11 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // Full width load with blending may cause out-of-bounds read (SEGV)
-        // Therefore we use _mm256_maskload_ps which safely handles masked loads
-        auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
-        return _mm256_maskload_ps(ptr, mask_vec);
+        // AVX2 doesn't have native masked load, so we load and then blend
+        auto data = _mm256_loadu_ps(ptr);
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from float16
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const Float16* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        __m128i buffer = _mm_setzero_si128();
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(Float16));
-        return _mm256_cvtph_ps(buffer);
+        auto data = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from uint8
@@ -359,10 +359,12 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const uint8_t* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        int64_t buffer = 0;
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(uint8_t));
-        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(buffer)));
+        auto data = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
+        ));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // from int8
@@ -373,10 +375,12 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const int8_t* ptr) {
-        // Safe masked load using a temporary buffer to avoid SEGV
-        int64_t buffer = 0;
-        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(int8_t));
-        return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(buffer)));
+        auto data = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(
+            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
+        ));
+        auto zero = _mm256_setzero_ps();
+        auto mask_vec = create_blend_mask_avx2(m);
+        return _mm256_blendv_ps(zero, data, mask_vec);
     }
 
     // We do not need to treat the left or right-hand differently.
diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index 306673180..6ac72e4f9 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -30,6 +30,7 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
     std::cout << "AVX512VNNI: " << std::boolalpha
               << svs::detail::avx_runtime_flags.is_avx512vnni_supported() << "\n";
 
+#ifdef __x86_64__
     CATCH_SECTION("Patching") {
         auto& mutable_flags =
             const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
@@ -38,4 +39,5 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
         CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
         mutable_flags.avx512f = original;
     }
+#endif // __x86_64__
 }

From 80d1d840814a180577c0dcbe704c6978c355cae9 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 08:30:17 -0800
Subject: [PATCH 25/30] simplify test

---
 tests/svs/core/distance.cpp | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/tests/svs/core/distance.cpp b/tests/svs/core/distance.cpp
index 06ecb3acb..4f19b7334 100644
--- a/tests/svs/core/distance.cpp
+++ b/tests/svs/core/distance.cpp
@@ -111,23 +111,20 @@ CATCH_TEMPLATE_TEST_CASE(
     using Distance = TestType;
 
     auto run_test = []() {
-        // Try various sizes to hit the case where vector capacity == size
-        // and the SIMD load reads past the end into the redzone.
-        // We test sizes that are not multiples of 8 (AVX2 width) or 16 (AVX512 width).
-        for (size_t size = 1; size < 128; ++size) {
-            std::vector<float> a(size);
-            std::vector<float> b(size);
-
-            std::iota(a.begin(), a.end(), 1.0f);
-            std::iota(b.begin(), b.end(), 2.0f);
-
-            // Ensure no spare capacity
-            a.shrink_to_fit();
-            b.shrink_to_fit();
-
-            auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
-            CATCH_REQUIRE(dist >= 0);
-        }
+        // Several full-width AVX2/AVX512 registers plus (crucially) a ragged epilogue
+        constexpr size_t size = 64 + 2;
+        std::vector<float> a(size);
+        std::vector<float> b(size);
+
+        std::iota(a.begin(), a.end(), 1.0f);
+        std::iota(b.begin(), b.end(), 2.0f);
+
+        // Ensure no spare capacity
+        a.shrink_to_fit();
+        b.shrink_to_fit();
+
+        auto dist = svs::distance::compute(Distance(), std::span(a), std::span(b));
+        CATCH_REQUIRE(dist >= 0);
     };
 
     CATCH_SECTION("Default") { run_test(); }

From 966d58ccd5994fc6fa800d511348704909567c6a Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 09:01:06 -0800
Subject: [PATCH 26/30] Revert "fixup"

This reverts commit f856a965523a461e9e438bc01389e522450ce998.
---
 include/svs/core/distance/simd_utils.h | 38 ++++++++++++--------------
 tests/svs/lib/avx_detection.cpp        |  2 --
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/include/svs/core/distance/simd_utils.h b/include/svs/core/distance/simd_utils.h
index f883abcaf..bd9834da5 100644
--- a/include/svs/core/distance/simd_utils.h
+++ b/include/svs/core/distance/simd_utils.h
@@ -19,6 +19,7 @@
 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -332,11 +333,10 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load(const float* ptr) { return _mm256_loadu_ps(ptr); }
     static __m256 load(mask_t m, const float* ptr) {
-        // AVX2 doesn't have native masked load, so we load and then blend
-        auto data = _mm256_loadu_ps(ptr);
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Full width load with blending may cause out-of-bounds read (SEGV)
+        // Therefore we use _mm256_maskload_ps which safely handles masked loads
+        auto mask_vec = _mm256_castps_si256(create_blend_mask_avx2(m));
+        return _mm256_maskload_ps(ptr, mask_vec);
     }
 
     // from float16
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const Float16* ptr) {
-        auto data = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        __m128i buffer = _mm_setzero_si128();
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(Float16));
+        return _mm256_cvtph_ps(buffer);
     }
 
     // from uint8
@@ -359,12 +359,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const uint8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(uint8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // from int8
@@ -375,12 +373,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load(mask_t m, const int8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(
-            _mm_cvtsi64_si128(*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps();
-        auto mask_vec = create_blend_mask_avx2(m);
-        return _mm256_blendv_ps(zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy(&buffer, ptr, __builtin_popcount(m) * sizeof(int8_t));
+        return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(buffer)));
     }
 
     // We do not need to treat the left or right-hand differently.
diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index 6ac72e4f9..306673180 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -30,7 +30,6 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
     std::cout << "AVX512VNNI: " << std::boolalpha
               << svs::detail::avx_runtime_flags.is_avx512vnni_supported() << "\n";
 
-#ifdef __x86_64__
     CATCH_SECTION("Patching") {
         auto& mutable_flags =
             const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
@@ -39,5 +38,4 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
         CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
         mutable_flags.avx512f = original;
     }
-#endif // __x86_64__
 }

From 6fcc214814a74cdb4ec77f2a041047f79e594f74 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 09:01:38 -0800
Subject: [PATCH 27/30] fixup

---
 tests/svs/lib/avx_detection.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index 306673180..bdfe24578 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -30,6 +30,7 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
     std::cout << "AVX512VNNI: " << std::boolalpha
               << svs::detail::avx_runtime_flags.is_avx512vnni_supported() << "\n";
 
+#ifdef __x86_64__
     CATCH_SECTION("Patching") {
         auto& mutable_flags =
             const_cast<svs::detail::AVXRuntimeFlags&>(svs::detail::avx_runtime_flags);
@@ -38,4 +39,5 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
         CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
         mutable_flags.avx512f = original;
     }
+#endifx
 }

From 7d5b6ed21a5dca731cfb5a01ea770234ce5e6dd0 Mon Sep 17 00:00:00 2001
From: Andreas Huber <9201869+ahuber21@users.noreply.github.com>
Date: Wed, 10 Dec 2025 18:02:41 +0100
Subject: [PATCH 28/30] Include asan in C flags

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/workflows/build-linux.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index 04213c675..2d055e54f 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -55,7 +55,9 @@ jobs:
             # address sanitizer flags
             cmake_extra_args: >-
               -DCMAKE_CXX_FLAGS='-fsanitize=address -fno-omit-frame-pointer -g'
+              -DCMAKE_C_FLAGS='-fsanitize=address -fno-omit-frame-pointer -g'
               -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address'
+              -DCMAKE_SHARED_LINKER_FLAGS='-fsanitize=address'
               -DSVS_BUILD_BINARIES=NO
               -DSVS_BUILD_EXAMPLES=NO
             # skip longer-running tests

From 63e58cd076c79d29b82dd2cd184de01044e7a5f9 Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Wed, 10 Dec 2025 09:04:21 -0800
Subject: [PATCH 29/30] fixup

---
 tests/svs/lib/avx_detection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/svs/lib/avx_detection.cpp b/tests/svs/lib/avx_detection.cpp
index bdfe24578..6c090efce 100644
--- a/tests/svs/lib/avx_detection.cpp
+++ b/tests/svs/lib/avx_detection.cpp
@@ -39,5 +39,5 @@ CATCH_TEST_CASE("AVX detection", "[lib][lib-avx-detection]") {
         CATCH_REQUIRE(svs::detail::avx_runtime_flags.is_avx512f_supported() == false);
         mutable_flags.avx512f = original;
     }
-#endifx
+#endif
 }

From 8b36bef94e906b4a0578a780c1dd5842a260a8cf Mon Sep 17 00:00:00 2001
From: Andreas Huber <andreas.huber@intel.com>
Date: Thu, 11 Dec 2025 02:01:14 -0800
Subject: [PATCH 30/30] remove asan_options; remove auto-formatted double-quote
 change

---
 .github/workflows/build-linux.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml
index 2d055e54f..d520b020e 100644
--- a/.github/workflows/build-linux.yml
+++ b/.github/workflows/build-linux.yml
@@ -25,7 +25,7 @@ permissions:
 
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
+  group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
   cancel-in-progress: true
 
 jobs:
@@ -106,7 +106,6 @@ jobs:
     - name: Run tests
       env:
         CTEST_OUTPUT_ON_FAILURE: 1
-        ASAN_OPTIONS: detect_leaks=0
       working-directory: ${{ runner.temp }}/build/tests
       run: ctest -C ${{ matrix.build_type }} ${{ matrix.ctest_args }}