alibaba · Aanerud · May 20, 2026 · May 20, 2026 · May 20, 2026 · feihongxu0824
@@ -18,7 +18,16 @@ jobs:
       matrix:
         include:
           - platform: windows-2022
+            msvc_arch: x64
+            python_version: '3.10'
           - platform: windows-2025
+            msvc_arch: x64
+            python_version: '3.10'
+          # Windows ARM64: Python 3.10 has no official ARM64 installer;
+          # 3.11 is the first CPython release with a Windows-on-ARM build.
+          - platform: windows-11-arm
+            msvc_arch: arm64
+            python_version: '3.11'
 
     steps:
       - name: Show env info
@@ -41,14 +50,14 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
-          python-version: '3.10'
+          python-version: ${{ matrix.python_version }}
           cache: 'pip'
           cache-dependency-path: 'pyproject.toml'
 
       - name: Set up MSVC environment
         uses: ilammy/msvc-dev-cmd@v1
         with:
-          arch: x64
+          arch: ${{ matrix.msvc_arch }}
 
       - name: Set up environment variables
         run: |

@@ -91,29 +91,35 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
         )
     endforeach()
     elseif (HOST_ARCH MATCHES "^(arm|arm64)$")
-        if(MSVC)
-            return()
-        endif()
-        set(MATH_MARCH_FLAG_NEON "-march=armv8-a")
-
-        file(GLOB_RECURSE MATH_FILES_NEON
-          ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
-          ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
-          ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
-          ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
-          ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc
-          ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c
-          ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc
-          ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c
-        )
+        if(NOT MSVC)
+            set(MATH_MARCH_FLAG_NEON "-march=armv8-a")
+
+            file(GLOB_RECURSE MATH_FILES_NEON
+              ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
+              ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
+              ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
+              ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
+              ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc
+              ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c
+              ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc
+              ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c
+            )
 
-        foreach(MATH_FILE ${MATH_FILES_NEON})
-          set_source_files_properties(
-              ${MATH_FILE}
-              PROPERTIES
-              COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}"
-          )
-        endforeach()
+            foreach(MATH_FILE ${MATH_FILES_NEON})
+              set_source_files_properties(
+                  ${MATH_FILE}
+                  PROPERTIES
+                  COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}"
+              )
+            endforeach()
+        else()
+            # MSVC on ARM64: NEON is the ARMv8 baseline and is always enabled,
+            # so no `-march` flag is required (MSVC does not accept GCC-style
+            # `-march=` anyway). The NEON math kernels still get compiled via
+            # the ALL_SRCS glob above; their `#if defined(__ARM_NEON)` guards
+            # also accept `_M_ARM64`, so the bodies actually emit code under
+            # MSVC.
+        endif()
     endif()
 endif()
 

@@ -15,9 +15,10 @@
 #include "cpu_features.h"
 #include <cstddef>
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 #include <intrin.h>
-#elif !defined(__ARM_ARCH)
+#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \
+    !(defined(__aarch64__) || defined(_M_ARM64))
 #include <cpuid.h>
 #endif
 
@@ -34,7 +35,7 @@ namespace internal {
 
 CpuFeatures::CpuFlags CpuFeatures::flags_;
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 CpuFeatures::CpuFlags::CpuFlags(void)
     : L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) {
   int l1[4] = {0, 0, 0, 0};
@@ -48,7 +49,8 @@ CpuFeatures::CpuFlags::CpuFlags(void)
   L7_ECX = l7[2];
   L7_EDX = l7[3];
 }
-#elif !defined(__ARM_ARCH)
+#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \
+    !(defined(__aarch64__) || defined(_M_ARM64))
 CpuFeatures::CpuFlags::CpuFlags(void)
     : L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) {
   uint32_t eax, ebx, ecx, edx;
@@ -336,7 +338,7 @@ bool CpuFeatures::HYPERVISOR(void) {
 
 const char *CpuFeatures::Intrinsics(void) {
   return ""
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
          "Neon"
 #if defined(__ARM_FEATURE_CRC32)
          "+CRC"

@@ -30,7 +30,7 @@
                                             _mm512_castps_si512(b)))
 #endif  // __AVX512DQ__
 
-#if defined(__ARM_NEON) && !defined(__aarch64__)
+#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64))
 static inline float32_t vaddvq_f32(float32x4_t v) {
   float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v));
   return vget_lane_f32(vpadd_f32(s, s), 0);
@@ -42,7 +42,7 @@ static inline int32_t vaddvq_s32(int32x4_t v) {
 }
 #endif  //__ARM_NEON && !__aarch64__
 
-#if defined(__aarch64__)
+#if (defined(__aarch64__) || defined(_M_ARM64))
 #define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A64
 #else
 #define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A32

@@ -26,7 +26,7 @@
   _mm256_insertf128_ps(_mm256_castps128_ps256(b), (a), 1)
 #endif  // __AVX__
 
-#if defined(__ARM_NEON) && !defined(__aarch64__)
+#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64))
 #define vdupq_laneq_f32(a, b) vdupq_n_f32(vgetq_lane_f32(a, b))
 #endif  // __ARM_NEON && !__aarch64__
 

@@ -18,7 +18,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs,
                                        size_t size);
 #endif
@@ -46,7 +46,7 @@ void SquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(const ValueType *m,
                                                             const ValueType *q,
                                                             size_t dim,
                                                             float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = SquaredEuclideanDistanceFp16NEON(m, q, dim);
 #else
 #if defined(__AVX512FP16__)

@@ -19,7 +19,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs,
                                        size_t size) {
   float score{0.0f};

@@ -18,7 +18,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs,
                                       size_t size, float *out);
 #endif
@@ -49,7 +49,7 @@ void SquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(const ValueType *m,
                                                           const ValueType *q,
                                                           size_t dim,
                                                           float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   SquaredEuclideanDistanceFp32NEON(m, q, dim, out);
 #else
 #if defined(__AVX512F__)

@@ -19,7 +19,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 //! Squared Euclidean Distance
 void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs,
                                       size_t size, float *out) {

@@ -21,7 +21,7 @@ namespace ailego {
 //--------------------------------------------------
 // Dense
 //--------------------------------------------------
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size);
 float MinusInnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs,
                                 size_t size);
@@ -56,7 +56,7 @@ float MinusInnerProductFp16Scalar(const Float16 *lhs, const Float16 *rhs,
 void InnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
                                                 const ValueType *q, size_t dim,
                                                 float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = InnerProductFp16NEON(m, q, dim);
 #else
 #if defined(__AVX512FP16__)
@@ -86,7 +86,7 @@ void InnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
 void MinusInnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
                                                      const ValueType *q,
                                                      size_t dim, float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = MinusInnerProductFp16NEON(m, q, dim);
 #else
 #if defined(__AVX512FP16__)

@@ -19,7 +19,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs,
                            size_t size) {
   float score;

@@ -20,7 +20,7 @@ namespace ailego {
 //--------------------------------------------------
 // Dense
 //--------------------------------------------------
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size);
 float MinusInnerProductFp32NEON(const float *lhs, const float *rhs,
                                 size_t size);
@@ -49,7 +49,7 @@ float MinusInnerProductFp32Scalar(const float *lhs, const float *rhs,
 //! Compute the distance between matrix and query (FP32, M=1, N=1)
 void InnerProductMatrix<float, 1, 1>::Compute(const float *m, const float *q,
                                               size_t dim, float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = InnerProductFp32NEON(m, q, dim);
 #else
 #if defined(__AVX512F__)
@@ -80,7 +80,7 @@ void InnerProductMatrix<float, 1, 1>::Compute(const float *m, const float *q,
 void MinusInnerProductMatrix<float, 1, 1>::Compute(const float *m,
                                                    const float *q, size_t dim,
                                                    float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = MinusInnerProductFp32NEON(m, q, dim);
 #else
 #if defined(__AVX512F__)

@@ -22,7 +22,7 @@ namespace ailego {
 //--------------------------------------------------
 // Dense
 //--------------------------------------------------
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size) {
   const float *last = lhs + size;
   const float *last_aligned = lhs + ((size >> 3) << 3);

@@ -18,7 +18,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON(
     const Float16 *lhs, const Float16 *rhs, size_t size, size_t m, float e2);
 float MipsEuclideanDistanceSphericalInjectionFp16NEON(const Float16 *lhs,
@@ -51,7 +51,7 @@ float MipsEuclideanDistanceSphericalInjectionFp16Scalar(
 //! Compute the distance between matrix and query by SphericalInjection
 void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
     const ValueType *p, const ValueType *q, size_t dim, float e2, float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out = MipsEuclideanDistanceSphericalInjectionFp16NEON(p, q, dim, e2);
 #else
 #if defined(__AVX512F__)
@@ -75,7 +75,7 @@ void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
 void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
     const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2,
     float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   *out =
       MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON(p, q, dim, m, e2);
 #else

@@ -19,7 +19,8 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
+#if (defined(__ARM_NEON) || defined(_M_ARM64)) && \
+    (defined(__aarch64__) || defined(_M_ARM64))
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 //! Compute the Inner Product between p and q, and each Squared L2-Norm value
 float InnerProductAndSquaredNormFp16NEON(const Float16 *lhs, const Float16 *rhs,

@@ -18,7 +18,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs,
                                          size_t size, float *sql, float *sqr);
 #endif
@@ -98,7 +98,7 @@ void MipsSquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(
 void MipsSquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(
     const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2,
     float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   float u2{0.0f};
   float v2{0.0f};
   float sum = InnerProductAndSquaredNormFp32NEON(p, q, dim, &u2, &v2);

@@ -19,7 +19,7 @@
 namespace zvec {
 namespace ailego {
 
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
 //! Compute the Inner Product between p and q, and each Squared L2-Norm value
 float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs,
                                          size_t size, float *sql, float *sqr) {

@@ -116,7 +116,8 @@ struct Norm1Matrix<
   }
 };
 
-#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__))
+#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \
+                         (defined(__aarch64__) || defined(_M_ARM64)))
 /*! L1-Norm Matrix (FP32, M=1)
  */
 template <>
@@ -129,8 +130,9 @@ struct Norm1Matrix<float, 1> {
 };
 #endif  // __SSE__ || (__ARM_NEON && __aarch64__)
 
-#if (defined(__F16C__) && defined(__AVX__)) || \
-    (defined(__ARM_NEON) && defined(__aarch64__))
+#if (defined(__F16C__) && defined(__AVX__)) ||     \
+    ((defined(__ARM_NEON) || defined(_M_ARM64)) && \
+     (defined(__aarch64__) || defined(_M_ARM64)))
 /*! L1-Norm Matrix (FP16, M=1)
  */
 template <>

@@ -67,12 +67,13 @@ static const __m512 ABS_MASK_FP32_AVX512 =
 //! Calculate sum of absolute (NEON)
 #define SA_FP16_NEON(v_m, v_sum) v_sum = vaddq_f16(vabsq_f16(v_m), v_sum);
 
-#if (defined(__F16C__) && defined(__AVX__)) || \
-    (defined(__ARM_NEON) && defined(__aarch64__))
+#if (defined(__F16C__) && defined(__AVX__)) ||     \
+    ((defined(__ARM_NEON) || defined(_M_ARM64)) && \
+     (defined(__aarch64__) || defined(_M_ARM64)))
 //! Compute the L1-norm of vectors (FP16, M=1)
 void Norm1Matrix<Float16, 1>::Compute(const ValueType *m, size_t dim,
                                       float *out) {
-#if defined(__ARM_NEON)
+#if (defined(__ARM_NEON) || defined(_M_ARM64))
   NORM_FP16_1_NEON(m, dim, out, )
 #else
 #if defined(__AVX512F__)