diff --git a/.github/workflows/05-windows-build.yml b/.github/workflows/05-windows-build.yml index 0db11c200..fa7c25d6c 100644 --- a/.github/workflows/05-windows-build.yml +++ b/.github/workflows/05-windows-build.yml @@ -18,7 +18,16 @@ jobs: matrix: include: - platform: windows-2022 + msvc_arch: x64 + python_version: '3.10' - platform: windows-2025 + msvc_arch: x64 + python_version: '3.10' + # Windows ARM64: Python 3.10 has no official ARM64 installer; + # 3.11 is the first CPython release with a Windows-on-ARM build. + - platform: windows-11-arm + msvc_arch: arm64 + python_version: '3.11' steps: - name: Show env info @@ -41,14 +50,14 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: '3.10' + python-version: ${{ matrix.python_version }} cache: 'pip' cache-dependency-path: 'pyproject.toml' - name: Set up MSVC environment uses: ilammy/msvc-dev-cmd@v1 with: - arch: x64 + arch: ${{ matrix.msvc_arch }} - name: Set up environment variables run: | diff --git a/src/ailego/CMakeLists.txt b/src/ailego/CMakeLists.txt index fdaa1b13c..9ce8691a7 100644 --- a/src/ailego/CMakeLists.txt +++ b/src/ailego/CMakeLists.txt @@ -91,29 +91,35 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) ) endforeach() elseif (HOST_ARCH MATCHES "^(arm|arm64)$") - if(MSVC) - return() - endif() - set(MATH_MARCH_FLAG_NEON "-march=armv8-a") - - file(GLOB_RECURSE MATH_FILES_NEON - ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc - ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c - ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc - ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c - ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc - ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c - ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc - ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c - ) + if(NOT MSVC) + set(MATH_MARCH_FLAG_NEON "-march=armv8-a") + + file(GLOB_RECURSE MATH_FILES_NEON + ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc + ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c + ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc + ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c + ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc + ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c + ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc + ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c + ) - foreach(MATH_FILE ${MATH_FILES_NEON}) - set_source_files_properties( - ${MATH_FILE} - PROPERTIES - COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}" - ) - endforeach() + foreach(MATH_FILE ${MATH_FILES_NEON}) + set_source_files_properties( + ${MATH_FILE} + PROPERTIES + COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}" + ) + endforeach() + else() + # MSVC on ARM64: NEON is the ARMv8 baseline and is always enabled, + # so no `-march` flag is required (MSVC does not accept GCC-style + # `-march=` anyway). The NEON math kernels still get compiled via + # the ALL_SRCS glob above; their `#if defined(__ARM_NEON)` guards + # were extended in this PR to also accept `_M_ARM64` so the bodies + # actually emit code under MSVC. + endif() endif() endif() diff --git a/src/ailego/internal/cpu_features.cc b/src/ailego/internal/cpu_features.cc index e2dd2b23a..395e6fc13 100644 --- a/src/ailego/internal/cpu_features.cc +++ b/src/ailego/internal/cpu_features.cc @@ -15,9 +15,10 @@ #include "cpu_features.h" #include -#if defined(_MSC_VER) +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) #include -#elif !defined(__ARM_ARCH) +#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \ + !(defined(__aarch64__) || defined(_M_ARM64)) #include #endif @@ -34,7 +35,7 @@ namespace internal { CpuFeatures::CpuFlags CpuFeatures::flags_; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) CpuFeatures::CpuFlags::CpuFlags(void) : L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) { int l1[4] = {0, 0, 0, 0}; @@ -48,7 +49,8 @@ CpuFeatures::CpuFlags::CpuFlags(void) L7_ECX = l7[2]; L7_EDX = l7[3]; } -#elif !defined(__ARM_ARCH) +#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \ + !(defined(__aarch64__) || defined(_M_ARM64)) CpuFeatures::CpuFlags::CpuFlags(void) : L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) { uint32_t eax, ebx, ecx, edx; @@ -336,7 +338,7 @@ bool CpuFeatures::HYPERVISOR(void) { const char *CpuFeatures::Intrinsics(void) { return "" -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) "Neon" #if defined(__ARM_FEATURE_CRC32) "+CRC" diff --git a/src/ailego/math/distance_matrix_accum_fp32.i b/src/ailego/math/distance_matrix_accum_fp32.i index c186492c6..913784076 100644 --- a/src/ailego/math/distance_matrix_accum_fp32.i +++ b/src/ailego/math/distance_matrix_accum_fp32.i @@ -30,7 +30,7 @@ _mm512_castps_si512(b))) #endif // __AVX512DQ__ -#if defined(__ARM_NEON) && !defined(__aarch64__) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64)) static inline float32_t vaddvq_f32(float32x4_t v) { float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v)); return vget_lane_f32(vpadd_f32(s, s), 0); @@ -42,7 +42,7 @@ static inline int32_t vaddvq_s32(int32x4_t v) { } #endif //__ARM_NEON && !__aarch64__ -#if defined(__aarch64__) +#if (defined(__aarch64__) || defined(_M_ARM64)) #define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A64 #else #define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A32 diff --git a/src/ailego/math/distance_matrix_fp32.i b/src/ailego/math/distance_matrix_fp32.i index a9ddcd075..f4e6a16ab 100644 --- a/src/ailego/math/distance_matrix_fp32.i +++ b/src/ailego/math/distance_matrix_fp32.i @@ -26,7 +26,7 @@ _mm256_insertf128_ps(_mm256_castps128_ps256(b), (a), 1) #endif // __AVX__ -#if defined(__ARM_NEON) && !defined(__aarch64__) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64)) #define vdupq_laneq_f32(a, b) vdupq_n_f32(vgetq_lane_f32(a, b)) #endif // __ARM_NEON && __aarch64__ diff --git a/src/ailego/math/euclidean_distance_matrix_fp16_dispatch.cc b/src/ailego/math/euclidean_distance_matrix_fp16_dispatch.cc index fb145265e..e9e205201 100644 --- a/src/ailego/math/euclidean_distance_matrix_fp16_dispatch.cc +++ b/src/ailego/math/euclidean_distance_matrix_fp16_dispatch.cc @@ -18,7 +18,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size); #endif @@ -46,7 +46,7 @@ void SquaredEuclideanDistanceMatrix::Compute(const ValueType *m, const ValueType *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = SquaredEuclideanDistanceFp16NEON(m, q, dim); #else #if defined(__AVX512FP16__) diff --git a/src/ailego/math/euclidean_distance_matrix_fp16_neon.cc b/src/ailego/math/euclidean_distance_matrix_fp16_neon.cc index 3d3bf8787..84b75eb8c 100644 --- a/src/ailego/math/euclidean_distance_matrix_fp16_neon.cc +++ b/src/ailego/math/euclidean_distance_matrix_fp16_neon.cc @@ -19,7 +19,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size) { float score{0.0f}; diff --git a/src/ailego/math/euclidean_distance_matrix_fp32_dispatch.cc b/src/ailego/math/euclidean_distance_matrix_fp32_dispatch.cc index cc3044389..f0650a08e 100644 --- a/src/ailego/math/euclidean_distance_matrix_fp32_dispatch.cc +++ b/src/ailego/math/euclidean_distance_matrix_fp32_dispatch.cc @@ -18,7 +18,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs, size_t size, float *out); #endif @@ -49,7 +49,7 @@ void SquaredEuclideanDistanceMatrix::Compute(const ValueType *m, const ValueType *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) SquaredEuclideanDistanceFp32NEON(m, q, dim, out); #else #if defined(__AVX512F__) diff --git a/src/ailego/math/euclidean_distance_matrix_fp32_neon.cc b/src/ailego/math/euclidean_distance_matrix_fp32_neon.cc index aa1694e21..14ce90767 100644 --- a/src/ailego/math/euclidean_distance_matrix_fp32_neon.cc +++ b/src/ailego/math/euclidean_distance_matrix_fp32_neon.cc @@ -19,7 +19,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) //! Squared Euclidean Distance void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs, size_t size, float *out) { diff --git a/src/ailego/math/inner_product_matrix_fp16_dispatch.cc b/src/ailego/math/inner_product_matrix_fp16_dispatch.cc index 3c46bc32b..3b70d6f0a 100644 --- a/src/ailego/math/inner_product_matrix_fp16_dispatch.cc +++ b/src/ailego/math/inner_product_matrix_fp16_dispatch.cc @@ -21,7 +21,7 @@ namespace ailego { //-------------------------------------------------- // Dense //-------------------------------------------------- -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size); float MinusInnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size); @@ -56,7 +56,7 @@ float MinusInnerProductFp16Scalar(const Float16 *lhs, const Float16 *rhs, void InnerProductMatrix::Compute(const ValueType *m, const ValueType *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = InnerProductFp16NEON(m, q, dim); #else #if defined(__AVX512FP16__) @@ -86,7 +86,7 @@ void InnerProductMatrix::Compute(const ValueType *m, void MinusInnerProductMatrix::Compute(const ValueType *m, const ValueType *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = MinusInnerProductFp16NEON(m, q, dim); #else #if defined(__AVX512FP16__) diff --git a/src/ailego/math/inner_product_matrix_fp16_neon.cc b/src/ailego/math/inner_product_matrix_fp16_neon.cc index 3d6c0d621..29a3dccea 100644 --- a/src/ailego/math/inner_product_matrix_fp16_neon.cc +++ b/src/ailego/math/inner_product_matrix_fp16_neon.cc @@ -19,7 +19,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size) { float score; diff --git a/src/ailego/math/inner_product_matrix_fp32_dispatch.cc b/src/ailego/math/inner_product_matrix_fp32_dispatch.cc index 8b289b6e6..32540296b 100644 --- a/src/ailego/math/inner_product_matrix_fp32_dispatch.cc +++ b/src/ailego/math/inner_product_matrix_fp32_dispatch.cc @@ -20,7 +20,7 @@ namespace ailego { //-------------------------------------------------- // Dense //-------------------------------------------------- -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size); float MinusInnerProductFp32NEON(const float *lhs, const float *rhs, size_t size); @@ -49,7 +49,7 @@ float MinusInnerProductFp32Scalar(const float *lhs, const float *rhs, //! Compute the distance between matrix and query (FP32, M=1, N=1) void InnerProductMatrix::Compute(const float *m, const float *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = InnerProductFp32NEON(m, q, dim); #else #if defined(__AVX512F__) @@ -80,7 +80,7 @@ void InnerProductMatrix::Compute(const float *m, const float *q, void MinusInnerProductMatrix::Compute(const float *m, const float *q, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = MinusInnerProductFp32NEON(m, q, dim); #else #if defined(__AVX512F__) diff --git a/src/ailego/math/inner_product_matrix_fp32_neon.cc b/src/ailego/math/inner_product_matrix_fp32_neon.cc index c457b3ea2..e21fd3abf 100644 --- a/src/ailego/math/inner_product_matrix_fp32_neon.cc +++ b/src/ailego/math/inner_product_matrix_fp32_neon.cc @@ -22,7 +22,7 @@ namespace ailego { //-------------------------------------------------- // Dense //-------------------------------------------------- -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size) { const float *last = lhs + size; const float *last_aligned = lhs + ((size >> 3) << 3); diff --git a/src/ailego/math/mips_euclidean_distance_matrix_fp16_dispatch.cc b/src/ailego/math/mips_euclidean_distance_matrix_fp16_dispatch.cc index 8e40563cf..ee80f61ef 100644 --- a/src/ailego/math/mips_euclidean_distance_matrix_fp16_dispatch.cc +++ b/src/ailego/math/mips_euclidean_distance_matrix_fp16_dispatch.cc @@ -18,7 +18,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON( const Float16 *lhs, const Float16 *rhs, size_t size, size_t m, float e2); float MipsEuclideanDistanceSphericalInjectionFp16NEON(const Float16 *lhs, @@ -51,7 +51,7 @@ float MipsEuclideanDistanceSphericalInjectionFp16Scalar( //! Compute the distance between matrix and query by SphericalInjection void MipsSquaredEuclideanDistanceMatrix::Compute( const ValueType *p, const ValueType *q, size_t dim, float e2, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = MipsEuclideanDistanceSphericalInjectionFp16NEON(p, q, dim, e2); #else #if defined(__AVX512F__) @@ -75,7 +75,7 @@ void MipsSquaredEuclideanDistanceMatrix::Compute( void MipsSquaredEuclideanDistanceMatrix::Compute( const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) *out = MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON(p, q, dim, m, e2); #else diff --git a/src/ailego/math/mips_euclidean_distance_matrix_fp16_neon.cc b/src/ailego/math/mips_euclidean_distance_matrix_fp16_neon.cc index b4f4c970d..3d6628e76 100644 --- a/src/ailego/math/mips_euclidean_distance_matrix_fp16_neon.cc +++ b/src/ailego/math/mips_euclidean_distance_matrix_fp16_neon.cc @@ -19,7 +19,8 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) && defined(__aarch64__) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64)) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) //! Compute the Inner Product between p and q, and each Squared L2-Norm value float InnerProductAndSquaredNormFp16NEON(const Float16 *lhs, const Float16 *rhs, diff --git a/src/ailego/math/mips_euclidean_distance_matrix_fp32_dispatch.cc b/src/ailego/math/mips_euclidean_distance_matrix_fp32_dispatch.cc index f48626a3f..37c8a1daf 100644 --- a/src/ailego/math/mips_euclidean_distance_matrix_fp32_dispatch.cc +++ b/src/ailego/math/mips_euclidean_distance_matrix_fp32_dispatch.cc @@ -18,7 +18,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs, size_t size, float *sql, float *sqr); #endif @@ -98,7 +98,7 @@ void MipsSquaredEuclideanDistanceMatrix::Compute( void MipsSquaredEuclideanDistanceMatrix::Compute( const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) float u2{0.0f}; float v2{0.0f}; float sum = InnerProductAndSquaredNormFp32NEON(p, q, dim, &u2, &v2); diff --git a/src/ailego/math/mips_euclidean_distance_matrix_fp32_neon.cc b/src/ailego/math/mips_euclidean_distance_matrix_fp32_neon.cc index 6491f2260..e5bff681e 100644 --- a/src/ailego/math/mips_euclidean_distance_matrix_fp32_neon.cc +++ b/src/ailego/math/mips_euclidean_distance_matrix_fp32_neon.cc @@ -19,7 +19,7 @@ namespace zvec { namespace ailego { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) //! Compute the Inner Product between p and q, and each Squared L2-Norm value float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs, size_t size, float *sql, float *sqr) { diff --git a/src/ailego/math/norm1_matrix.h b/src/ailego/math/norm1_matrix.h index 7e8d9cbc8..3263f4c22 100644 --- a/src/ailego/math/norm1_matrix.h +++ b/src/ailego/math/norm1_matrix.h @@ -116,7 +116,8 @@ struct Norm1Matrix< } }; -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! L1-Norm Matrix (FP32, M=1) */ template <> @@ -129,8 +130,9 @@ struct Norm1Matrix { }; #endif // __SSE__ || (__ARM_NEON && __aarch64__) -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! L1-Norm Matrix (FP16, M=1) */ template <> diff --git a/src/ailego/math/norm1_matrix_fp16.cc b/src/ailego/math/norm1_matrix_fp16.cc index e75b3e0a8..1060196ad 100644 --- a/src/ailego/math/norm1_matrix_fp16.cc +++ b/src/ailego/math/norm1_matrix_fp16.cc @@ -67,12 +67,13 @@ static const __m512 ABS_MASK_FP32_AVX512 = //! Calculate sum of absolute (NEON) #define SA_FP16_NEON(v_m, v_sum) v_sum = vaddq_f16(vabsq_f16(v_m), v_sum); -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the L1-norm of vectors (FP16, M=1) void Norm1Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP16_1_NEON(m, dim, out, ) #else #if defined(__AVX512F__) diff --git a/src/ailego/math/norm1_matrix_fp32.cc b/src/ailego/math/norm1_matrix_fp32.cc index 2e7279118..b4194b2f7 100644 --- a/src/ailego/math/norm1_matrix_fp32.cc +++ b/src/ailego/math/norm1_matrix_fp32.cc @@ -56,11 +56,12 @@ namespace ailego { //! Calculate sum of absolute (NEON) #define SA_FP32_NEON(v_m, v_sum) v_sum = vaddq_f32(vabsq_f32(v_m), v_sum); -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the L1-norm of vectors (FP32, M=1) void Norm1Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP32_1_NEON(m, dim, out, ) #else #if defined(__AVX512F__) diff --git a/src/ailego/math/norm2_matrix.h b/src/ailego/math/norm2_matrix.h index 3c905147d..530653382 100644 --- a/src/ailego/math/norm2_matrix.h +++ b/src/ailego/math/norm2_matrix.h @@ -371,7 +371,8 @@ struct SquaredNorm2Matrix= 2>::type> { } }; -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! L2-Norm Matrix (FP32, M=1) */ template <> @@ -395,8 +396,9 @@ struct SquaredNorm2Matrix { }; #endif // __SSE__ || (__ARM_NEON && __aarch64__) -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! L2-Norm Matrix (FP16, M=1) */ template <> diff --git a/src/ailego/math/norm2_matrix_fp16.cc b/src/ailego/math/norm2_matrix_fp16.cc index 6bb8dd06c..4a617fdd3 100644 --- a/src/ailego/math/norm2_matrix_fp16.cc +++ b/src/ailego/math/norm2_matrix_fp16.cc @@ -52,12 +52,13 @@ namespace ailego { //! Calculate sum of squared (NEON) #define SS_FP16_NEON(v_m, v_sum) v_sum = vfmaq_f16(v_sum, v_m, v_m); -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the L2-norm of vectors (FP16, M=1) void Norm2Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP16_1_NEON(m, dim, out, std::sqrt) #else #if defined(__AVX512F__) @@ -73,7 +74,7 @@ void Norm2Matrix::Compute(const ValueType *m, size_t dim, //! Compute the L2-norm of vectors (FP16, M=1) void SquaredNorm2Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP16_1_NEON(m, dim, out, ) #else #if defined(__AVX512F__) diff --git a/src/ailego/math/norm2_matrix_fp32.cc b/src/ailego/math/norm2_matrix_fp32.cc index 8cc76c1f5..b1c1ad399 100644 --- a/src/ailego/math/norm2_matrix_fp32.cc +++ b/src/ailego/math/norm2_matrix_fp32.cc @@ -43,11 +43,12 @@ namespace ailego { //! Calculate sum of squared (NEON) #define SS_FP32_NEON(v_m, v_sum) v_sum = vfmaq_f32(v_sum, v_m, v_m); -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the L2-norm of vectors (FP32, M=1) void Norm2Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP32_1_NEON(m, dim, out, std::sqrt) #else #if defined(__AVX512F__) @@ -69,7 +70,7 @@ void Norm2Matrix::Compute(const ValueType *m, size_t dim, //! Compute the squared L2-norm of vectors (FP32, M=1) void SquaredNorm2Matrix::Compute(const ValueType *m, size_t dim, float *out) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NORM_FP32_1_NEON(m, dim, out, ) #else #if defined(__AVX512F__) diff --git a/src/ailego/math/normalizer.cc b/src/ailego/math/normalizer.cc index a31a9f350..0b33929dd 100644 --- a/src/ailego/math/normalizer.cc +++ b/src/ailego/math/normalizer.cc @@ -17,7 +17,8 @@ namespace zvec { namespace ailego { -#if (defined(__ARM_NEON) && defined(__aarch64__)) +#if ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) static inline void NormalizeNEON(float *arr, size_t dim, float norm) { float *last = arr + dim; float *last_aligned = arr + ((dim >> 3) << 3); @@ -392,10 +393,11 @@ static inline void NormalizeSSE(float *arr, size_t dim, float norm) { } #endif // __SSE__ -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the norm of vector void Normalizer::Compute(ValueType *arr, size_t dim, float norm) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NormalizeNEON(arr, dim, norm); #else #if defined(__AVX512F__) @@ -415,11 +417,12 @@ void Normalizer::Compute(ValueType *arr, size_t dim, float norm) { } #endif // __SSE__ || (__ARM_NEON && __aarch64__) -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) //! Compute the norm of vector void Normalizer::Compute(ValueType *arr, size_t dim, float norm) { -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) NormalizeNEON(reinterpret_cast(arr), dim, norm); #else #if defined(__AVX512F__) diff --git a/src/ailego/math/normalizer.h b/src/ailego/math/normalizer.h index 2c191b0e7..52300284a 100644 --- a/src/ailego/math/normalizer.h +++ b/src/ailego/math/normalizer.h @@ -51,7 +51,8 @@ struct Normalizer { } }; -#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__)) +#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! Normalizer (FP32) */ template <> @@ -80,8 +81,9 @@ struct Normalizer { }; #endif // __SSE__ || (__ARM_NEON && __aarch64__) -#if (defined(__F16C__) && defined(__AVX__)) || \ - (defined(__ARM_NEON) && defined(__aarch64__)) +#if (defined(__F16C__) && defined(__AVX__)) || \ + ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) /*! Normalizer (FP16) */ template <> diff --git a/src/ailego/utility/bitset_helper.cc b/src/ailego/utility/bitset_helper.cc index 19be34847..47d5c6238 100644 --- a/src/ailego/utility/bitset_helper.cc +++ b/src/ailego/utility/bitset_helper.cc @@ -23,7 +23,7 @@ #define bitset_popcount64 _mm_popcnt_u64 #endif // !__SSE4_2__ -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) static inline void bitset_and(uint32_t *lhs, const uint32_t *rhs, size_t size) { uint32_t *last = lhs + size; uint32_t *last_aligned = lhs + ((size >> 2) << 2); @@ -1480,7 +1480,8 @@ static inline bool bitset_test_none(const uint32_t *lhs, size_t size) { #endif // AILEGO_M64 #endif // __AVX2__ -#if (defined(__ARM_NEON) && defined(__aarch64__)) +#if ((defined(__ARM_NEON) || defined(_M_ARM64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) static inline size_t bitset_cardinality(const uint32_t *lhs, size_t size) { const uint32_t *last = lhs + size; const uint32_t *last_aligned = lhs + ((size >> 2) << 2); diff --git a/src/ailego/utility/float_helper.cc b/src/ailego/utility/float_helper.cc index 6e33fedd7..a14c49e76 100644 --- a/src/ailego/utility/float_helper.cc +++ b/src/ailego/utility/float_helper.cc @@ -21,7 +21,10 @@ // #define float32(x) _cvtsh_ss(x) // #endif // __F16C__ && __AVX__ -#if defined(__aarch64__) +// MSVC ARM64 lacks the GCC/Clang `__fp16` extension type, so keep this +// path gated on `__aarch64__` (predefined only by GCC/Clang on AArch64). +// MSVC ARM64 falls through to the F16C/scalar paths below. +#if defined(__aarch64__) && !defined(_MSC_VER) static inline float float32(uint16_t val) { __fp16 *p = reinterpret_cast<__fp16 *>(&val); return *p; diff --git a/src/ailego/version.i b/src/ailego/version.i index c1b14be2e..f8e78d443 100644 --- a/src/ailego/version.i +++ b/src/ailego/version.i @@ -225,7 +225,7 @@ #define AILEGO_VERSION_OPENMP "" #endif -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) #define AILEGO_VERSION_SIMD " Arm Neon Instruction Set\n" #elif defined(__AVX512FP16__) #define AILEGO_VERSION_SIMD " AVX-512FP16 Instruction Set\n" diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h index f7f3d77ed..1bff8b08e 100644 --- a/src/include/zvec/ailego/buffer/concurrentqueue.h +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -129,8 +129,9 @@ static inline thread_id_t thread_id() { } } // namespace details } // namespace moodycamel -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ - (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ +#elif defined(__arm__) || defined(_M_ARM) || \ + (defined(__aarch64__) || defined(_M_ARM64)) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ defined(MOODYCAMEL_NO_THREAD_LOCAL) namespace moodycamel { namespace details { @@ -293,13 +294,14 @@ inline thread_id_t thread_id() { // support thread_local either. Finally, iOS/ARM doesn't have support for it // either, and g++/ARM allows it to compile but it's unconfirmed to actually // work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ - (!defined(__MINGW32__) && !defined(__MINGW64__) || \ - !defined(__WINPTHREADS_VERSION)) && \ - (!defined(__GNUC__) || __GNUC__ > 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ - (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ - !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !(defined(__aarch64__) || defined(_M_ARM64)) && \ + !defined(__MVS__) // Assume `thread_local` is fully supported in all other C++11 // compilers/platforms #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; diff --git a/src/include/zvec/ailego/internal/platform.h b/src/include/zvec/ailego/internal/platform.h index ccd33971e..614c0a206 100644 --- a/src/include/zvec/ailego/internal/platform.h +++ b/src/include/zvec/ailego/internal/platform.h @@ -30,13 +30,16 @@ #if defined(_MSC_VER) #include +#if defined(_M_ARM64) +#include +#endif #else #include #include #if defined(__x86_64__) || defined(__i386) #include #endif -#if defined(__ARM_NEON) +#if (defined(__ARM_NEON) || defined(_M_ARM64)) #include #endif #if defined(__ARM_FEATURE_CRC32) @@ -110,7 +113,8 @@ extern "C" { #endif #if defined(__GNUC__) -#if defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__) +#if defined(__x86_64__) || (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ppc64__) #define AILEGO_M64 #else #define AILEGO_M32 @@ -228,7 +232,7 @@ static inline int ailego_clz64(uint64_t x) { #define ailego_popcount ailego_popcount32 #endif // AILEGO_M64 -#if defined(__arm__) || defined(__aarch64__) +#if defined(__arm__) || (defined(__aarch64__) || defined(_M_ARM64)) // ARMv7 Architecture Reference Manual (for YIELD) // ARM Compiler toolchain Compiler Reference (for __yield() instrinsic) #if defined(__CC_ARM) @@ -273,11 +277,12 @@ static inline int ailego_clz64(uint64_t x) { #define ailego_malloc(SIZE) ailego_aligned_malloc((SIZE), 32) #elif defined(__SSE__) #define ailego_malloc(SIZE) ailego_aligned_malloc((SIZE), 16) -#elif defined(__ARM_NEON) +#elif (defined(__ARM_NEON) || defined(_M_ARM64)) #define ailego_malloc(SIZE) ailego_aligned_malloc((SIZE), 16) #endif #endif // !ailego_malloc -#if (defined(__SSE__) || defined(__ARM_NEON)) && !defined(ailego_free) +#if (defined(__SSE__) || (defined(__ARM_NEON) || defined(_M_ARM64))) && \ + !defined(ailego_free) #define ailego_free ailego_aligned_free #endif #endif // !__SANITIZE_ADDRESS__ diff --git a/src/include/zvec/ailego/utility/float_helper.h b/src/include/zvec/ailego/utility/float_helper.h index 5dc2fe69a..d32eb3404 100644 --- a/src/include/zvec/ailego/utility/float_helper.h +++ b/src/include/zvec/ailego/utility/float_helper.h @@ -52,7 +52,10 @@ struct FloatHelper { } }; -#if !defined(__aarch64__) +// The `#else` branch below stores `Float16::value_` as `__fp16` — a GCC/Clang +// extension type that MSVC does not provide even on ARM64. Keep the uint16_t +// storage path for MSVC (including MSVC ARM64) so the wrapper compiles. +#if !defined(__aarch64__) || defined(_MSC_VER) /*! Half-Precision Floating Point */ class Float16 { diff --git a/thirdparty/arrow/CMakeLists.txt b/thirdparty/arrow/CMakeLists.txt index 0c4603b65..7b8b2a2b0 100644 --- a/thirdparty/arrow/CMakeLists.txt +++ b/thirdparty/arrow/CMakeLists.txt @@ -12,6 +12,10 @@ endif() if(MSVC) set(ARROW_WIN_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/arrow.windows.patch) apply_patch_once("arrow_windows_crt_fix" "${ARROW_SRC_DIR}" "${ARROW_WIN_PATCH}") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ARM64|arm64|aarch64)$") + set(ARROW_WIN_ARM64_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/arrow.windows-arm64.patch) + apply_patch_once("arrow_windows_arm64_fix" "${ARROW_SRC_DIR}" "${ARROW_WIN_ARM64_PATCH}") + endif() endif() include(ExternalProject) @@ -105,6 +109,14 @@ elseif (MSVC) -DARROW_USE_STATIC_CRT=${ZVEC_USE_STATIC_CRT} "-DCMAKE_MSVC_RUNTIME_LIBRARY=${_ARROW_MSVC_RUNTIME}" ) + # Arrow 21.0's xsimd-13 does not provide make_sized_batch_t for MSVC ARM64, + # so disable SIMD on that target. x86/x64 MSVC keeps the default SSE4.2 path. + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ARM64|arm64|aarch64)$") + list(APPEND ARROW_EXTRA_CMAKE_ARGS + -DARROW_SIMD_LEVEL=NONE + -DARROW_RUNTIME_SIMD_LEVEL=NONE + ) + endif() ExternalProject_Add( ARROW.BUILD PREFIX arrow SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/apache-arrow-21.0.0 diff --git a/thirdparty/arrow/arrow.windows-arm64.patch b/thirdparty/arrow/arrow.windows-arm64.patch new file mode 100644 index 000000000..f815a0c49 --- /dev/null +++ b/thirdparty/arrow/arrow.windows-arm64.patch @@ -0,0 +1,35 @@ +diff --git a/cpp/src/arrow/vendored/pcg/pcg_uint128.hpp b/cpp/src/arrow/vendored/pcg/pcg_uint128.hpp +index 0181e69e4e..349e3b6bfa 100644 +--- a/cpp/src/arrow/vendored/pcg/pcg_uint128.hpp ++++ b/cpp/src/arrow/vendored/pcg/pcg_uint128.hpp +@@ -67,7 +67,8 @@ + #define PCG_LITTLE_ENDIAN 1 + #elif __BIG_ENDIAN__ || _BIG_ENDIAN + #define PCG_LITTLE_ENDIAN 0 +- #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 ++ #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 \ ++ || _M_ARM64 || _M_ARM || __aarch64__ || __arm__ + #define PCG_LITTLE_ENDIAN 1 + #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \ + || __m68k__ || __mc68000__ +@@ -733,7 +734,7 @@ uint_x4 operator*(const uint_x4& a, + } + + #if PCG_64BIT_SPECIALIZATIONS +-#if defined(_MSC_VER) ++#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_ARM) + #pragma intrinsic(_umul128) + #endif + +@@ -742,7 +743,10 @@ template + uint_x4 operator*(const uint_x4& a, + const uint_x4& b) + { +-#if defined(_MSC_VER) ++#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM)) ++ uint64_t lo = a.d.v01 * b.d.v01; ++ uint64_t hi = __umulh(a.d.v01, b.d.v01); ++#elif defined(_MSC_VER) + uint64_t hi; + uint64_t lo = _umul128(a.d.v01, b.d.v01, &hi); + #else