Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .github/workflows/05-windows-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,16 @@ jobs:
matrix:
include:
- platform: windows-2022
msvc_arch: x64
python_version: '3.10'
- platform: windows-2025
msvc_arch: x64
python_version: '3.10'
# Windows ARM64: Python 3.10 has no official ARM64 installer;
# 3.11 is the first CPython release with a Windows-on-ARM build.
- platform: windows-11-arm
msvc_arch: arm64
python_version: '3.11'

steps:
- name: Show env info
Expand All @@ -41,14 +50,14 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.10'
python-version: ${{ matrix.python_version }}
cache: 'pip'
cache-dependency-path: 'pyproject.toml'

- name: Set up MSVC environment
uses: ilammy/msvc-dev-cmd@v1
with:
arch: x64
arch: ${{ matrix.msvc_arch }}

- name: Set up environment variables
run: |
Expand Down
50 changes: 28 additions & 22 deletions src/ailego/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,29 +91,35 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
)
endforeach()
elseif (HOST_ARCH MATCHES "^(arm|arm64)$")
if(MSVC)
return()
endif()
set(MATH_MARCH_FLAG_NEON "-march=armv8-a")

file(GLOB_RECURSE MATH_FILES_NEON
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c
)
if(NOT MSVC)
set(MATH_MARCH_FLAG_NEON "-march=armv8-a")

file(GLOB_RECURSE MATH_FILES_NEON
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c
${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc
${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc
${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c
)

foreach(MATH_FILE ${MATH_FILES_NEON})
set_source_files_properties(
${MATH_FILE}
PROPERTIES
COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}"
)
endforeach()
foreach(MATH_FILE ${MATH_FILES_NEON})
set_source_files_properties(
${MATH_FILE}
PROPERTIES
COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}"
)
endforeach()
else()
# MSVC on ARM64: NEON is part of the ARMv8 baseline and is always enabled,
# so no `-march` flag is required (MSVC does not accept GCC-style
# `-march=` options anyway). The NEON math kernels are still compiled via
# the ALL_SRCS glob above; their `#if defined(__ARM_NEON)` guards also
# accept `_M_ARM64`, so the kernel bodies actually emit code under MSVC.
endif()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to consider -march optimizations for MSVC on ARM64 here?

endif()
endif()

Expand Down
12 changes: 7 additions & 5 deletions src/ailego/internal/cpu_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
#include "cpu_features.h"
#include <cstddef>

#if defined(_MSC_VER)
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
#elif !defined(__ARM_ARCH)
#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \
!(defined(__aarch64__) || defined(_M_ARM64))
#include <cpuid.h>
#endif

Expand All @@ -34,7 +35,7 @@ namespace internal {

CpuFeatures::CpuFlags CpuFeatures::flags_;

#if defined(_MSC_VER)
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
CpuFeatures::CpuFlags::CpuFlags(void)
: L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) {
int l1[4] = {0, 0, 0, 0};
Expand All @@ -48,7 +49,8 @@ CpuFeatures::CpuFlags::CpuFlags(void)
L7_ECX = l7[2];
L7_EDX = l7[3];
}
#elif !defined(__ARM_ARCH)
#elif !defined(_MSC_VER) && !defined(__ARM_ARCH) && \
!(defined(__aarch64__) || defined(_M_ARM64))
CpuFeatures::CpuFlags::CpuFlags(void)
: L1_ECX(0), L1_EDX(0), L7_EBX(0), L7_ECX(0), L7_EDX(0) {
uint32_t eax, ebx, ecx, edx;
Expand Down Expand Up @@ -336,7 +338,7 @@ bool CpuFeatures::HYPERVISOR(void) {

const char *CpuFeatures::Intrinsics(void) {
return ""
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
"Neon"
#if defined(__ARM_FEATURE_CRC32)
"+CRC"
Expand Down
4 changes: 2 additions & 2 deletions src/ailego/math/distance_matrix_accum_fp32.i
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
_mm512_castps_si512(b)))
#endif // __AVX512DQ__

#if defined(__ARM_NEON) && !defined(__aarch64__)
#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64))
static inline float32_t vaddvq_f32(float32x4_t v) {
float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v));
return vget_lane_f32(vpadd_f32(s, s), 0);
Expand All @@ -42,7 +42,7 @@ static inline int32_t vaddvq_s32(int32x4_t v) {
}
#endif //__ARM_NEON && !__aarch64__

#if defined(__aarch64__)
#if (defined(__aarch64__) || defined(_M_ARM64))
#define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A64
#else
#define ACCUM_FP32_2X1_NEON ACCUM_FP32_2X1_NEON_A32
Expand Down
2 changes: 1 addition & 1 deletion src/ailego/math/distance_matrix_fp32.i
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
_mm256_insertf128_ps(_mm256_castps128_ps256(b), (a), 1)
#endif // __AVX__

#if defined(__ARM_NEON) && !defined(__aarch64__)
#if (defined(__ARM_NEON) || defined(_M_ARM64)) && !(defined(__aarch64__) || defined(_M_ARM64))
#define vdupq_laneq_f32(a, b) vdupq_n_f32(vgetq_lane_f32(a, b))
#endif // __ARM_NEON && !__aarch64__

Expand Down
4 changes: 2 additions & 2 deletions src/ailego/math/euclidean_distance_matrix_fp16_dispatch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs,
size_t size);
#endif
Expand Down Expand Up @@ -46,7 +46,7 @@ void SquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(const ValueType *m,
const ValueType *q,
size_t dim,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = SquaredEuclideanDistanceFp16NEON(m, q, dim);
#else
#if defined(__AVX512FP16__)
Expand Down
2 changes: 1 addition & 1 deletion src/ailego/math/euclidean_distance_matrix_fp16_neon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float SquaredEuclideanDistanceFp16NEON(const Float16 *lhs, const Float16 *rhs,
size_t size) {
float score{0.0f};
Expand Down
4 changes: 2 additions & 2 deletions src/ailego/math/euclidean_distance_matrix_fp32_dispatch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs,
size_t size, float *out);
#endif
Expand Down Expand Up @@ -49,7 +49,7 @@ void SquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(const ValueType *m,
const ValueType *q,
size_t dim,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
SquaredEuclideanDistanceFp32NEON(m, q, dim, out);
#else
#if defined(__AVX512F__)
Expand Down
2 changes: 1 addition & 1 deletion src/ailego/math/euclidean_distance_matrix_fp32_neon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
//! Squared Euclidean Distance
void SquaredEuclideanDistanceFp32NEON(const float *lhs, const float *rhs,
size_t size, float *out) {
Expand Down
6 changes: 3 additions & 3 deletions src/ailego/math/inner_product_matrix_fp16_dispatch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace ailego {
//--------------------------------------------------
// Dense
//--------------------------------------------------
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs, size_t size);
float MinusInnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs,
size_t size);
Expand Down Expand Up @@ -56,7 +56,7 @@ float MinusInnerProductFp16Scalar(const Float16 *lhs, const Float16 *rhs,
void InnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
const ValueType *q, size_t dim,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = InnerProductFp16NEON(m, q, dim);
#else
#if defined(__AVX512FP16__)
Expand Down Expand Up @@ -86,7 +86,7 @@ void InnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
void MinusInnerProductMatrix<Float16, 1, 1>::Compute(const ValueType *m,
const ValueType *q,
size_t dim, float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = MinusInnerProductFp16NEON(m, q, dim);
#else
#if defined(__AVX512FP16__)
Expand Down
2 changes: 1 addition & 1 deletion src/ailego/math/inner_product_matrix_fp16_neon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float InnerProductFp16NEON(const Float16 *lhs, const Float16 *rhs,
size_t size) {
float score;
Expand Down
6 changes: 3 additions & 3 deletions src/ailego/math/inner_product_matrix_fp32_dispatch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace ailego {
//--------------------------------------------------
// Dense
//--------------------------------------------------
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size);
float MinusInnerProductFp32NEON(const float *lhs, const float *rhs,
size_t size);
Expand Down Expand Up @@ -49,7 +49,7 @@ float MinusInnerProductFp32Scalar(const float *lhs, const float *rhs,
//! Compute the distance between matrix and query (FP32, M=1, N=1)
void InnerProductMatrix<float, 1, 1>::Compute(const float *m, const float *q,
size_t dim, float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = InnerProductFp32NEON(m, q, dim);
#else
#if defined(__AVX512F__)
Expand Down Expand Up @@ -80,7 +80,7 @@ void InnerProductMatrix<float, 1, 1>::Compute(const float *m, const float *q,
void MinusInnerProductMatrix<float, 1, 1>::Compute(const float *m,
const float *q, size_t dim,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = MinusInnerProductFp32NEON(m, q, dim);
#else
#if defined(__AVX512F__)
Expand Down
2 changes: 1 addition & 1 deletion src/ailego/math/inner_product_matrix_fp32_neon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace ailego {
//--------------------------------------------------
// Dense
//--------------------------------------------------
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float InnerProductFp32NEON(const float *lhs, const float *rhs, size_t size) {
const float *last = lhs + size;
const float *last_aligned = lhs + ((size >> 3) << 3);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON(
const Float16 *lhs, const Float16 *rhs, size_t size, size_t m, float e2);
float MipsEuclideanDistanceSphericalInjectionFp16NEON(const Float16 *lhs,
Expand Down Expand Up @@ -51,7 +51,7 @@ float MipsEuclideanDistanceSphericalInjectionFp16Scalar(
//! Compute the distance between matrix and query by SphericalInjection
void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
const ValueType *p, const ValueType *q, size_t dim, float e2, float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out = MipsEuclideanDistanceSphericalInjectionFp16NEON(p, q, dim, e2);
#else
#if defined(__AVX512F__)
Expand All @@ -75,7 +75,7 @@ void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
void MipsSquaredEuclideanDistanceMatrix<Float16, 1, 1>::Compute(
const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
*out =
MipsEuclideanDistanceRepeatedQuadraticInjectionFp16NEON(p, q, dim, m, e2);
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON) && defined(__aarch64__)
#if (defined(__ARM_NEON) || defined(_M_ARM64)) && \
(defined(__aarch64__) || defined(_M_ARM64))
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
//! Compute the Inner Product between p and q, and each Squared L2-Norm value
float InnerProductAndSquaredNormFp16NEON(const Float16 *lhs, const Float16 *rhs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs,
size_t size, float *sql, float *sqr);
#endif
Expand Down Expand Up @@ -98,7 +98,7 @@ void MipsSquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(
void MipsSquaredEuclideanDistanceMatrix<float, 1, 1>::Compute(
const ValueType *p, const ValueType *q, size_t dim, size_t m, float e2,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
float u2{0.0f};
float v2{0.0f};
float sum = InnerProductAndSquaredNormFp32NEON(p, q, dim, &u2, &v2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace zvec {
namespace ailego {

#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
//! Compute the Inner Product between p and q, and each Squared L2-Norm value
float InnerProductAndSquaredNormFp32NEON(const float *lhs, const float *rhs,
size_t size, float *sql, float *sqr) {
Expand Down
8 changes: 5 additions & 3 deletions src/ailego/math/norm1_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ struct Norm1Matrix<
}
};

#if defined(__SSE__) || (defined(__ARM_NEON) && defined(__aarch64__))
#if defined(__SSE__) || ((defined(__ARM_NEON) || defined(_M_ARM64)) && \
(defined(__aarch64__) || defined(_M_ARM64)))
/*! L1-Norm Matrix (FP32, M=1)
*/
template <>
Expand All @@ -129,8 +130,9 @@ struct Norm1Matrix<float, 1> {
};
#endif // __SSE__ || (__ARM_NEON && __aarch64__)

#if (defined(__F16C__) && defined(__AVX__)) || \
(defined(__ARM_NEON) && defined(__aarch64__))
#if (defined(__F16C__) && defined(__AVX__)) || \
((defined(__ARM_NEON) || defined(_M_ARM64)) && \
(defined(__aarch64__) || defined(_M_ARM64)))
/*! L1-Norm Matrix (FP16, M=1)
*/
template <>
Expand Down
7 changes: 4 additions & 3 deletions src/ailego/math/norm1_matrix_fp16.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,13 @@ static const __m512 ABS_MASK_FP32_AVX512 =
//! Calculate sum of absolute values (NEON)
#define SA_FP16_NEON(v_m, v_sum) v_sum = vaddq_f16(vabsq_f16(v_m), v_sum);

#if (defined(__F16C__) && defined(__AVX__)) || \
(defined(__ARM_NEON) && defined(__aarch64__))
#if (defined(__F16C__) && defined(__AVX__)) || \
((defined(__ARM_NEON) || defined(_M_ARM64)) && \
(defined(__aarch64__) || defined(_M_ARM64)))
//! Compute the L1-norm of vectors (FP16, M=1)
void Norm1Matrix<Float16, 1>::Compute(const ValueType *m, size_t dim,
float *out) {
#if defined(__ARM_NEON)
#if (defined(__ARM_NEON) || defined(_M_ARM64))
NORM_FP16_1_NEON(m, dim, out, )
#else
#if defined(__AVX512F__)
Expand Down
Loading