From 229c0fe4f92fbdbfb3c53e8483cf365cc1ff9901 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 14 Jan 2026 15:06:57 +0400 Subject: [PATCH 01/16] x86/sse2: Fix typo in LSX implementation for simde_mm_cvttpd_epi32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __lsx_vftintrz_w_d accepts two __m128d arguments, so it should be called with zero_f64, which is the variable that is actually declared. This fixes the following compilation error that I get when compiling current simde master for loongarch64-linux-gnu with gcc 14.3.1 and `-Ofast -mlsx -mlasx` in CFLAGS: ../test/x86/avx512/../../../simde/x86/sse2.h: In function ‘simde__m128i simde_mm_cvttpd_epi32(simde__m128d)’: ../test/x86/avx512/../../../simde/x86/sse2.h:3736:39: error: ‘zero_i64’ was not declared in this scope; did you mean ‘zero_f64’? 3736 | r_.lsx_i64 = __lsx_vftintrz_w_d(zero_i64, simde__m128d_to_private(a).lsx_f64); | ^~~~~~~~ | zero_f64 Signed-off-by: Ivan A. Melnikov --- simde/x86/sse2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 8f646cab5..dafc3f840 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -3733,7 +3733,7 @@ simde_mm_cvttpd_epi32 (simde__m128d a) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) const v2f64 zero_f64 = {-0.0f, -0.0f}; - r_.lsx_i64 = __lsx_vftintrz_w_d(zero_i64, simde__m128d_to_private(a).lsx_f64); + r_.lsx_i64 = __lsx_vftintrz_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); #else r_.m64[0] = simde_mm_cvttpd_pi32(a); r_.m64[1] = simde_mm_setzero_si64(); From 472648829ddf8190095becd0567581ae8fc01126 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 14 Jan 2026 15:20:00 +0400 Subject: [PATCH 02/16] x86/sse2: Fix fast math codepath in simde_mm_cvttps_epi32 Similarly to what other architectures do, the result of __lsx_vftintrz_w_s should be used directly when both SIMDE_FAST_CONVERSION_RANGE and SIMDE_FAST_NANS are defined, not just stored to a temporary and lost.
Signed-off-by: Ivan A. Melnikov --- simde/x86/sse2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index dafc3f840..5e9fe56b3 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -3793,7 +3793,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); #endif #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __m128i temp = __lsx_vftintrz_w_s(a_.lsx_f32); + r_.lsx_i64 = __lsx_vftintrz_w_s(a_.lsx_f32); #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) simde_float32 f1 = 2147483648.0f; @@ -3809,7 +3809,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) { __m128i valid_input = __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32); #endif - r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), temp, valid_input); + r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), r_.lsx_i64, valid_input); #endif #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); From 1bfb7e82b3752a4b679eec9547f02fc7bb966f6e Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" Date: Thu, 15 Jan 2026 11:26:01 +0100 Subject: [PATCH 03/16] gh-actions: test loongarch64 with -Ofast --- .github/workflows/ci.yml | 6 ++++++ .../loongarch64-gcc-14-fastmath-ccache.cross | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 935ffbb64..a1ea20c36 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -487,6 +487,12 @@ jobs: arch_gnu: loongarch64 arch_deb: loong64 distro: ubuntu-24.04 + - extra: -fastmath + version: 14 + cross: loongarch64 + arch_gnu: loongarch64 + arch_deb: loong64 + distro: ubuntu-24.04 # - version: 14 # cross: mips64el # arch_gnu: mips64el diff --git a/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross b/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross new file mode 100644 index 000000000..1892193d6 --- /dev/null +++ b/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross @@ -0,0 +1,20 @@ +[binaries] +c = ['ccache', 'loongarch64-linux-gnu-gcc-14'] +cpp = ['ccache', 'loongarch64-linux-gnu-g++-14'] +ar = 'loongarch64-linux-gnu-gcc-ar-14' +strip = 'loongarch64-linux-gnu-strip' +objcopy = 'loongarch64-linux-gnu-objcopy' +ld = 'loongarch64-linux-gnu-ld' +exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464'] + +[properties] +c_args = ['-march=loongarch64', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-Ofast'] +cpp_args = ['-march=loongarch64', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-Ofast'] +#c_args = ['-march=la464', '-Wextra', '-Werror'] +#cpp_args = ['-march=la464', '-Wextra', '-Werror'] + +[host_machine] +system = 'linux' +cpu_family = 'loongarch64' +cpu = 'loongarch64' +endian = 'little' From 4985afacc84bd932dd1a2b16b506c1479acfe4ee Mon Sep 17 00:00:00 2001 From: "Ivan A. 
Melnikov" Date: Tue, 20 Jan 2026 19:13:52 +0400 Subject: [PATCH 04/16] x86/sse2 loong64: Fix fast math codepath for simde_mm_cvtps_epi32 __lsx_vftintrne_w_s actually returns a vector of 4 ints, but lsxintrin.h from gcc 14 and 15 declares it as returning a vector of 2 longs. We use HEDLEY_REINTERPRET_CAST to work around this. See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123759 --- simde/x86/sse2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 5e9fe56b3..d97e1a192 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -3313,7 +3313,7 @@ simde_mm_cvtps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) a_ = simde__m128_to_private(a); - r_.lsx_i32 = __lsx_vftintrne_w_s(a_.lsx_f32); + r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vftintrne_w_s(a_.lsx_f32)); #else a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE From 48482863cf11def164beacb918630d1715127cb2 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Tue, 20 Jan 2026 19:18:04 +0400 Subject: [PATCH 05/16] x86/sse: Fix fast math LSX codepath for simde_mm_cmpunord_ps __lsx_vfcmp_cun_s actually returns a vector of 4 ints, but lsxintrin.h from GCC 14 and 15 declares it as returning two longs. Use HEDLEY_REINTERPRET_CAST to work around this and assign the correct member of simde__m128_private.
See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123759 --- simde/x86/sse.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simde/x86/sse.h b/simde/x86/sse.h index 1258ab30d..8db1ab686 100644 --- a/simde/x86/sse.h +++ b/simde/x86/sse.h @@ -1825,7 +1825,8 @@ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32)); + // TODO: change when https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123759 is resolved #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { From b115db965e41ba0ca9640d4ec284a3448201b4be Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 22 Jan 2026 10:15:02 +0800 Subject: [PATCH 06/16] x86/sse2: Fix typo about SIMDE_LOONGARCH_LSX_NATIVE Change from SIMD_LOONGARCH_LSX_NATIVE to SIMDE_LOONGARCH_LSX_NATIVE. 
--- simde/x86/sse2.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index d97e1a192..5522efa50 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -568,7 +568,7 @@ simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.neon_f64 = vld1q_f64(data); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5705,8 +5705,6 @@ simde_mm_loadu_si16 (void const* mem_addr) { HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si16(mem_addr); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); #else int16_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5761,8 +5759,10 @@ simde_mm_loadu_si32 (void const* mem_addr) { simde__m128i_private r_; r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); return simde__m128i_from_private(r_); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i_private r_; + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12); + return simde__m128i_from_private(r_); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5783,7 +5783,7 @@ simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_TO_16 simde__m64 data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5811,7 +5811,7 @@ simde_mm_set_epi64x (int64_t e1, int64_t e0) { r_.neon_i64 = vld1q_s64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) 
r_.wasm_v128 = wasm_i64x2_make(e0, e1); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v2i64) int64_t data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5834,8 +5834,10 @@ simde_mm_loadu_si64 (void const* mem_addr) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) return _mm_loadu_si64(mem_addr); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i_private r_; + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_d(mem_addr, 0), 8); + return simde__m128i_from_private(r_); #else int64_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5870,7 +5872,7 @@ simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, r_.neon_u8 = vld1q_u8(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v16u8) uint8_t data[16] = { e0, e1, e2, e3, e4, e5, e6, e7, @@ -5904,7 +5906,7 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, r_.neon_u16 = vld1q_u16(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v8u16) uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5930,7 +5932,7 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { r_.neon_u32 = vld1q_u32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v4u32) uint32_t data[4] = {e0, e1, e2, e3}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5957,7 
+5959,7 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { r_.neon_u64 = vld1q_u64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u64x2_make(e0, e1); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v2u64) uint64_t data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5978,7 +5980,7 @@ simde_mm_set_sd (simde_float64 a) { return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) return (__m128d)__lsx_vinsgr2vr_d(__lsx_vldrepl_d(&a, 0), 0, 1); #else return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); From 8bea4b3e73cf2a467a0b5d4c908d1796edb59647 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Wed, 21 Jan 2026 15:47:04 +0100 Subject: [PATCH 07/16] gh-actions gcc-qemu: only add extra repository for gcc-15 on Ubuntu 24.04 To avoid binutils mismatch --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1ea20c36..667a47aab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -506,7 +506,7 @@ jobs: submodules: recursive - name: CPU Information run: cat /proc/cpuinfo - - if: ${{ matrix.distro == 'ubuntu-24.04' }} + - if: ${{ matrix.distro == 'ubuntu-24.04' && ( matrix.version == '15' )}} run: sudo add-apt-repository ppa:daawesomep/toolchain-backports-noble - name: Install APT Dependencies From b97ec457f3e3b75a8600a682937fe943fd58e9d1 Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" Date: Wed, 21 Jan 2026 15:49:49 +0100 Subject: [PATCH 08/16] DO NOT MERGE: *-qemu: run emulated tests as well --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 667a47aab..2dd1cd269 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -533,7 +533,7 @@ jobs: - name: Test run: | # shellcheck disable=SC2046 - meson test -C build --print-errorlogs --print-errorlogs $(meson test -C build --list | grep -v emul) + meson test -C build --print-errorlogs --print-errorlogs # $(meson test -C build --list | grep -v emul) clang17-qemu-rvv: strategy: @@ -734,7 +734,7 @@ jobs: - name: Test run: | # shellcheck disable=SC2046 - meson test -C build --print-errorlogs --print-errorlogs $(meson test -C build --list | grep -v emul) + meson test -C build --print-errorlogs --print-errorlogs # $(meson test -C build --list | grep -v emul) clang: strategy: From c06c232e4f6cc8e75fa52851bbfc03af85697793 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 21 Jan 2026 20:49:43 +0400 Subject: [PATCH 09/16] test: Avoid -ffinite-math-only on floating point comparisons With `-ffinite-math-only` (implied by `-ffast-math` and -Ofast), GCC considers all comparisons against infinities to be false and can optimize them away. This breaks several tests that rely on infinities being compared correctly. To avoid this, we add a GCC-specific optimize attribute that disables the `-ffinite-math-only` optimization for the floating point comparisons used in assertions. Signed-off-by: Ivan A. Melnikov --- test/test.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test.h b/test/test.h index 3e1b3de78..287171bfa 100644 --- a/test/test.h +++ b/test/test.h @@ -126,6 +126,10 @@ simde_test_debug_printf_(const char* format, ...)
{ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ +#if defined(SIMDE_FAST_MATH) && defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,4,0) +__attribute__((optimize("-fno-finite-math-only"))) +__attribute__((noinline)) +#endif static int simde_test_equal_f32(simde_float32 a, simde_float32 b, simde_float32 slop) { if (simde_math_isnan(a)) { @@ -156,6 +160,10 @@ simde_test_equal_f16(simde_float16 a, simde_float16 b, simde_float16 slop) { return simde_test_equal_f32(af, bf, slopf); } +#if defined(SIMDE_FAST_MATH) && defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,4,0) +__attribute__((optimize("-fno-finite-math-only"))) +__attribute__((noinline)) +#endif static int simde_test_equal_f64(simde_float64 a, simde_float64 b, simde_float64 slop) { if (simde_math_isnan(a)) { From e14f23f78209ddaa56d071df7bfc97dd0eb7c7fc Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 14 Jan 2026 13:22:11 +0400 Subject: [PATCH 10/16] x86/avx: Workaround for GCC ICE on loongarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This works around two similar instances of ICE of GCC 14: test/x86/avx512/range.cpp: In function ‘int test_simde_mm256_maskz_range_ps()’: test/x86/avx512/range.cpp:702:1: error: unrecognizable insn: 702 | } | ^ (insn 191 190 192 2 (set (reg:V8SF 446 [ r_$f32_514 ]) (vec_merge:V8SF (vec_duplicate:V8SF (const_double:SF 0.0 [0x0.0p+0])) (reg:V8SF 446 [ r_$f32_514 ]) (const_int 1 [0x1]))) "../test/x86/avx512/../../../simde/x86/avx.h":1041:17 -1 (nil)) [...] The similar workaround is already present in simde_mm256_set_ps. 
Link: https://gcc.gnu.org/pipermail/gcc-patches/2026-January/706166.html Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117575 --- simde/x86/avx.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/simde/x86/avx.h b/simde/x86/avx.h index c8366c471..93848dfc8 100644 --- a/simde/x86/avx.h +++ b/simde/x86/avx.h @@ -1028,15 +1028,15 @@ simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_ simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); + #elif defined(SIMDE_ARCH_LOONGARCH) + simde__m256 tmp_ = { e0, e1, e2, e3, e4, e5, e6, e7 }; + return tmp_; #else simde__m256_private r_; #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0); r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - SIMDE_ALIGN_LIKE_32(__m256) simde_float32 data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.i256 = __lasx_xvld(data, 0); #else r_.f32[0] = e0; r_.f32[1] = e1; @@ -1062,6 +1062,9 @@ simde__m256d simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_pd(e3, e2, e1, e0); + #elif defined(SIMDE_ARCH_LOONGARCH) + simde__m256d tmp_ = { e0, e1, e2, e3 }; + return tmp_; #else simde__m256d_private r_; From ba08f28dc8c45dd756b7834026a529d6836cb378 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Thu, 22 Jan 2026 19:08:09 +0400 Subject: [PATCH 11/16] x86/avx, x86/sse: workaround for -Werror=maybe-uninitialized on loongarch64 Avoid some usages of __lsx_vst and __lasx_xvst, as they may cause maybe-uninitialized warnings to be triggered: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123766 The optimizing compiler still generates optimal vectorized code for fixed-size __builtin_memcpy, so no performance loss is expected. 
--- simde/simde-common.h | 3 +++ simde/x86/avx.h | 18 +++++++++--------- simde/x86/sse2.h | 6 +++--- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/simde/simde-common.h b/simde/simde-common.h index 31889b492..369a16444 100644 --- a/simde/simde-common.h +++ b/simde/simde-common.h @@ -1086,6 +1086,9 @@ HEDLEY_DIAGNOSTIC_POP || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) # define SIMDE_BUG_GCC_105339 # endif +# if defined(SIMDE_ARCH_LOONGARCH) +# define SIMDE_BUG_GCC_123766 +# endif # elif defined(__clang__) # if defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 diff --git a/simde/x86/avx.h b/simde/x86/avx.h index 93848dfc8..276195e9e 100644 --- a/simde/x86/avx.h +++ b/simde/x86/avx.h @@ -5860,7 +5860,7 @@ void simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); @@ -5876,7 +5876,7 @@ void simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); @@ -5892,7 +5892,7 @@ void simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_si256(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, 
sizeof(a)); @@ -5908,7 +5908,7 @@ void simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -5924,7 +5924,7 @@ void simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) simde__m256d_private a_ = simde__m256d_to_private(a); @@ -5945,7 +5945,7 @@ void simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -6006,7 +6006,7 @@ void simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); @@ -6024,7 +6024,7 @@ void simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif 
HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); @@ -6042,7 +6042,7 @@ void simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_si256(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 5522efa50..b5306309b 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -7279,7 +7279,7 @@ simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128d_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); @@ -7478,7 +7478,7 @@ simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { _mm_storeu_pd(mem_addr, a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128d_to_private(a).lsx_f64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -7493,7 +7493,7 @@ void simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128i_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); From 7d6f25dbcb4b0dd7288878ba633f590d2752ead4 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Thu, 22 Jan 2026 15:03:23 +0100 Subject: [PATCH 12/16] arm neon ext: small adjustment to reduce risk of -Werror=maybe-uninitialized --- simde/arm/neon/ext.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h index 67e03099c..0fed2a8d1 100644 --- a/simde/arm/neon/ext.h +++ b/simde/arm/neon/ext.h @@ -53,9 +53,10 @@ simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n) r_.sv64 = __riscv_vslideup_vx_f16m1(a_.sv64, b_.sv64, 4-n, 4); #else const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 3]; } #endif return simde_float16x4_from_private(r_); @@ -500,9 +501,10 @@ simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n) r_.sv128 = __riscv_vslideup_vx_f16m1(a_.sv128, b_.sv128, 8-n, 8); #else const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + r_.values[i] = (src < len) ? 
a_.values[src] : b_.values[src & 7]; } #endif return simde_float16x8_from_private(r_); @@ -1106,9 +1108,10 @@ simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n) b_ = simde_poly8x16_to_private(b), r_ = a_; const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 15]; } return simde_poly8x16_from_private(r_); #endif @@ -1132,9 +1135,10 @@ simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n) b_ = simde_poly16x8_to_private(b), r_ = a_; const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 7]; } return simde_poly16x8_from_private(r_); #endif From b88b9a7d81b90cfa9977cf5e4d54c3781f69eb1c Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Fri, 23 Jan 2026 11:15:45 +0400 Subject: [PATCH 13/16] arm/neon: Disable (maybe) uninitialized variable warnings on loongarch64 ... in the same way it's already done for RISC-V GCC. Co-authored-by: Michael R. 
Crusoe --- simde/arm/neon/ext.h | 18 ++++++++++++++++++ simde/arm/neon/ld2.h | 33 +++++++++++++++++++++++++++------ simde/arm/neon/sm3.h | 4 ++-- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h index 0fed2a8d1..b696f7f5b 100644 --- a/simde/arm/neon/ext.h +++ b/simde/arm/neon/ext.h @@ -614,6 +614,11 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n) #define vextq_f64(a, b, n) simde_vextq_f64((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int8x16_t simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) @@ -780,6 +785,10 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) #define vextq_s32(a, b, n) simde_vextq_s32((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) @@ -828,6 +837,11 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) #define vextq_s64(a, b, n) simde_vextq_s64((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16_t simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) @@ -971,6 +985,10 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) #define vextq_u32(a, b, n) simde_vextq_u32((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) 
+HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) diff --git a/simde/arm/neon/ld2.h b/simde/arm/neon/ld2.h index c72c6d622..9d5251034 100644 --- a/simde/arm/neon/ld2.h +++ b/simde/arm/neon/ld2.h @@ -713,7 +713,9 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64) + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif @@ -731,7 +733,9 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) HEDLEY_DIAGNOSTIC_POP #endif #endif @@ -907,7 +911,9 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64) + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif @@ -925,7 +931,9 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64) + #if 
defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) HEDLEY_DIAGNOSTIC_POP #endif #endif @@ -1046,6 +1054,11 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_float64x2_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -1058,6 +1071,10 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_float64x2_from_private(r_[0]), simde_float64x2_from_private(r_[1]), } }; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH) + HEDLEY_DIAGNOSTIC_POP + #endif return r; #endif @@ -1263,7 +1280,9 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_p64(ptr); #else - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64) + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif @@ -1286,7 +1305,9 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_poly64x2_from_private(r_[1]), } }; - #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64) + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) 
HEDLEY_DIAGNOSTIC_POP #endif return r; diff --git a/simde/arm/neon/sm3.h b/simde/arm/neon/sm3.h index c68d7c8eb..8475984c5 100644 --- a/simde/arm/neon/sm3.h +++ b/simde/arm/neon/sm3.h @@ -62,7 +62,7 @@ simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) { #define vsm3ss1q_u32(n, m, a) simde_vsm3ss1q_u32((n), (m), (a)) #endif -#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH)) HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ #endif @@ -189,7 +189,7 @@ simde_vsm3tt2bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, #define vsm3tt2bq_u32(a, b, c, imm2) simde_vsm3tt2bq_u32((a), (b), (c), (imm2)) #endif -#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH)) HEDLEY_DIAGNOSTIC_POP #endif From bdc21cd388a1062b406f263903a9b0c32573f502 Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" Date: Fri, 30 Jan 2026 09:20:18 +0100 Subject: [PATCH 14/16] gh-actions clang-qemu: test versions 21 & 22 for loong64 with -ffast-math --- .github/workflows/ci.yml | 12 +++++++++++ ...loongarch64-clang-20-fastmath-ccache.cross | 21 +++++++++++++++++++ ...loongarch64-clang-21-fastmath-ccache.cross | 21 +++++++++++++++++++ ...loongarch64-clang-22-fastmath-ccache.cross | 21 +++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross create mode 100644 docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross create mode 100644 docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2dd1cd269..cbf2120f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -696,6 +696,18 @@ jobs: distro: ubuntu-24.04 - version: 22 cross: loongarch64 + arch_deb: loong64 + arch_gnu: loongarch64 + distro: ubuntu-24.04 + - version: 21 + cross: loongarch64 + extra: -fastmath + arch_deb: loong64 + arch_gnu: loongarch64 + distro: ubuntu-24.04 + - version: 22 + cross: loongarch64 + extra: -fastmath arch_deb: loong64 arch_gnu: loongarch64 distro: ubuntu-24.04 diff --git a/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross new file mode 100644 index 000000000..380b8e5d5 --- /dev/null +++ b/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross @@ -0,0 +1,21 @@ +[binaries] +c = ['ccache', 'clang-20'] +cpp = ['ccache', 'clang++-20'] +ar = 'llvm-ar-20' +strip = 'llvm-strip-20' +objcopy = 'llvm-objcopy-20' +c_ld = 'lld' +cpp_ld = 'lld' +exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464'] + +[properties] +c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', 
'-Wno-nan-infinity-disabled'] +cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled'] +c_link_args = ['--target=loongarch64-linux-gnu'] +cpp_link_args = ['--target=loongarch64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'loongarch64' +cpu = 'la464' +endian = 'little' diff --git a/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross new file mode 100644 index 000000000..371482c1e --- /dev/null +++ b/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross @@ -0,0 +1,21 @@ +[binaries] +c = ['ccache', 'clang-21'] +cpp = ['ccache', 'clang++-21'] +ar = 'llvm-ar-21' +strip = 'llvm-strip-21' +objcopy = 'llvm-objcopy-21' +c_ld = 'lld' +cpp_ld = 'lld' +exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464'] + +[properties] +c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled'] +cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled'] +c_link_args = ['--target=loongarch64-linux-gnu'] +cpp_link_args = ['--target=loongarch64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'loongarch64' +cpu = 'la464' +endian = 'little' diff --git a/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross new file mode 100644 index 000000000..1a8145c03 --- /dev/null +++ b/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross @@ -0,0 +1,21 @@ +[binaries] +c = ['ccache', 'clang-22'] +cpp = ['ccache', 'clang++-22'] +ar = 'llvm-ar-22' +strip = 
'llvm-strip-22' +objcopy = 'llvm-objcopy-22' +c_ld = 'lld' +cpp_ld = 'lld' +exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464'] + +[properties] +c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled'] +cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled'] +c_link_args = ['--target=loongarch64-linux-gnu'] +cpp_link_args = ['--target=loongarch64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'loongarch64' +cpu = 'la464' +endian = 'little' From fc4bcf72640411ba553d10d9573c0cf9cd5842b1 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 28 Jan 2026 16:50:03 +0800 Subject: [PATCH 15/16] gcc loong64: work around the vec_perm_const bug in the LoongArch backend https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121064 has been fixed in the gcc-14 branch, and already released in gcc 15.2 Co-authored-by: Michael R. 
Crusoe --- simde/arm/neon/ext.h | 20 ++++++++++---------- simde/simde-common.h | 5 +++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h index b696f7f5b..3858528cb 100644 --- a/simde/arm/neon/ext.h +++ b/simde/arm/neon/ext.h @@ -552,7 +552,7 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ simde_float32x4_from_private(simde_vextq_f32_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_f32(a, b, n) (__extension__ ({ \ simde_float32x4_private simde_vextq_f32_r_; \ simde_vextq_f32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_float32x4_to_private(a).values, simde_float32x4_to_private(b).values, \ @@ -661,7 +661,7 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \ simde_int8x16_from_private(simde_vextq_s8_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s8(a, b, n) (__extension__ ({ \ simde_int8x16_private simde_vextq_s8_r_; \ simde_vextq_s8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_int8x16_to_private(a).values, simde_int8x16_to_private(b).values, \ @@ -719,7 +719,7 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ simde_int16x8_from_private(simde_vextq_s16_r_); \ })) -#elif 
defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s16(a, b, n) (__extension__ ({ \ simde_int16x8_private simde_vextq_s16_r_; \ simde_vextq_s16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_int16x8_to_private(a).values, simde_int16x8_to_private(b).values, \ @@ -771,7 +771,7 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ simde_int32x4_from_private(simde_vextq_s32_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s32(a, b, n) (__extension__ ({ \ simde_int32x4_private simde_vextq_s32_r_; \ simde_vextq_s32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_int32x4_to_private(a).values, simde_int32x4_to_private(b).values, \ @@ -824,7 +824,7 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ simde_int64x2_from_private(simde_vextq_s64_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s64(a, b, n) (__extension__ ({ \ simde_int64x2_private simde_vextq_s64_r_; \ simde_vextq_s64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_int64x2_to_private(a).values, simde_int64x2_to_private(b).values, \ @@ -870,7 +870,7 @@ 
simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u8(a, b, n) simde_uint8x16_from_m128i(_mm_alignr_epi8(simde_uint8x16_to_m128i(b), simde_uint8x16_to_m128i(a), n * sizeof(uint8_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u8(a, b, n) (__extension__ ({ \ simde_uint8x16_private simde_vextq_u8_r_; \ simde_vextq_u8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_uint8x16_to_private(a).values, simde_uint8x16_to_private(b).values, \ @@ -918,7 +918,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u16(a, b, n) simde_uint16x8_from_m128i(_mm_alignr_epi8(simde_uint16x8_to_m128i(b), simde_uint16x8_to_m128i(a), n * sizeof(uint16_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u16(a, b, n) (__extension__ ({ \ simde_uint16x8_private simde_vextq_u16_r_; \ simde_vextq_u16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_uint16x8_to_private(a).values, simde_uint16x8_to_private(b).values, \ @@ -928,7 +928,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ simde_uint16x8_from_private(simde_vextq_u16_r_); \ })) -#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) +#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(SIMDE_BUG_GCC_121064) 
#define simde_vextq_u16(a, b, n) (__extension__ ({ \ simde_uint16x8_private r_; \ r_.values = __builtin_shufflevector( \ @@ -971,7 +971,7 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u32(a, b, n) simde_uint32x4_from_m128i(_mm_alignr_epi8(simde_uint32x4_to_m128i(b), simde_uint32x4_to_m128i(a), n * sizeof(uint32_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u32(a, b, n) (__extension__ ({ \ simde_uint32x4_private simde_vextq_u32_r_; \ simde_vextq_u32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_uint32x4_to_private(a).values, simde_uint32x4_to_private(b).values, \ @@ -1017,7 +1017,7 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u64(a, b, n) simde_uint64x2_from_m128i(_mm_alignr_epi8(simde_uint64x2_to_m128i(b), simde_uint64x2_to_m128i(a), n * sizeof(uint64_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u64(a, b, n) (__extension__ ({ \ simde_uint64x2_private simde_vextq_u64_r_; \ simde_vextq_u64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_uint64x2_to_private(a).values, simde_uint64x2_to_private(b).values, \ diff --git a/simde/simde-common.h b/simde/simde-common.h index 369a16444..3650e4091 100644 --- a/simde/simde-common.h +++ b/simde/simde-common.h @@ -1079,6 +1079,11 @@ HEDLEY_DIAGNOSTIC_POP # if 
HEDLEY_GCC_VERSION_CHECK(16,0,0) # define SIMDE_BUG_GCC_123807 # endif +# if defined(SIMDE_LOONGARCH_LSX_NATIVE) && \ + ((HEDLEY_GCC_VERSION_CHECK(14,0,0) && !HEDLEY_GCC_VERSION_CHECK(14,4,0)) || \ + (HEDLEY_GCC_VERSION_CHECK(15,0,0) && !HEDLEY_GCC_VERSION_CHECK(15,2,0))) +# define SIMDE_BUG_GCC_121064 +# endif # endif # if !defined(__OPTIMIZE__) && !(\ HEDLEY_GCC_VERSION_CHECK(11,4,0) \ From bed1e4abdf61d873e3d76b10b5248414d1599ba8 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 30 Jan 2026 11:00:29 +0100 Subject: [PATCH 16/16] diagnostics: fix typo --- simde/arm/neon/ext.h | 6 +++++- simde/arm/neon/ld1_x2.h | 2 +- simde/arm/neon/ld1_x3.h | 2 +- simde/arm/neon/ld1_x4.h | 2 +- simde/arm/neon/ld1q_x2.h | 2 +- simde/arm/neon/ld1q_x3.h | 2 +- simde/arm/neon/ld1q_x4.h | 2 +- simde/arm/neon/ld2.h | 2 +- simde/arm/neon/ld3.h | 2 +- simde/arm/neon/ld4.h | 2 +- simde/arm/neon/sm3.h | 2 +- simde/simde-diagnostic.h | 4 ++-- simde/x86/avx.h | 6 +++--- 13 files changed, 20 insertions(+), 16 deletions(-) diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h index 3858528cb..c7ef81cec 100644 --- a/simde/arm/neon/ext.h +++ b/simde/arm/neon/ext.h @@ -1030,8 +1030,10 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) #define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64) HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif SIMDE_FUNCTION_ATTRIBUTES simde_poly8x8_t @@ -1085,7 +1087,9 @@ simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n) #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64) HEDLEY_DIAGNOSTIC_POP +#endif SIMDE_FUNCTION_ATTRIBUTES 
simde_poly64x1_t diff --git a/simde/arm/neon/ld1_x2.h b/simde/arm/neon/ld1_x2.h index ca794217b..c502debb4 100644 --- a/simde/arm/neon/ld1_x2.h +++ b/simde/arm/neon/ld1_x2.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld1_x3.h b/simde/arm/neon/ld1_x3.h index ad96b19ca..a34ce54e7 100644 --- a/simde/arm/neon/ld1_x3.h +++ b/simde/arm/neon/ld1_x3.h @@ -35,7 +35,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld1_x4.h b/simde/arm/neon/ld1_x4.h index 1f70daacb..bb72da4ba 100644 --- a/simde/arm/neon/ld1_x4.h +++ b/simde/arm/neon/ld1_x4.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld1q_x2.h b/simde/arm/neon/ld1q_x2.h index 1db68e964..df6452d23 100644 --- a/simde/arm/neon/ld1q_x2.h +++ b/simde/arm/neon/ld1q_x2.h @@ -38,7 +38,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld1q_x3.h b/simde/arm/neon/ld1q_x3.h index 0ab6005f0..c34109613 100644 --- a/simde/arm/neon/ld1q_x3.h +++ b/simde/arm/neon/ld1q_x3.h @@ -37,7 +37,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git 
a/simde/arm/neon/ld1q_x4.h b/simde/arm/neon/ld1q_x4.h index c2d17d937..96e526777 100644 --- a/simde/arm/neon/ld1q_x4.h +++ b/simde/arm/neon/ld1q_x4.h @@ -38,7 +38,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld2.h b/simde/arm/neon/ld2.h index 9d5251034..72ab47854 100644 --- a/simde/arm/neon/ld2.h +++ b/simde/arm/neon/ld2.h @@ -37,7 +37,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld3.h b/simde/arm/neon/ld3.h index a102f2eda..2361a8968 100644 --- a/simde/arm/neon/ld3.h +++ b/simde/arm/neon/ld3.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/ld4.h b/simde/arm/neon/ld4.h index 5f13ebbd6..85a15e194 100644 --- a/simde/arm/neon/ld4.h +++ b/simde/arm/neon/ld4.h @@ -35,7 +35,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ diff --git a/simde/arm/neon/sm3.h b/simde/arm/neon/sm3.h index 8475984c5..737601b08 100644 --- a/simde/arm/neon/sm3.h +++ b/simde/arm/neon/sm3.h @@ -64,7 +64,7 @@ simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) { #if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH)) HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif 
SIMDE_FUNCTION_ATTRIBUTES diff --git a/simde/simde-diagnostic.h b/simde/simde-diagnostic.h index c798f23bf..7fe033ec3 100644 --- a/simde/simde-diagnostic.h +++ b/simde/simde-diagnostic.h @@ -410,9 +410,9 @@ /* This is a false positive from GCC in a few places. */ #if HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") + #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") #else - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif #if defined(SIMDE_ENABLE_NATIVE_ALIASES) diff --git a/simde/x86/avx.h b/simde/x86/avx.h index 276195e9e..fd7c2e490 100644 --- a/simde/x86/avx.h +++ b/simde/x86/avx.h @@ -2183,7 +2183,7 @@ simde_mm256_castps128_ps256 (simde__m128 a) { simde__m256_private r_; simde__m128_private a_ = simde__m128_to_private(a); HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ r_.m128_private[0] = a_; return simde__m256_from_private(r_); @@ -4783,7 +4783,7 @@ simde_mm256_min_ps (simde__m256 a, simde__m256 b) { return __lasx_xvfmin_s(a, b); #else HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ simde__m256_private r_, a_ = simde__m256_to_private(a), @@ -5235,7 +5235,7 @@ simde__m128 simde_mm_permute_ps (simde__m128 a, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ simde__m128_private r_, a_ = simde__m128_to_private(a);