diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 935ffbb64..cbf2120f1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -487,6 +487,12 @@ jobs:
           arch_gnu: loongarch64
           arch_deb: loong64
           distro: ubuntu-24.04
+        - extra: -fastmath
+          version: 14
+          cross: loongarch64
+          arch_gnu: loongarch64
+          arch_deb: loong64
+          distro: ubuntu-24.04
         # - version: 14
         #   cross: mips64el
         #   arch_gnu: mips64el
@@ -500,7 +506,7 @@ jobs:
           submodules: recursive
     - name: CPU Information
       run: cat /proc/cpuinfo
-    - if: ${{ matrix.distro == 'ubuntu-24.04' }}
+    - if: ${{ matrix.distro == 'ubuntu-24.04' && ( matrix.version == '15' )}}
       run: sudo add-apt-repository ppa:daawesomep/toolchain-backports-noble
 
     - name: Install APT Dependencies
@@ -527,7 +533,7 @@ jobs:
     - name: Test
       run: |
        # shellcheck disable=SC2046
-        meson test -C build --print-errorlogs --print-errorlogs $(meson test -C build --list | grep -v emul)
+        meson test -C build --print-errorlogs --print-errorlogs # $(meson test -C build --list | grep -v emul)
 
   clang17-qemu-rvv:
     strategy:
@@ -690,6 +696,19 @@ jobs:
           distro: ubuntu-24.04
         - version: 22
           cross: loongarch64
+          extra: -fastmath
+          arch_deb: loong64
+          arch_gnu: loongarch64
+          distro: ubuntu-24.04
+        - version: 21
+          cross: loongarch64
+          extra: -fastmath
+          arch_deb: loong64
+          arch_gnu: loongarch64
+          distro: ubuntu-24.04
+        - version: 22
+          cross: loongarch64
+          extra: -fastmath
           arch_deb: loong64
           arch_gnu: loongarch64
           distro: ubuntu-24.04
@@ -728,7 +747,7 @@ jobs:
     - name: Test
       run: |
        # shellcheck disable=SC2046
-        meson test -C build --print-errorlogs --print-errorlogs $(meson test -C build --list | grep -v emul)
+        meson test -C build --print-errorlogs --print-errorlogs # $(meson test -C build --list | grep -v emul)
 
   clang:
     strategy:
diff --git a/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross
new file mode 100644
index 000000000..380b8e5d5
--- /dev/null
+++ b/docker/cross-files/loongarch64-clang-20-fastmath-ccache.cross
@@ -0,0 +1,21 @@
+[binaries]
+c = ['ccache', 'clang-20']
+cpp = ['ccache', 'clang++-20']
+ar = 'llvm-ar-20'
+strip = 'llvm-strip-20'
+objcopy = 'llvm-objcopy-20'
+c_ld = 'lld'
+cpp_ld = 'lld'
+exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464']
+
+[properties]
+c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+c_link_args = ['--target=loongarch64-linux-gnu']
+cpp_link_args = ['--target=loongarch64-linux-gnu']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'loongarch64'
+cpu = 'la464'
+endian = 'little'
diff --git a/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross
new file mode 100644
index 000000000..371482c1e
--- /dev/null
+++ b/docker/cross-files/loongarch64-clang-21-fastmath-ccache.cross
@@ -0,0 +1,21 @@
+[binaries]
+c = ['ccache', 'clang-21']
+cpp = ['ccache', 'clang++-21']
+ar = 'llvm-ar-21'
+strip = 'llvm-strip-21'
+objcopy = 'llvm-objcopy-21'
+c_ld = 'lld'
+cpp_ld = 'lld'
+exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464']
+
+[properties]
+c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+c_link_args = ['--target=loongarch64-linux-gnu']
+cpp_link_args = ['--target=loongarch64-linux-gnu']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'loongarch64'
+cpu = 'la464'
+endian = 'little'
diff --git a/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross b/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross
new file mode 100644
index 000000000..1a8145c03
--- /dev/null
+++ b/docker/cross-files/loongarch64-clang-22-fastmath-ccache.cross
@@ -0,0 +1,21 @@
+[binaries]
+c = ['ccache', 'clang-22']
+cpp = ['ccache', 'clang++-22']
+ar = 'llvm-ar-22'
+strip = 'llvm-strip-22'
+objcopy = 'llvm-objcopy-22'
+c_ld = 'lld'
+cpp_ld = 'lld'
+exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464']
+
+[properties]
+c_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+cpp_args = ['--target=loongarch64-linux-gnu', '-march=la464', '-isystem=/usr/loongarch64-linux-gnu/include', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-O3', '-ffast-math', '-Wno-nan-infinity-disabled']
+c_link_args = ['--target=loongarch64-linux-gnu']
+cpp_link_args = ['--target=loongarch64-linux-gnu']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'loongarch64'
+cpu = 'la464'
+endian = 'little'
diff --git a/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross b/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross
new file mode 100644
index 000000000..1892193d6
--- /dev/null
+++ b/docker/cross-files/loongarch64-gcc-14-fastmath-ccache.cross
@@ -0,0 +1,20 @@
+[binaries]
+c = ['ccache', 'loongarch64-linux-gnu-gcc-14']
+cpp = ['ccache', 'loongarch64-linux-gnu-g++-14']
+ar = 'loongarch64-linux-gnu-gcc-ar-14'
+strip = 'loongarch64-linux-gnu-strip'
+objcopy = 'loongarch64-linux-gnu-objcopy'
+ld = 'loongarch64-linux-gnu-ld'
+exe_wrapper = ['qemu-loongarch64-static', '-L', '/usr/loongarch64-linux-gnu/', '-cpu', 'la464']
+
+[properties]
+c_args = ['-march=loongarch64', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-Ofast']
+cpp_args = ['-march=loongarch64', '-Wextra', '-Werror', '-mlsx', '-mlasx', '-Ofast']
+#c_args = ['-march=la464', '-Wextra', '-Werror']
+#cpp_args = ['-march=la464', '-Wextra', '-Werror']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'loongarch64'
+cpu = 'loongarch64'
+endian = 'little'
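For context (not part of the patch): the new cross files combine -Werror with -ffast-math, and recent clang releases warn via -Wnan-infinity-disabled whenever the NAN or INFINITY macros are used while -ffinite-math-only is in effect, which SIMDe's fallbacks and tests do deliberately. That is why -Wno-nan-infinity-disabled is passed alongside -ffast-math. A minimal sketch of the kind of code that would otherwise break the build; the function name is illustrative only:

    /* Compiled with clang -ffast-math -Werror this fails without
     * -Wno-nan-infinity-disabled, because -ffast-math implies
     * -ffinite-math-only and clang then flags any use of the NAN macro. */
    #include <math.h>

    float example_quiet_nan(void) {
      return NAN;  /* use of NAN with finite-math-only enabled triggers the warning */
    }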
diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h
index 67e03099c..c7ef81cec 100644
--- a/simde/arm/neon/ext.h
+++ b/simde/arm/neon/ext.h
@@ -53,9 +53,10 @@ simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n)
     r_.sv64 = __riscv_vslideup_vx_f16m1(a_.sv64, b_.sv64, 4-n, 4);
   #else
     const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    const size_t len = sizeof(r_.values) / sizeof(r_.values[0]);
+    for (size_t i = 0 ; i < len ; i++) {
       size_t src = i + n_;
-      r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3];
+      r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 3];
     }
   #endif
   return simde_float16x4_from_private(r_);
@@ -500,9 +501,10 @@ simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n)
     r_.sv128 = __riscv_vslideup_vx_f16m1(a_.sv128, b_.sv128, 8-n, 8);
   #else
     const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    const size_t len = sizeof(r_.values) / sizeof(r_.values[0]);
+    for (size_t i = 0 ; i < len ; i++) {
       size_t src = i + n_;
-      r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7];
+      r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 7];
     }
   #endif
   return simde_float16x8_from_private(r_);
@@ -550,7 +552,7 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \
     simde_float32x4_from_private(simde_vextq_f32_r_); \
   }))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_f32(a, b, n) (__extension__ ({ \
     simde_float32x4_private simde_vextq_f32_r_; \
     simde_vextq_f32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_float32x4_to_private(a).values, simde_float32x4_to_private(b).values, \
@@ -612,6 +614,11 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n)
   #define vextq_f64(a, b, n) simde_vextq_f64((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 simde_int8x16_t
 simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n)
@@ -654,7 +661,7 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \
     simde_int8x16_from_private(simde_vextq_s8_r_); \
   }))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_s8(a, b, n) (__extension__ ({ \
     simde_int8x16_private simde_vextq_s8_r_; \
     simde_vextq_s8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_int8x16_to_private(a).values, simde_int8x16_to_private(b).values, \
@@ -712,7 +719,7 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \
     simde_int16x8_from_private(simde_vextq_s16_r_); \
   }))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_s16(a, b, n) (__extension__ ({ \
     simde_int16x8_private simde_vextq_s16_r_; \
     simde_vextq_s16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_int16x8_to_private(a).values, simde_int16x8_to_private(b).values, \
@@ -764,7 +771,7 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \
     simde_int32x4_from_private(simde_vextq_s32_r_); \
   }))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_s32(a, b, n) (__extension__ ({ \
     simde_int32x4_private simde_vextq_s32_r_; \
     simde_vextq_s32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_int32x4_to_private(a).values, simde_int32x4_to_private(b).values, \
@@ -778,6 +785,10 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n)
   #define vextq_s32(a, b, n) simde_vextq_s32((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))
+HEDLEY_DIAGNOSTIC_POP
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 simde_int64x2_t
 simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n)
@@ -813,7 +824,7 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \
     simde_int64x2_from_private(simde_vextq_s64_r_); \
   }))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_s64(a, b, n) (__extension__ ({ \
     simde_int64x2_private simde_vextq_s64_r_; \
     simde_vextq_s64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_int64x2_to_private(a).values, simde_int64x2_to_private(b).values, \
@@ -826,6 +837,11 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n)
   #define vextq_s64(a, b, n) simde_vextq_s64((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 simde_uint8x16_t
 simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n)
@@ -854,7 +870,7 @@ simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n)
 }
 #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
   #define simde_vextq_u8(a, b, n) simde_uint8x16_from_m128i(_mm_alignr_epi8(simde_uint8x16_to_m128i(b), simde_uint8x16_to_m128i(a), n * sizeof(uint8_t)))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_u8(a, b, n) (__extension__ ({ \
     simde_uint8x16_private simde_vextq_u8_r_; \
     simde_vextq_u8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_uint8x16_to_private(a).values, simde_uint8x16_to_private(b).values, \
@@ -902,7 +918,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n)
 }
 #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
   #define simde_vextq_u16(a, b, n) simde_uint16x8_from_m128i(_mm_alignr_epi8(simde_uint16x8_to_m128i(b), simde_uint16x8_to_m128i(a), n * sizeof(uint16_t)))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_u16(a, b, n) (__extension__ ({ \
     simde_uint16x8_private simde_vextq_u16_r_; \
     simde_vextq_u16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_uint16x8_to_private(a).values, simde_uint16x8_to_private(b).values, \
@@ -912,7 +928,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n)
       HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \
     simde_uint16x8_from_private(simde_vextq_u16_r_); \
   }))
-#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
+#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_u16(a, b, n) (__extension__ ({ \
     simde_uint16x8_private r_; \
     r_.values = __builtin_shufflevector( \
@@ -955,7 +971,7 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n)
 }
 #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
   #define simde_vextq_u32(a, b, n) simde_uint32x4_from_m128i(_mm_alignr_epi8(simde_uint32x4_to_m128i(b), simde_uint32x4_to_m128i(a), n * sizeof(uint32_t)))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_u32(a, b, n) (__extension__ ({ \
     simde_uint32x4_private simde_vextq_u32_r_; \
     simde_vextq_u32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_uint32x4_to_private(a).values, simde_uint32x4_to_private(b).values, \
@@ -969,6 +985,10 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n)
   #define vextq_u32(a, b, n) simde_vextq_u32((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))
+HEDLEY_DIAGNOSTIC_POP
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 simde_uint64x2_t
 simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n)
@@ -997,7 +1017,7 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n)
 }
 #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
   #define simde_vextq_u64(a, b, n) simde_uint64x2_from_m128i(_mm_alignr_epi8(simde_uint64x2_to_m128i(b), simde_uint64x2_to_m128i(a), n * sizeof(uint64_t)))
-#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064)
   #define simde_vextq_u64(a, b, n) (__extension__ ({ \
     simde_uint64x2_private simde_vextq_u64_r_; \
     simde_vextq_u64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_uint64x2_to_private(a).values, simde_uint64x2_to_private(b).values, \
@@ -1010,8 +1030,10 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n)
   #define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64)
 HEDLEY_DIAGNOSTIC_PUSH
-SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
+#endif
 
 SIMDE_FUNCTION_ATTRIBUTES
 simde_poly8x8_t
@@ -1065,7 +1087,10 @@ simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n)
   #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n))
 #endif
 
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64)
 HEDLEY_DIAGNOSTIC_POP
+SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
+#endif
 
 SIMDE_FUNCTION_ATTRIBUTES
 simde_poly64x1_t
@@ -1106,9 +1131,10 @@ simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n)
       b_ = simde_poly8x16_to_private(b),
       r_ = a_;
     const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    const size_t len = sizeof(r_.values) / sizeof(r_.values[0]);
+    for (size_t i = 0 ; i < len ; i++) {
       size_t src = i + n_;
-      r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15];
+      r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 15];
     }
     return simde_poly8x16_from_private(r_);
   #endif
@@ -1132,9 +1158,10 @@ simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n)
       b_ = simde_poly16x8_to_private(b),
       r_ = a_;
     const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    const size_t len = sizeof(r_.values) / sizeof(r_.values[0]);
+    for (size_t i = 0 ; i < len ; i++) {
       size_t src = i + n_;
-      r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7];
+      r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 7];
     }
     return simde_poly16x8_from_private(r_);
   #endif
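For readers unfamiliar with vext, here is a stand-alone sketch of the portable fallback that the hunks above reshape (the function name and fixed lane count are illustrative, not SIMDe code): hoisting the element count into `len` evaluates the sizeof expression once, and lanes that run past the end of `a` wrap around into `b`.

    #include <stddef.h>
    #include <stdint.h>

    /* Behaves like vextq_u16(a, b, n): the last 8 - n lanes of a,
     * followed by the first n lanes of b. */
    void example_vextq_u16(uint16_t r[8], const uint16_t a[8],
                           const uint16_t b[8], int n) {
      const size_t n_ = (size_t) n;      /* HEDLEY_STATIC_CAST(size_t, n) */
      const size_t len = 8;              /* sizeof(values) / sizeof(values[0]) */
      for (size_t i = 0; i < len; i++) {
        size_t src = i + n_;
        r[i] = (src < len) ? a[src] : b[src & (len - 1)];
      }
    }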
diff --git a/simde/arm/neon/ld1_x2.h b/simde/arm/neon/ld1_x2.h
index ca794217b..c502debb4 100644
--- a/simde/arm/neon/ld1_x2.h
+++ b/simde/arm/neon/ld1_x2.h
@@ -36,7 +36,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld1_x3.h b/simde/arm/neon/ld1_x3.h
index ad96b19ca..a34ce54e7 100644
--- a/simde/arm/neon/ld1_x3.h
+++ b/simde/arm/neon/ld1_x3.h
@@ -35,7 +35,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld1_x4.h b/simde/arm/neon/ld1_x4.h
index 1f70daacb..bb72da4ba 100644
--- a/simde/arm/neon/ld1_x4.h
+++ b/simde/arm/neon/ld1_x4.h
@@ -36,7 +36,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld1q_x2.h b/simde/arm/neon/ld1q_x2.h
index 1db68e964..df6452d23 100644
--- a/simde/arm/neon/ld1q_x2.h
+++ b/simde/arm/neon/ld1q_x2.h
@@ -38,7 +38,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld1q_x3.h b/simde/arm/neon/ld1q_x3.h
index 0ab6005f0..c34109613 100644
--- a/simde/arm/neon/ld1q_x3.h
+++ b/simde/arm/neon/ld1q_x3.h
@@ -37,7 +37,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld1q_x4.h b/simde/arm/neon/ld1q_x4.h
index c2d17d937..96e526777 100644
--- a/simde/arm/neon/ld1q_x4.h
+++ b/simde/arm/neon/ld1q_x4.h
@@ -38,7 +38,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld2.h b/simde/arm/neon/ld2.h
index c72c6d622..72ab47854 100644
--- a/simde/arm/neon/ld2.h
+++ b/simde/arm/neon/ld2.h
@@ -37,7 +37,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
@@ -713,7 +713,9 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
     } };
     return r;
   #else
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
       HEDLEY_DIAGNOSTIC_PUSH
       SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
     #endif
@@ -731,7 +733,9 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
     } };
     return r;
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
       HEDLEY_DIAGNOSTIC_POP
     #endif
   #endif
@@ -907,7 +911,9 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
     } };
     return r;
   #else
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
    #endif
@@ -925,7 +931,9 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
     } };
     return r;
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
      HEDLEY_DIAGNOSTIC_POP
    #endif
   #endif
@@ -1046,6 +1054,11 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
     } };
     return r;
   #else
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)
+      HEDLEY_DIAGNOSTIC_PUSH
+      SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+    #endif
     simde_float64x2_private r_[2];
 
     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -1058,6 +1071,10 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
       simde_float64x2_from_private(r_[0]),
       simde_float64x2_from_private(r_[1]),
     } };
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)
+      HEDLEY_DIAGNOSTIC_POP
+    #endif
 
     return r;
   #endif
@@ -1263,7 +1280,9 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vld2q_p64(ptr);
   #else
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
       HEDLEY_DIAGNOSTIC_PUSH
       SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
     #endif
@@ -1286,7 +1305,9 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
       simde_poly64x2_from_private(r_[0]),
       simde_poly64x2_from_private(r_[1]),
     } };
-    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)
+    #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \
+        ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \
+         (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)))
       HEDLEY_DIAGNOSTIC_POP
     #endif
     return r;
diff --git a/simde/arm/neon/ld3.h b/simde/arm/neon/ld3.h
index a102f2eda..2361a8968 100644
--- a/simde/arm/neon/ld3.h
+++ b/simde/arm/neon/ld3.h
@@ -36,7 +36,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/ld4.h b/simde/arm/neon/ld4.h
index 5f13ebbd6..85a15e194 100644
--- a/simde/arm/neon/ld4.h
+++ b/simde/arm/neon/ld4.h
@@ -35,7 +35,7 @@
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #if HEDLEY_GCC_VERSION_CHECK(7,0,0)
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_BEGIN_DECLS_
diff --git a/simde/arm/neon/sm3.h b/simde/arm/neon/sm3.h
index c68d7c8eb..737601b08 100644
--- a/simde/arm/neon/sm3.h
+++ b/simde/arm/neon/sm3.h
@@ -62,9 +62,9 @@ simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) {
   #define vsm3ss1q_u32(n, m, a) simde_vsm3ss1q_u32((n), (m), (a))
 #endif
 
-#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0)
+#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH))
 HEDLEY_DIAGNOSTIC_PUSH
-SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 SIMDE_FUNCTION_ATTRIBUTES
@@ -189,7 +189,7 @@ simde_vsm3tt2bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c,
   #define vsm3tt2bq_u32(a, b, c, imm2) simde_vsm3tt2bq_u32((a), (b), (c), (imm2))
 #endif
 
-#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0)
+#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH))
 HEDLEY_DIAGNOSTIC_POP
 #endif
diff --git a/simde/simde-common.h b/simde/simde-common.h
index 31889b492..3650e4091 100644
--- a/simde/simde-common.h
+++ b/simde/simde-common.h
@@ -1079,6 +1079,11 @@ HEDLEY_DIAGNOSTIC_POP
 #    if HEDLEY_GCC_VERSION_CHECK(16,0,0)
 #      define SIMDE_BUG_GCC_123807
 #    endif
+#    if defined(SIMDE_LOONGARCH_LSX_NATIVE) && \
+       ((HEDLEY_GCC_VERSION_CHECK(14,0,0) && !HEDLEY_GCC_VERSION_CHECK(14,4,0)) || \
+        (HEDLEY_GCC_VERSION_CHECK(15,0,0) && !HEDLEY_GCC_VERSION_CHECK(15,2,0)))
+#      define SIMDE_BUG_GCC_121064
+#    endif
 #  endif
 #  if !defined(__OPTIMIZE__) && !(\
      HEDLEY_GCC_VERSION_CHECK(11,4,0) \
@@ -1086,6 +1091,9 @@ HEDLEY_DIAGNOSTIC_POP
      || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0))))
 #    define SIMDE_BUG_GCC_105339
 #  endif
+#  if defined(SIMDE_ARCH_LOONGARCH)
+#    define SIMDE_BUG_GCC_123766
+#  endif
 # elif defined(__clang__)
 #  if defined(SIMDE_ARCH_AARCH64)
 #    define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601
diff --git a/simde/simde-diagnostic.h b/simde/simde-diagnostic.h
index c798f23bf..7fe033ec3 100644
--- a/simde/simde-diagnostic.h
+++ b/simde/simde-diagnostic.h
@@ -410,9 +410,9 @@
 
 /* This is a false positive from GCC in a few places. */
 #if HEDLEY_GCC_VERSION_CHECK(4,7,0)
-  #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
+  #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
 #else
-  #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
 #endif
 
 #if defined(SIMDE_ENABLE_NATIVE_ALIASES)
diff --git a/simde/x86/avx.h b/simde/x86/avx.h
index c8366c471..fd7c2e490 100644
--- a/simde/x86/avx.h
+++ b/simde/x86/avx.h
@@ -1028,15 +1028,15 @@ simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_
                     simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
+  #elif defined(SIMDE_ARCH_LOONGARCH)
+    simde__m256 tmp_ = { e0, e1, e2, e3, e4, e5, e6, e7 };
+    return tmp_;
   #else
     simde__m256_private r_;
 
     #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
       r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0);
       r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4);
-    #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-      SIMDE_ALIGN_LIKE_32(__m256) simde_float32 data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
-      r_.i256 = __lasx_xvld(data, 0);
     #else
       r_.f32[0] = e0;
       r_.f32[1] = e1;
@@ -1062,6 +1062,9 @@ simde__m256d
 simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     return _mm256_set_pd(e3, e2, e1, e0);
+  #elif defined(SIMDE_ARCH_LOONGARCH)
+    simde__m256d tmp_ = { e0, e1, e2, e3 };
+    return tmp_;
   #else
     simde__m256d_private r_;
 
@@ -2180,7 +2183,7 @@ simde_mm256_castps128_ps256 (simde__m128 a) {
     simde__m256_private r_;
     simde__m128_private a_ = simde__m128_to_private(a);
     HEDLEY_DIAGNOSTIC_PUSH
-    SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+    SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
     r_.m128_private[0] = a_;
 
     return simde__m256_from_private(r_);
@@ -4780,7 +4783,7 @@ simde_mm256_min_ps (simde__m256 a, simde__m256 b) {
     return __lasx_xvfmin_s(a, b);
   #else
     HEDLEY_DIAGNOSTIC_PUSH
-    SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+    SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
     simde__m256_private
       r_,
       a_ = simde__m256_to_private(a),
@@ -5232,7 +5235,7 @@ simde__m128
 simde_mm_permute_ps (simde__m128 a, const int imm8)
     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
   HEDLEY_DIAGNOSTIC_PUSH
-  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+  SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_
   simde__m128_private
     r_,
     a_ = simde__m128_to_private(a);
@@ -5857,7 +5860,7 @@ void
 simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_store_ps(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #else
     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
@@ -5873,7 +5876,7 @@ void
 simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_store_pd(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #else
     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
@@ -5889,7 +5892,7 @@ void
 simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_store_si256(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #else
     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
@@ -5905,7 +5908,7 @@ void
 simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_storeu_ps(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
    return __lasx_xvst(a, mem_addr, 0);
   #else
     simde_memcpy(mem_addr, &a, sizeof(a));
@@ -5921,7 +5924,7 @@ void
 simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_storeu_pd(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
     simde__m256d_private a_ = simde__m256d_to_private(a);
@@ -5942,7 +5945,7 @@ void
 simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #else
     simde_memcpy(mem_addr, &a, sizeof(a));
@@ -6003,7 +6006,7 @@ void
 simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_stream_ps(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
     __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
@@ -6021,7 +6024,7 @@ void
 simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_stream_pd(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
     __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
@@ -6039,7 +6042,7 @@ void
 simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     _mm256_stream_si256(mem_addr, a);
-  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     return __lasx_xvst(a, mem_addr, 0);
   #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
     __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr));
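A note on the SIMDE_BUG_GCC_123766 guards added above (not part of the patch): when the guard is defined, the __lasx_xvst path is skipped and control falls through to the pre-existing generic branch, which copies the vector object with simde_memcpy. A simplified stand-in showing why that fallback is still cheap; the type and names below are illustrative, not the SIMDe implementation:

    #include <string.h>

    typedef struct { float f32[8]; } example_m256;   /* stand-in for simde__m256 */

    void example_store_ps(float mem_addr[8], example_m256 a) {
      /* generic path used while the GCC bug is worked around; a fixed-size
       * 32-byte memcpy of a local value is typically lowered to one store */
      memcpy(mem_addr, &a, sizeof(a));
    }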
diff --git a/simde/x86/sse.h b/simde/x86/sse.h
index 1258ab30d..8db1ab686 100644
--- a/simde/x86/sse.h
+++ b/simde/x86/sse.h
@@ -1825,7 +1825,8 @@ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) {
         vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
       r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32);
     #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-      r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32);
+      r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32));
+      // TODO: change when https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123759 is resolved
     #elif defined(simde_math_isnanf)
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h
index 8f646cab5..b5306309b 100644
--- a/simde/x86/sse2.h
+++ b/simde/x86/sse2.h
@@ -568,7 +568,7 @@ simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
     r_.neon_f64 = vld1q_f64(data);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -3313,7 +3313,7 @@ simde_mm_cvtps_epi32 (simde__m128 a) {
     r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128);
   #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
     a_ = simde__m128_to_private(a);
-    r_.lsx_i32 = __lsx_vftintrne_w_s(a_.lsx_f32);
+    r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vftintrne_w_s(a_.lsx_f32));
   #else
     a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
     SIMDE_VECTORIZE
@@ -3733,7 +3733,7 @@ simde_mm_cvttpd_epi32 (simde__m128d a) {
 
   #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS)
     const v2f64 zero_f64 = {-0.0f, -0.0f};
-    r_.lsx_i64 = __lsx_vftintrz_w_d(zero_i64, simde__m128d_to_private(a).lsx_f64);
+    r_.lsx_i64 = __lsx_vftintrz_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64);
   #else
     r_.m64[0] = simde_mm_cvttpd_pi32(a);
     r_.m64[1] = simde_mm_setzero_si64();
@@ -3793,7 +3793,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) {
       r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input);
     #endif
   #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-    __m128i temp = __lsx_vftintrz_w_s(a_.lsx_f32);
+    r_.lsx_i64 = __lsx_vftintrz_w_s(a_.lsx_f32);
 
     #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS)
       #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS)
        simde_float32 f1 = 2147483648.0f;
@@ -3809,7 +3809,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) {
       __m128i valid_input = __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32);
     #endif
 
-    r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), temp, valid_input);
+    r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), r_.lsx_i64, valid_input);
   #endif
 #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER)
   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
@@ -5705,8 +5705,6 @@ simde_mm_loadu_si16 (void const* mem_addr) {
       HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \
       HEDLEY_GCC_VERSION_CHECK(12,1,0))
     return _mm_loadu_si16(mem_addr);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
-    return __lsx_vld(mem_addr, 0);
   #else
     int16_t val;
     simde_memcpy(&val, mem_addr, sizeof(val));
@@ -5761,8 +5759,10 @@ simde_mm_loadu_si32 (void const* mem_addr) {
     simde__m128i_private r_;
     r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0);
     return simde__m128i_from_private(r_);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
-    return __lsx_vld(mem_addr, 0);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    simde__m128i_private r_;
+    r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12);
+    return simde__m128i_from_private(r_);
   #else
     int32_t val;
     simde_memcpy(&val, mem_addr, sizeof(val));
@@ -5783,7 +5783,7 @@ simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_TO_16 simde__m64 data[2] = {e0, e1};
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -5811,7 +5811,7 @@ simde_mm_set_epi64x (int64_t e1, int64_t e0) {
     r_.neon_i64 = vld1q_s64(data);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = wasm_i64x2_make(e0, e1);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_LIKE_16(v2i64) int64_t data[2] = {e0, e1};
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -5834,8 +5834,10 @@ simde_mm_loadu_si64 (void const* mem_addr) {
       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
     return _mm_loadu_si64(mem_addr);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
-    return __lsx_vld(mem_addr, 0);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    simde__m128i_private r_;
+    r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_d(mem_addr, 0), 8);
+    return simde__m128i_from_private(r_);
   #else
     int64_t val;
     simde_memcpy(&val, mem_addr, sizeof(val));
@@ -5870,7 +5872,7 @@ simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
     r_.neon_u8 = vld1q_u8(data);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_LIKE_16(v16u8) uint8_t data[16] = {
       e0, e1, e2, e3,
       e4, e5, e6, e7,
@@ -5904,7 +5906,7 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
     r_.neon_u16 = vld1q_u16(data);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_LIKE_16(v8u16) uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -5930,7 +5932,7 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
     r_.neon_u32 = vld1q_u32(data);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_LIKE_16(v4u32) uint32_t data[4] = {e0, e1, e2, e3};
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -5957,7 +5959,7 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
     r_.neon_u64 = vld1q_u64(data);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = wasm_u64x2_make(e0, e1);
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     SIMDE_ALIGN_LIKE_16(v2u64) uint64_t data[2] = {e0, e1};
     r_.lsx_i64 = __lsx_vld(data, 0);
   #else
@@ -5978,7 +5980,7 @@ simde_mm_set_sd (simde_float64 a) {
     return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0));
-  #elif defined(SIMD_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
     return (__m128d)__lsx_vinsgr2vr_d(__lsx_vldrepl_d(&a, 0), 0, 1);
   #else
     return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
@@ -7277,7 +7279,7 @@ simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a
     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
-  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     __lsx_vst(simde__m128d_to_private(a).lsx_i64, mem_addr, 0);
   #else
     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
@@ -7476,7 +7478,7 @@ simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
     _mm_storeu_pd(mem_addr, a);
   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
-  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     __lsx_vst(simde__m128d_to_private(a).lsx_f64, mem_addr, 0);
   #else
     simde_memcpy(mem_addr, &a, sizeof(a));
@@ -7491,7 +7493,7 @@ void
 simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) {
   #if defined(SIMDE_X86_SSE2_NATIVE)
     _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
-  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766)
     __lsx_vst(simde__m128i_to_private(a).lsx_i64, mem_addr, 0);
   #else
     simde_memcpy(mem_addr, &a, sizeof(a));
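For context on the reworked _mm_loadu_si32/_mm_loadu_si64 fallbacks above: __lsx_vldrepl_w/__lsx_vldrepl_d broadcast the loaded scalar into every lane, and __lsx_vbsrl_v then shifts the whole 128-bit register right by 12 (or 8) bytes, so only lane 0 keeps the value and the upper lanes become zero, matching the SSE2 semantics. A minimal sketch follows; the intrinsic calls mirror the patch, but the header name and feature-test macro are my assumptions for illustration:

    #if defined(__loongarch_sx)
    #include <lsxintrin.h>

    __m128i example_loadu_si32(void const *mem_addr) {
      /* replicate the 32-bit word, then byte-shift right so lanes 1..3 are zero */
      return __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12);
    }
    #endif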
diff --git a/test/test.h b/test/test.h
index 3e1b3de78..287171bfa 100644
--- a/test/test.h
+++ b/test/test.h
@@ -126,6 +126,10 @@ simde_test_debug_printf_(const char* format, ...) {
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_
 
+#if defined(SIMDE_FAST_MATH) && defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,4,0)
+__attribute__((optimize("-fno-finite-math-only")))
+__attribute__((noinline))
+#endif
 static int
 simde_test_equal_f32(simde_float32 a, simde_float32 b, simde_float32 slop) {
   if (simde_math_isnan(a)) {
@@ -156,6 +160,10 @@ simde_test_equal_f16(simde_float16 a, simde_float16 b, simde_float16 slop) {
   return simde_test_equal_f32(af, bf, slopf);
 }
 
+#if defined(SIMDE_FAST_MATH) && defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,4,0)
+__attribute__((optimize("-fno-finite-math-only")))
+__attribute__((noinline))
+#endif
 static int
 simde_test_equal_f64(simde_float64 a, simde_float64 b, simde_float64 slop) {
   if (simde_math_isnan(a)) {
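A closing note on the test-harness changes: with -ffast-math or -Ofast, GCC assumes NaNs cannot occur, so an inlined isnan() check inside the comparison helpers can be folded away and NaN-aware test comparisons could silently misbehave; compiling just these helpers with optimize("-fno-finite-math-only") and keeping them out of line restores IEEE semantics for the comparison itself. A simplified stand-in, not the harness code:

    #include <math.h>

    #if defined(__GNUC__) && !defined(__clang__)
    __attribute__((optimize("-fno-finite-math-only"), noinline))
    #endif
    static int example_equal_or_both_nan(double a, double b) {
      if (isnan(a) || isnan(b))
        return isnan(a) && isnan(b);   /* treat NaN vs. NaN as a match, as the harness does */
      return a == b;
    }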