diff --git a/CMakeLists.txt b/CMakeLists.txt
index 353154f..e707d83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,19 @@ option(SIMDCOMP_NATIVE
   "Tune for the building machine (-march=native); enables AVX2/AVX-512 on \
 capable x86 hosts" ON)
 
+# Classify the target CPU once. CMAKE_SYSTEM_PROCESSOR is lower-cased first
+# because its spelling varies by platform (e.g. "AMD64" or "x86" on Windows).
+# Any target that is neither x86 nor RISC-V leaves both flags FALSE.
+string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SIMDCOMP_SYSTEM_PROCESSOR_LOWER)
+set(SIMDCOMP_TARGET_IS_X86 FALSE)
+set(SIMDCOMP_TARGET_IS_RISCV FALSE)
+if(SIMDCOMP_SYSTEM_PROCESSOR_LOWER MATCHES "^(x86|x86_64|amd64|i[3-6]86)$")
+  set(SIMDCOMP_TARGET_IS_X86 TRUE)
+elseif(SIMDCOMP_SYSTEM_PROCESSOR_LOWER MATCHES "^riscv")
+  set(SIMDCOMP_TARGET_IS_RISCV TRUE)
+  message(STATUS "RISC-V target detected; using scalar 128-bit compatibility shim")
+endif()
+
 # Default to an optimized build when the user did not pick one.
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -55,14 +68,22 @@ set_target_properties(simdcomp PROPERTIES
   SOVERSION ${PROJECT_VERSION_MAJOR}
   POSITION_INDEPENDENT_CODE ON)
 
+# portability.h selects the scalar shim on __riscv; pass it explicitly.
+if(SIMDCOMP_TARGET_IS_RISCV)
+  target_compile_definitions(simdcomp PRIVATE __riscv=1 __riscv_xlen=64)
+endif()
+
 # -march=native (when requested and supported by the compiler).
+# Only RISC-V opts out; every other target keeps the compiler probe.
 include(CheckCCompilerFlag)
 set(SIMDCOMP_HAS_MARCH_NATIVE FALSE)
-if(SIMDCOMP_NATIVE)
+if(SIMDCOMP_NATIVE AND NOT SIMDCOMP_TARGET_IS_RISCV)
   check_c_compiler_flag("-march=native" SIMDCOMP_HAS_MARCH_NATIVE)
   if(SIMDCOMP_HAS_MARCH_NATIVE)
     target_compile_options(simdcomp PRIVATE -march=native)
   endif()
+elseif(SIMDCOMP_NATIVE AND SIMDCOMP_TARGET_IS_RISCV)
+  message(STATUS "Skipping -march=native for RISC-V target")
 endif()
 
 # Warnings, mirroring the previous Makefile, on GCC/Clang only.
@@ -77,6 +93,16 @@ function(simdcomp_apply_native target)
   endif()
 endfunction()
 
+# simdcomp_apply_riscv_defs(<target>)
+# Adds the private __riscv / __riscv_xlen compile definitions to <target> when
+# SIMDCOMP_TARGET_IS_RISCV is set; does nothing on every other target.
+function(simdcomp_apply_riscv_defs target)
+  if(NOT SIMDCOMP_TARGET_IS_RISCV)
+    return()
+  endif()
+  target_compile_definitions(${target} PRIVATE __riscv=1 __riscv_xlen=64)
+endfunction()
+
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
@@ -86,11 +112,13 @@ if(SIMDCOMP_BUILD_TESTS)
   add_executable(unit tests/unit.c)
   target_link_libraries(unit PRIVATE simdcomp)
   simdcomp_apply_native(unit)
+  simdcomp_apply_riscv_defs(unit)
   add_test(NAME unit COMMAND unit)
 
   add_executable(unit_chars tests/unit_chars.c)
   target_link_libraries(unit_chars PRIVATE simdcomp)
   simdcomp_apply_native(unit_chars)
+  simdcomp_apply_riscv_defs(unit_chars)
   add_test(NAME unit_chars COMMAND unit_chars)
 endif()
 
@@ -101,6 +129,7 @@ if(SIMDCOMP_BUILD_EXAMPLES)
   add_executable(example example/example.c)
   target_link_libraries(example PRIVATE simdcomp)
   simdcomp_apply_native(example)
+  simdcomp_apply_riscv_defs(example)
 endif()
 
 # ---------------------------------------------------------------------------
@@ -127,10 +156,12 @@ if(SIMDCOMP_BUILD_BENCHMARKS)
     CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON)
   simdcomp_apply_native(bitpackingbenchmark)
+  simdcomp_apply_riscv_defs(bitpackingbenchmark)
 
   add_executable(benchmark benchmarks/benchmark.c)
   target_link_libraries(benchmark PRIVATE simdcomp)
   simdcomp_apply_native(benchmark)
+  simdcomp_apply_riscv_defs(benchmark)
 endif()
 
 # ---------------------------------------------------------------------------
diff --git a/README.md b/README.md
index 8937da0..ecb56e0 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,8 @@ This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
 On a Skylake Intel processor, it can decode integers at a rate 0.3 cycles per integer,
 which can easily translate into more than 8 decoded billions integers per second.
 
-It runs on both x86/x64 (SSE/AVX) and 64-bit ARM (NEON, e.g. Apple Silicon). See +It runs on x86/x64 (SSE/AVX), 64-bit ARM (NEON, e.g. Apple Silicon), and +RISC-V through a conservative scalar compatibility backend. See [Platforms](#platforms) below. This library is part of the [Awesome C](https://github.com/kozross/awesome-c) list of C resources. @@ -39,6 +40,7 @@ Requirements - On x86/x64: your processor should support SSE4.1 (supported by most Intel and AMD processors released since 2008). The core bit-packing functions only require SSE2 (Pentium4 or better). - On ARM: an AArch64/ARM processor with NEON (e.g. Apple Silicon). The SSE intrinsics are mapped to NEON by our own self-contained shim (`include/neon128.h`); no third-party translation library is pulled in. +- On RISC-V: the library builds through a small scalar 128-bit compatibility shim (`include/riscv128.h`). This preserves the existing API but does not provide RVV acceleration. - A C99 (or better) compiler, plus a C++17 compiler if you build the benchmarks. - CMake 3.14 or better. @@ -47,7 +49,7 @@ For a plain C version that does not use SIMD instructions, see https://github.co Platforms --------- -The library supports two SIMD backends behind the same API: +The library supports three backends behind the same API: - **x86 / x64** — Intel/AMD SSE (with optional AVX2 and AVX-512 code paths, enabled automatically when you build with `-march=native` on a capable host). @@ -57,10 +59,15 @@ The library supports two SIMD backends behind the same API: written directly against ``; no third-party translation layer (such as sse2neon) is pulled in. The wider AVX2/AVX-512 paths are x86-only and are simply inactive on ARM. - -The public API is identical on both: it is selected automatically at compile -time, so the same source (including the `__m128i`-based entry points) builds on -either architecture. 
+- **RISC-V** — the same 128-bit kernel sources build through a conservative
+  scalar compatibility shim in `include/riscv128.h`. This keeps the existing
+  `__m128i`-based API available on RISC-V without pulling in any x86 headers,
+  while leaving AVX2/AVX-512 inactive. It is a portability path, not an RVV
+  optimization backend.
+
+The public API is identical across these backends: it is selected automatically
+at compile time, so the same source (including the `__m128i`-based entry
+points) builds on each architecture.
 
 Usage
 -------
diff --git a/include/portability.h b/include/portability.h
index 032bd56..b650aab 100644
--- a/include/portability.h
+++ b/include/portability.h
@@ -78,6 +78,22 @@ typedef signed char int8_t;
     defined(_M_ARM64)
 /* ARM NEON: use our own SSE-on-NEON shim instead of the x86 intrinsics. */
 #include "neon128.h"
+#elif defined(__riscv)
+/* RISC-V: use a conservative scalar 128-bit shim; this is compatibility, not
+ * RVV acceleration. */
+#include "riscv128.h"
+#ifndef __SSE2__
+#define __SSE2__ 1
+#endif
+#ifndef __SSSE3__
+#define __SSSE3__ 1
+#endif
+#ifndef __SSE4_1__
+#define __SSE4_1__ 1
+#endif
+#ifndef __SSE4_2__
+#define __SSE4_2__ 1
+#endif
 #else
 #include
 #endif
diff --git a/include/riscv128.h b/include/riscv128.h
new file mode 100644
index 0000000..d9d0fc4
--- /dev/null
+++ b/include/riscv128.h
@@ -0,0 +1,301 @@
+/**
+ * This code is released under a BSD License.
+ *
+ * riscv128.h -- a small, self-contained scalar implementation of the handful
+ * of 128-bit Intel SSE2/SSSE3/SSE4.1 intrinsics that simdcomp actually uses.
+ *
+ * This is intentionally conservative: it provides source compatibility for the
+ * existing 128-bit kernels on RISC-V, but it does not claim RVV acceleration.
+ * The wider AVX2/AVX-512 code paths remain x86-only.
+ *
+ * Lane arithmetic uses the unsigned view of each lane so that wrap-around
+ * matches the hardware intrinsics instead of being signed-overflow undefined
+ * behaviour in C.
+ */
+#ifndef SIMDCOMP_RISCV128_H_
+#define SIMDCOMP_RISCV128_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define SIMDCOMP_RISCV_INLINE __inline__ __attribute__((always_inline))
+#else
+#define SIMDCOMP_RISCV_INLINE inline
+#endif
+
+/* 128-bit register emulated as four 32-bit lanes / sixteen bytes. */
+typedef union {
+  uint32_t u32[4];
+  int32_t s32[4];
+  uint8_t u8[16];
+  float f32[4];
+} __m128i;
+
+/* __m128 is only ever used as a bit-for-bit view of __m128i here. */
+typedef __m128i __m128;
+
+/* Load 16 bytes from a possibly unaligned address. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_loadu_si128(const __m128i *p) {
+  __m128i out;
+  memcpy(&out, p, sizeof(out));
+  return out;
+}
+
+/* Aligned load; forwards to the unaligned version. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_load_si128(const __m128i *p) {
+  return _mm_loadu_si128(p);
+}
+
+/* Store 16 bytes to a possibly unaligned address. */
+static SIMDCOMP_RISCV_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) {
+  memcpy(p, &a, sizeof(a));
+}
+
+/* Aligned store; forwards to the unaligned version. */
+static SIMDCOMP_RISCV_INLINE void _mm_store_si128(__m128i *p, __m128i a) {
+  _mm_storeu_si128(p, a);
+}
+
+/* All 128 bits cleared. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_setzero_si128(void) {
+  __m128i out;
+  memset(&out, 0, sizeof(out));
+  return out;
+}
+
+/* Broadcast v to all four 32-bit lanes. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_set1_epi32(int v) {
+  __m128i out;
+  out.s32[0] = v;
+  out.s32[1] = v;
+  out.s32[2] = v;
+  out.s32[3] = v;
+  return out;
+}
+
+/* Build a register from four lanes given in memory (low-to-high) order. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_setr_epi32(int e0, int e1, int e2,
+                                                    int e3) {
+  __m128i out;
+  out.s32[0] = e0;
+  out.s32[1] = e1;
+  out.s32[2] = e2;
+  out.s32[3] = e3;
+  return out;
+}
+
+/* Bitwise AND of the two registers. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = a.u32[0] & b.u32[0];
+  out.u32[1] = a.u32[1] & b.u32[1];
+  out.u32[2] = a.u32[2] & b.u32[2];
+  out.u32[3] = a.u32[3] & b.u32[3];
+  return out;
+}
+
+/* Bitwise OR of the two registers. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = a.u32[0] | b.u32[0];
+  out.u32[1] = a.u32[1] | b.u32[1];
+  out.u32[2] = a.u32[2] | b.u32[2];
+  out.u32[3] = a.u32[3] | b.u32[3];
+  return out;
+}
+
+/* Lane-wise 32-bit addition with modulo-2^32 wrap-around. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = a.u32[0] + b.u32[0];
+  out.u32[1] = a.u32[1] + b.u32[1];
+  out.u32[2] = a.u32[2] + b.u32[2];
+  out.u32[3] = a.u32[3] + b.u32[3];
+  return out;
+}
+
+/* Lane-wise 32-bit subtraction with modulo-2^32 wrap-around. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = a.u32[0] - b.u32[0];
+  out.u32[1] = a.u32[1] - b.u32[1];
+  out.u32[2] = a.u32[2] - b.u32[2];
+  out.u32[3] = a.u32[3] - b.u32[3];
+  return out;
+}
+
+/* Lane-wise unsigned 32-bit minimum. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = (a.u32[0] < b.u32[0]) ? a.u32[0] : b.u32[0];
+  out.u32[1] = (a.u32[1] < b.u32[1]) ? a.u32[1] : b.u32[1];
+  out.u32[2] = (a.u32[2] < b.u32[2]) ? a.u32[2] : b.u32[2];
+  out.u32[3] = (a.u32[3] < b.u32[3]) ? a.u32[3] : b.u32[3];
+  return out;
+}
+
+/* Lane-wise unsigned 32-bit maximum. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = (a.u32[0] > b.u32[0]) ? a.u32[0] : b.u32[0];
+  out.u32[1] = (a.u32[1] > b.u32[1]) ? a.u32[1] : b.u32[1];
+  out.u32[2] = (a.u32[2] > b.u32[2]) ? a.u32[2] : b.u32[2];
+  out.u32[3] = (a.u32[3] > b.u32[3]) ? a.u32[3] : b.u32[3];
+  return out;
+}
+
+/* Shift every 32-bit lane left by count bits; counts outside 0..31 (negative
+ * ones included) yield zero rather than an undefined C shift. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_slli_epi32(__m128i a, int count) {
+  __m128i out;
+  if ((unsigned)count >= 32U) {
+    return _mm_setzero_si128();
+  }
+  out.u32[0] = a.u32[0] << count;
+  out.u32[1] = a.u32[1] << count;
+  out.u32[2] = a.u32[2] << count;
+  out.u32[3] = a.u32[3] << count;
+  return out;
+}
+
+/* Logical right shift of every 32-bit lane; counts outside 0..31 (negative
+ * ones included) yield zero rather than an undefined C shift. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_srli_epi32(__m128i a, int count) {
+  __m128i out;
+  if ((unsigned)count >= 32U) {
+    return _mm_setzero_si128();
+  }
+  out.u32[0] = a.u32[0] >> count;
+  out.u32[1] = a.u32[1] >> count;
+  out.u32[2] = a.u32[2] >> count;
+  out.u32[3] = a.u32[3] >> count;
+  return out;
+}
+
+/* Lane-wise signed a < b; a true lane is all ones, a false lane all zeros. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) {
+  __m128i out;
+  out.u32[0] = (a.s32[0] < b.s32[0]) ? 0xFFFFFFFFU : 0U;
+  out.u32[1] = (a.s32[1] < b.s32[1]) ? 0xFFFFFFFFU : 0U;
+  out.u32[2] = (a.s32[2] < b.s32[2]) ? 0xFFFFFFFFU : 0U;
+  out.u32[3] = (a.s32[3] < b.s32[3]) ? 0xFFFFFFFFU : 0U;
+  return out;
+}
+
+/* Lowest 32-bit lane as a signed int. */
+static SIMDCOMP_RISCV_INLINE int _mm_cvtsi128_si32(__m128i a) {
+  return a.s32[0];
+}
+
+/* Lane (imm & 3) as a signed int. */
+#define _mm_extract_epi32(a, imm) ((a).s32[(imm) & 3])
+
+/* Permute the four lanes; each 2-bit field of imm selects a source lane.
+ * A real function so that the register argument is evaluated only once. */
+static SIMDCOMP_RISCV_INLINE __m128i simdcomp_riscv_shuffle_epi32(__m128i a,
+                                                                  int imm) {
+  return _mm_setr_epi32(a.s32[imm & 3], a.s32[(imm >> 2) & 3],
+                        a.s32[(imm >> 4) & 3], a.s32[(imm >> 6) & 3]);
+}
+
+#define _mm_shuffle_epi32(a, imm) simdcomp_riscv_shuffle_epi32((a), (imm))
+
+/* Bytes count..count+15 of the 32-byte concatenation a:b (b is the low half).
+ * Positions past the end of the concatenation read as zero. */
+static SIMDCOMP_RISCV_INLINE __m128i simdcomp_riscv_alignr_epi8(__m128i a,
+                                                                __m128i b,
+                                                                int count) {
+  __m128i out;
+  int i;
+  if ((unsigned)count >= 32U) {
+    return _mm_setzero_si128();
+  }
+  for (i = 0; i < 16; ++i) {
+    int src = count + i;
+    if (src < 16) {
+      out.u8[i] = b.u8[src];
+    } else if (src < 32) {
+      out.u8[i] = a.u8[src - 16];
+    } else {
+      out.u8[i] = 0;
+    }
+  }
+  return out;
+}
+
+/* Shift the whole register right by count bytes, filling with zeros. */
+static SIMDCOMP_RISCV_INLINE __m128i simdcomp_riscv_srli_si128(__m128i a,
+                                                               int count) {
+  __m128i out;
+  int i;
+  if ((unsigned)count >= 16U) {
+    return _mm_setzero_si128();
+  }
+  for (i = 0; i < 16; ++i) {
+    int src = i + count;
+    out.u8[i] = (src < 16) ? a.u8[src] : 0;
+  }
+  return out;
+}
+
+/* Shift the whole register left by count bytes, filling with zeros. */
+static SIMDCOMP_RISCV_INLINE __m128i simdcomp_riscv_slli_si128(__m128i a,
+                                                               int count) {
+  __m128i out;
+  int i;
+  if ((unsigned)count >= 16U) {
+    return _mm_setzero_si128();
+  }
+  for (i = 0; i < 16; ++i) {
+    int src = i - count;
+    out.u8[i] = (src >= 0) ? a.u8[src] : 0;
+  }
+  return out;
+}
+
+#define _mm_alignr_epi8(a, b, imm) \
+  simdcomp_riscv_alignr_epi8((a), (b), (imm))
+
+#define _mm_srli_si128(a, imm) simdcomp_riscv_srli_si128((a), (imm))
+
+#define _mm_slli_si128(a, imm) simdcomp_riscv_slli_si128((a), (imm))
+
+/* Byte shuffle: a mask byte with the top bit set yields zero, otherwise its
+ * low four bits select the source byte of a. */
+static SIMDCOMP_RISCV_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i mask) {
+  __m128i out;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    uint8_t control = mask.u8[i];
+    if (control & 0x80U) {
+      out.u8[i] = 0;
+    } else {
+      out.u8[i] = a.u8[control & 0x0FU];
+    }
+  }
+  return out;
+}
+
+/* __m128 and __m128i are the same type in this shim, so the cast is a no-op. */
+#define _mm_castsi128_ps(a) (a)
+
+/* Collect the sign bit of each 32-bit lane into bits 0..3 of the result. */
+static SIMDCOMP_RISCV_INLINE int _mm_movemask_ps(__m128 a) {
+  int mask = 0;
+  if (a.u32[0] & 0x80000000U) {
+    mask |= 1;
+  }
+  if (a.u32[1] & 0x80000000U) {
+    mask |= 2;
+  }
+  if (a.u32[2] & 0x80000000U) {
+    mask |= 4;
+  }
+  if (a.u32[3] & 0x80000000U) {
+    mask |= 8;
+  }
+  return mask;
+}
+
+#endif /* SIMDCOMP_RISCV128_H_ */
diff --git a/include/simdbitpacking.h b/include/simdbitpacking.h
index 499be6d..e06a22a 100644
--- a/include/simdbitpacking.h
+++ b/include/simdbitpacking.h
@@ -6,9 +6,9 @@
 
 #include "portability.h"
 
-/* SSE2 is required (on ARM, neon128.h via portability.h provides the shim) */
+/* SSE2 is required (on ARM/RISC-V, portability.h provides the shim). */
 #if !(defined(__aarch64__) || defined(__arm__) || defined(__ARM_NEON) || \
-      defined(_M_ARM64))
+      defined(_M_ARM64) || defined(__riscv))
 #include
 #endif
 /* for memset */
diff --git a/include/simdcomputil.h b/include/simdcomputil.h
index 810b739..7d5fd29 100644
--- a/include/simdcomputil.h
+++ b/include/simdcomputil.h
@@ -7,9 +7,9 @@
 
 #include "portability.h"
 
-/* SSE2 is required (on ARM, neon128.h via portability.h provides the shim) */
+/* SSE2 is required (on ARM/RISC-V, portability.h provides the shim). */
 #if !(defined(__aarch64__) || defined(__arm__) || defined(__ARM_NEON) || \
-      defined(_M_ARM64))
+      defined(_M_ARM64) || defined(__riscv))
 #include
 #endif
 
diff --git a/include/simdfor.h b/include/simdfor.h
index 9e6fd06..5111f2f 100644
--- a/include/simdfor.h
+++ b/include/simdfor.h
@@ -6,9 +6,9 @@
 
 #include "portability.h"
 
-/* SSE2 is required (on ARM, neon128.h via portability.h provides the shim) */
+/* SSE2 is required (on ARM/RISC-V, portability.h provides the shim). */
 #if !(defined(__aarch64__) || defined(__arm__) || defined(__ARM_NEON) || \
-      defined(_M_ARM64))
+      defined(_M_ARM64) || defined(__riscv))
 #include
 #endif
 
diff --git a/include/simdintegratedbitpacking.h b/include/simdintegratedbitpacking.h
index 545b742..abd65e2 100644
--- a/include/simdintegratedbitpacking.h
+++ b/include/simdintegratedbitpacking.h
@@ -7,9 +7,9 @@
 
 #include "portability.h"
 
-/* SSE2 is required (on ARM, neon128.h via portability.h provides the shim) */
+/* SSE2 is required (on ARM/RISC-V, portability.h provides the shim). */
 #if !(defined(__aarch64__) || defined(__arm__) || defined(__ARM_NEON) || \
-      defined(_M_ARM64))
+      defined(_M_ARM64) || defined(__riscv))
 #include
 #endif
 
diff --git a/src/simdcomputil.c b/src/simdcomputil.c
index 87e45ca..50fbf3c 100644
--- a/src/simdcomputil.c
+++ b/src/simdcomputil.c
@@ -3,7 +3,7 @@
  */
 
 #include "simdcomputil.h"
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) && !defined(__riscv)
 #include
 #endif
 #include
diff --git a/src/simdpackedsearch.c b/src/simdpackedsearch.c
index d650270..8ecfb86 100644
--- a/src/simdpackedsearch.c
+++ b/src/simdpackedsearch.c
@@ -2,10 +2,10 @@
  * This code is released under a BSD License.
  */
 #if defined(__SSE4_1__) || defined(__aarch64__) || defined(__arm__) || \
-    defined(__ARM_NEON) || defined(_M_ARM64)
+    defined(__ARM_NEON) || defined(_M_ARM64) || defined(__riscv)
 
 #include "simdintegratedbitpacking.h"
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) && !defined(__riscv)
 #include
 #endif
 
diff --git a/src/simdpackedselect.c b/src/simdpackedselect.c
index 9e81002..75b7c75 100644
--- a/src/simdpackedselect.c
+++ b/src/simdpackedselect.c
@@ -2,9 +2,9 @@
  * This code is released under a BSD License.
  */
 #if defined(__SSE4_1__) || defined(__aarch64__) || defined(__arm__) || \
-    defined(__ARM_NEON) || defined(_M_ARM64)
+    defined(__ARM_NEON) || defined(_M_ARM64) || defined(__riscv)
 #include "simdintegratedbitpacking.h"
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) && !defined(__riscv)
 #include
 #endif