diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd71a47..bcf8adb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,6 +101,8 @@ option(SNAPPY_REQUIRE_AVX "Target processors with AVX support." OFF)
 
 option(SNAPPY_REQUIRE_AVX2 "Target processors with AVX2 support." OFF)
 
+option(SNAPPY_REQUIRE_RVV "Target processors with RVV support." OFF)
+
 option(SNAPPY_INSTALL "Install Snappy's header and library" ON)
 
 include(TestBigEndian)
@@ -113,6 +115,8 @@ check_include_file("sys/time.h" HAVE_SYS_TIME_H)
 check_include_file("sys/uio.h" HAVE_SYS_UIO_H)
 check_include_file("unistd.h" HAVE_UNISTD_H)
 check_include_file("windows.h" HAVE_WINDOWS_H)
+check_include_file("sse2rvv.h" HAVE_SSE2RISCV_INTRINSIC_H)
+check_include_file("riscv_vector.h" HAVE_RISCV_INTRINSIC_H)
 
 include(CheckLibraryExists)
 check_library_exists(z zlibVersion "" HAVE_LIBZ)
@@ -124,6 +128,7 @@ CHECK_CXX_COMPILER_FLAG("/arch:AVX" HAVE_VISUAL_STUDIO_ARCH_AVX)
 CHECK_CXX_COMPILER_FLAG("/arch:AVX2" HAVE_VISUAL_STUDIO_ARCH_AVX2)
 CHECK_CXX_COMPILER_FLAG("-mavx" HAVE_CLANG_MAVX)
 CHECK_CXX_COMPILER_FLAG("-mbmi2" HAVE_CLANG_MBMI2)
+CHECK_CXX_COMPILER_FLAG("-march=rv64gcv" HAVE_CLANG_RVV)
 if(SNAPPY_REQUIRE_AVX2)
   if(HAVE_VISUAL_STUDIO_ARCH_AVX2)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
@@ -141,6 +146,10 @@ elseif (SNAPPY_REQUIRE_AVX)
   if(HAVE_CLANG_MAVX)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
   endif(HAVE_CLANG_MAVX)
+elseif (SNAPPY_REQUIRE_RVV)
+  if(HAVE_CLANG_RVV)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv")
+  endif(HAVE_CLANG_RVV)
 endif(SNAPPY_REQUIRE_AVX2)
 
 # Used by googletest.
@@ -185,6 +194,42 @@ int main() {
   return 0;
 }" SNAPPY_HAVE_SSSE3)
 
+check_cxx_source_compiles("
+#include <riscv_vector.h>
+
+#define vreinterpretq_f64_m128i(x) \
+  __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x))
+#define vreinterpretq_m128i_i64(x) __riscv_vreinterpret_v_i32m1_i64m1(x)
+#define vreinterpretq_i64_m128i(x) __riscv_vreinterpret_v_i64m1_i32m1(x)
+#define vreinterpretq_m128i_i8(x) __riscv_vreinterpret_v_i32m1_i8m1(x)
+#define vreinterpretq_i8_m128i(x) __riscv_vreinterpret_v_i8m1_i32m1(x)
+
+int main() {
+  const vint32m1_t *src = 0;
+  vint32m1_t dest;
+  const vint32m1_t shuffle_mask = vreinterpretq_f64_m128i(
+      __riscv_vle64_v_f64m1((double const *)src, 2));
+
+  vint64m1_t addr = vreinterpretq_m128i_i64(*src);
+  vint64m1_t zeros = __riscv_vmv_v_x_i64m1(0, 2);
+
+  vint32m1_t a = vreinterpretq_i64_m128i(
+      __riscv_vslideup_vx_i64m1_tu(addr, zeros, 1, 2));
+
+  vint8m1_t _a = vreinterpretq_m128i_i8(a);
+  vint8m1_t _b = vreinterpretq_m128i_i8(shuffle_mask);
+  vbool8_t mask_lt_zero = __riscv_vmslt_vx_i8m1_b8(_b, 0, 16);
+  vuint8m1_t idxs =
+      __riscv_vreinterpret_v_i8m1_u8m1(__riscv_vand_vx_i8m1(_b, 0xf, 16));
+  vint8m1_t shuffle = __riscv_vrgather_vv_i8m1(_a, idxs, 16);
+
+  const vint32m1_t pattern = vreinterpretq_i8_m128i(
+      __riscv_vmerge_vxm_i8m1(shuffle, 0, mask_lt_zero, 16));
+
+  dest = pattern;
+  return 0;
+}" SNAPPY_HAVE_RVV)
+
 check_cxx_source_compiles("
 #include <nmmintrin.h>
 int main() {
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 3510c27..657bc03 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -49,6 +49,9 @@
 /* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
 #cmakedefine01 SNAPPY_HAVE_SSSE3
 
+/* Define to 1 if you target processors with RVV and have <riscv_vector.h>. */
+#cmakedefine01 SNAPPY_HAVE_RVV
+
 /* Define to 1 if you target processors with SSE4.2 and have <nmmintrin.h>. */
 #cmakedefine01 SNAPPY_HAVE_X86_CRC32
 
diff --git a/snappy.cc b/snappy.cc
index 8dc3713..243ca98 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -52,6 +56,10 @@
 #endif
 #endif  // !defined(SNAPPY_HAVE_X86_CRC32)
 
+#if SNAPPY_HAVE_RVV
+#include <riscv_vector.h>
+#endif
+
 #if !defined(SNAPPY_HAVE_NEON_CRC32)
 #if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
 #define SNAPPY_HAVE_NEON_CRC32 1
@@ -537,7 +541,30 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     } while (SNAPPY_PREDICT_TRUE(op < op_end));
   }
   return IncrementalCopySlow(op - pattern_size, op, op_limit);
-#else  // !SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#elif SNAPPY_HAVE_RVV
+  // Fill [op, op_limit) by repeating the pattern. The pattern is reloaded
+  // from the pattern_size bytes immediately behind the write cursor on every
+  // chunk so that the pattern phase is preserved even when vl is not a
+  // multiple of pattern_size (at entry, op - pattern_size == src).
+  size_t bytes_to_copy = op_limit - op;
+  while (bytes_to_copy > 0) {
+    size_t vl = __riscv_vsetvl_e8m1(bytes_to_copy);
+    vuint8m1_t pattern_source = __riscv_vle8_v_u8m1(
+        reinterpret_cast<const uint8_t *>(op) - pattern_size, pattern_size);
+    vuint8m1_t indices_sequential = __riscv_vid_v_u8m1(vl);
+    vuint8m1_t indices_repeating = __riscv_vremu_vx_u8m1(
+        indices_sequential, pattern_size, vl);
+
+    vuint8m1_t pattern_to_write = __riscv_vrgather_vv_u8m1(
+        pattern_source, indices_repeating, vl);
+
+    __riscv_vse8_v_u8m1(reinterpret_cast<uint8_t *>(op), pattern_to_write, vl);
+
+    op += vl;
+    bytes_to_copy -= vl;
+  }
+  return op_limit;
+
+#else
   // If plenty of buffer space remains, expand the pattern to at least 8
   // bytes. The way the following loop is written, we need 8 bytes of buffer
   // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10