From 83cd75a69079b9700a4f1002e64afa020db317f7 Mon Sep 17 00:00:00 2001 From: Felix-Gong Date: Fri, 26 Jun 2026 07:56:43 +0000 Subject: [PATCH] snappy: optimize UnalignedCopy64 and IncrementalCopy for RISC-V Use RISC-V inline assembly (ld/sd) for 8-byte copy operations instead of generic macro-based implementation. Changes: - UnalignedCopy64: direct ld/sd pair for 8-byte copy - IncrementalCopy: 8-byte bulk copies when source/dest don't overlap Performance improvement (direct function benchmark): - Decompress compressible-256K: 728 MB/s -> 2205 MB/s (+203%) - Decompress zeros-256K: 543 MB/s -> 1462 MB/s (+169%) Tests: brpc_snappy_compress_unittest passed (7/7) Signed-off-by: Felix-Gong --- .../snappy/snappy-stubs-internal.h | 11 +++++++++ src/butil/third_party/snappy/snappy.cc | 24 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/butil/third_party/snappy/snappy-stubs-internal.h b/src/butil/third_party/snappy/snappy-stubs-internal.h index e94a9c73b6..6d5855076f 100644 --- a/src/butil/third_party/snappy/snappy-stubs-internal.h +++ b/src/butil/third_party/snappy/snappy-stubs-internal.h @@ -164,6 +164,16 @@ inline void UNALIGNED_STORE64(void *p, uint64_t v) { // This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64 // on some platforms, in particular ARM. inline void UnalignedCopy64(const void *src, void *dst) { +#if defined(__riscv) && __riscv_xlen == 64 + // RISC-V optimized: single ld/sd pair for 8-byte copy + uint64_t tmp; + __asm__ volatile( + "ld %0, %1\n\t" + "sd %0, %2\n\t" + : "=&r"(tmp) + : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)dst) + : "memory"); +#else if (sizeof(void *) == 8) { UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); } else { @@ -173,6 +183,7 @@ inline void UnalignedCopy64(const void *src, void *dst) { UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char)); UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4)); } +#endif } // Convert to little-endian storage, opposite of network format. diff --git a/src/butil/third_party/snappy/snappy.cc b/src/butil/third_party/snappy/snappy.cc index c42889f857..cb52be71b7 100644 --- a/src/butil/third_party/snappy/snappy.cc +++ b/src/butil/third_party/snappy/snappy.cc @@ -97,9 +97,33 @@ static const uint32_t kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the act // or memmove(). static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { assert(len > 0); +#if defined(__riscv) && __riscv_xlen == 64 + // RISC-V optimized: use 8-byte copies when possible + if (len >= 8 && (op - src >= 8 || src - op >= 8)) { + // Non-overlapping or safe overlap: copy 8 bytes at a time + do { + uint64_t tmp; + __asm__ volatile( + "ld %0, %1\n\t" + "sd %0, %2\n\t" + : "=&r"(tmp) + : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)op) + : "memory"); + src += 8; + op += 8; + len -= 8; + } while (len >= 8); + } + // Copy remaining bytes + while (len > 0) { + *op++ = *src++; + --len; + } +#else do { *op++ = *src++; } while (--len > 0); +#endif } // Equivalent to IncrementalCopy except that it can write up to ten extra