From d261d2766fa6c7932994ed3c5b14d7911629c971 Mon Sep 17 00:00:00 2001
From: Snappy Team
Date: Thu, 9 Jun 2022 14:13:38 +0000
Subject: [PATCH] Optimize zippy MemCpy / MemMove during decompression

By default MemCpy() / MemMove() always copies 64 bytes in
DecompressBranchless(). Profiling shows that the vast majority of the
time we need to copy many fewer bytes (typically <= 16 bytes). It is
safe to copy fewer bytes as long as we exceed len.

This change improves throughput by ~12% on ARM, ~35% on AMD Milan, and
~7% on Intel Cascade Lake.

PiperOrigin-RevId: 453917840
---
 snappy.cc | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index 30505ad..21d0c01 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -983,22 +983,35 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
   return offset != 0;
 }
 
-void MemCopy(char* dst, const uint8_t* src, size_t size) {
-  std::memcpy(dst, src, size);
+// Copies between size bytes and 64 bytes from src to dest. size cannot exceed
+// 64. More than size bytes, but never exceeding 64, might be copied if doing
+// so gives better performance.
+void MemCopy64(char* dst, const void* src, size_t size) {
+  // Always copy this many bytes, test if we need to copy more.
+  constexpr int kShortMemCopy = 32;
+  // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
+  // copy 64 rather than the exact amount.
+  constexpr int kLongMemCopy = 64;
+
+  assert(size <= kLongMemCopy);
+  // [src, src + size) must not overlap with [dst, dst + size)
+  assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
+                                        dst) ||
+         std::less_equal<const void*>()(dst + size, src));
+
+  // We know that src and dst are at least size bytes apart. However, because we
+  // might copy more than size bytes the copy still might overlap past size.
+  // E.g. if src and dst appear consecutively in memory (src + size == dst).
+  std::memmove(dst, src, kShortMemCopy);
+  // Profiling shows that nearly all copies are short.
+  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+    std::memmove(dst + kShortMemCopy,
+                 static_cast<const char*>(src) + kShortMemCopy,
+                 kLongMemCopy - kShortMemCopy);
+  }
 }
 
-void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {
-  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
-  (void)dst;
-  (void)src;
-  (void)size;
-}
-
-void MemMove(char* dst, const void* src, size_t size) {
-  std::memmove(dst, src, size);
-}
-
-void MemMove(ptrdiff_t dst, const void* src, size_t size) {
+void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
   // TODO: Switch to [[maybe_unused]] when we can assume C++17.
   (void)dst;
   (void)src;
   (void)size;
@@ -1170,7 +1183,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
         // Due to the spurious offset in literals have this will trigger
         // at the start of a block when op is still smaller than 256.
         if (tag_type != 0) goto break_loop;
-        MemCopy(op_base + op, old_ip, 64);
+        MemCopy64(op_base + op, old_ip, len);
         op += len;
         continue;
       }
@@ -1179,7 +1192,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
         // we need to copy from ip instead of from the stream.
         const void* from =
             tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
-        MemMove(op_base + op, from, 64);
+        MemCopy64(op_base + op, from, len);
         op += len;
       }
     } while (ip < ip_limit_min_slop && op < op_limit_min_slop);
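
Note (not part of the patch above): the following is a minimal, standalone
sketch of the technique the patch applies, assuming the caller guarantees 64
readable bytes at the source and 64 writable bytes at the destination, which
the previous unconditional 64-byte copy already required. The names
CopyAtLeast, kShort, kLong and the main() harness are hypothetical and do not
appear in snappy; the real MemCopy64 additionally uses SNAPPY_PREDICT_FALSE
and asserts that [src, src + size) and [dst, dst + size) do not overlap.

#include <cassert>
#include <cstddef>
#include <cstring>

// Copies at least `size` bytes from src to dst and may copy up to kLong
// bytes. The caller must guarantee that kLong bytes are readable at src and
// writable at dst, and that [src, src + size) does not overlap
// [dst, dst + size).
void CopyAtLeast(char* dst, const void* src, std::size_t size) {
  constexpr std::size_t kShort = 32;  // unconditionally copied prefix
  constexpr std::size_t kLong = 64;   // upper bound guaranteed by the caller
  assert(size <= kLong);
  // memmove rather than memcpy: beyond `size` the two regions may overlap,
  // e.g. when src + size == dst.
  std::memmove(dst, src, kShort);
  if (size > kShort) {  // rarely taken, per the profile cited in the message
    std::memmove(dst + kShort, static_cast<const char*>(src) + kShort,
                 kLong - kShort);
  }
}

int main() {
  // 64 readable bytes at the source and 64 writable bytes at the destination.
  char buf[192] = "example payload: 0123456789abcdef0123456789abcdef";
  CopyAtLeast(buf + 96, buf, 10);  // requests 10 bytes, may move up to 64
  assert(std::memcmp(buf + 96, buf, 10) == 0);
  return 0;
}

A constant-size 32-byte memmove compiles to a few unconditional wide moves,
and the tail copy runs only for the rare lengths above 32, which is consistent
with the throughput gains the commit message reports on ARM, AMD Milan and
Intel Cascade Lake.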