Optimize zippy MemCpy / MemMove during decompression

By default, MemCopy() / MemMove() always copy 64 bytes in DecompressBranchless(). Profiling shows that the vast majority of the time we need to copy far fewer bytes (typically <= 16 bytes). It is safe to copy fewer than 64 bytes as long as we still copy at least len bytes.

This change improves throughput by ~12% on ARM, ~35% on AMD Milan, and ~7% on Intel Cascade Lake.
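To make the trick concrete, here is a minimal self-contained sketch of the fixed-size over-copy pattern the change relies on. The helper name CopyAtLeast and its standalone form are illustrative only (the helper actually added by this change is MemCopy64, shown in the diff below); the key point is that constant-length memmove calls typically compile to a few unconditional vector loads and stores, so copying a fixed 32 bytes plus a rarely taken branch beats a variable-length copy of exactly size bytes, provided the destination always has 64 bytes of headroom.

// Illustrative sketch only, assuming the caller guarantees 64 writable bytes
// at dst and that [src, src + size) does not overlap [dst, dst + size).
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

inline void CopyAtLeast(char* dst, const uint8_t* src, size_t size) {
  assert(size <= 64);
  // Constant-length copy: the compiler emits straight-line loads/stores.
  std::memmove(dst, src, 32);
  // Rare case per the profiling above: finish out the full 64 bytes.
  if (size > 32) {
    std::memmove(dst + 32, src + 32, 32);
  }
}

The committed version additionally wraps the rare branch in SNAPPY_PREDICT_FALSE and asserts the non-overlap precondition.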

PiperOrigin-RevId: 453917840
Authored by Snappy Team on 2022-06-09 14:13:38 +00:00; committed by Victor Costan
parent 6a2b78a379
commit d261d2766f
1 changed file with 29 additions and 16 deletions


@@ -983,22 +983,35 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
   return offset != 0;
 }
 
-void MemCopy(char* dst, const uint8_t* src, size_t size) {
-  std::memcpy(dst, src, size);
+// Copies between size bytes and 64 bytes from src to dest. size cannot exceed
+// 64. More than size bytes, but never exceeding 64, might be copied if doing
+// so gives better performance.
+void MemCopy64(char* dst, const void* src, size_t size) {
+  // Always copy this many bytes, test if we need to copy more.
+  constexpr int kShortMemCopy = 32;
+  // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
+  // copy 64 rather than the exact amount.
+  constexpr int kLongMemCopy = 64;
+
+  assert(size <= kLongMemCopy);
+  // [src, src + size) must not overlap with [dst, dst + size)
+  assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
+                                        dst) ||
+         std::less_equal<const void*>()(dst + size, src));
+
+  // We know that src and dst are at least size bytes apart. However, because we
+  // might copy more than size bytes the copy still might overlap past size.
+  // E.g. if src and dst appear consecutively in memory (src + size == dst).
+  std::memmove(dst, src, kShortMemCopy);
+  // Profiling shows that nearly all copies are short.
+  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+    std::memmove(dst + kShortMemCopy,
+                 static_cast<const uint8_t*>(src) + kShortMemCopy,
+                 kLongMemCopy - kShortMemCopy);
+  }
 }
 
-void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {
-  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
-  (void)dst;
-  (void)src;
-  (void)size;
-}
-
-void MemMove(char* dst, const void* src, size_t size) {
-  std::memmove(dst, src, size);
-}
-
-void MemMove(ptrdiff_t dst, const void* src, size_t size) {
+void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
   // TODO: Switch to [[maybe_unused]] when we can assume C++17.
   (void)dst;
   (void)src;
@@ -1170,7 +1183,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
         // Due to the spurious offset in literals have this will trigger
         // at the start of a block when op is still smaller than 256.
         if (tag_type != 0) goto break_loop;
-        MemCopy(op_base + op, old_ip, 64);
+        MemCopy64(op_base + op, old_ip, len);
         op += len;
         continue;
       }
@@ -1179,7 +1192,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
       // we need to copy from ip instead of from the stream.
       const void* from =
          tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
-      MemMove(op_base + op, from, 64);
+      MemCopy64(op_base + op, from, len);
       op += len;
     }
   } while (ip < ip_limit_min_slop && op < op_limit_min_slop);
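For anyone wanting to poke at the new helper outside the snappy tree, a hypothetical harness along these lines (the simplified copy drops SNAPPY_PREDICT_FALSE and the non-overlap assert, and main and the buffer sizes are mine, not the repository's) demonstrates the contract that the destination must have a full 64 bytes of headroom even when size is small:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified stand-in for the MemCopy64 added above; SNAPPY_PREDICT_FALSE and
// the non-overlap assert are omitted so this compiles standalone.
void MemCopy64(char* dst, const void* src, size_t size) {
  constexpr int kShortMemCopy = 32;
  constexpr int kLongMemCopy = 64;
  assert(size <= kLongMemCopy);
  std::memmove(dst, src, kShortMemCopy);
  if (size > kShortMemCopy) {
    std::memmove(dst + kShortMemCopy,
                 static_cast<const uint8_t*>(src) + kShortMemCopy,
                 kLongMemCopy - kShortMemCopy);
  }
}

int main() {
  uint8_t src[64];
  for (int i = 0; i < 64; ++i) src[i] = static_cast<uint8_t>(i);
  char dst[64] = {};  // full 64 bytes of headroom, even though we ask for 16
  MemCopy64(dst, src, 16);
  // Only the first 16 bytes are meaningful; bytes 16..63 may also have been
  // overwritten, which is exactly what the decompressor's slop region allows.
  std::printf("first 16 bytes match: %d\n", std::memcmp(dst, src, 16) == 0);
  return 0;
}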