From a2d219a8a801ae522bac8e966de005fcb336821b Mon Sep 17 00:00:00 2001
From: Snappy Team
Date: Tue, 11 Oct 2022 16:00:34 +0000
Subject: [PATCH] Modify MemCopy64 to use AVX 32 byte copies instead of SSE2
 16 byte copies on capable x86 platforms.

This gives an average speedup of 6.87% on Milan and 1.90% on Skylake.

PiperOrigin-RevId: 480370725
---
 snappy.cc | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index b072e5d..932f59f 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -989,27 +989,36 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
 // so gives better performance. [src, src + size) must not overlap with
 // [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
 void MemCopy64(char* dst, const void* src, size_t size) {
-  // Always copy this many bytes, test if we need to copy more.
+  // Always copy this many bytes. If that's below size then copy the full 64.
   constexpr int kShortMemCopy = 32;
-  // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
-  // copy 64 rather than the exact amount.
-  constexpr int kLongMemCopy = 64;
 
-  assert(size <= kLongMemCopy);
+  assert(size <= 64);
   assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
                                         dst) ||
          std::less_equal<const void*>()(dst + size, src));
 
   // We know that src and dst are at least size bytes apart. However, because we
   // might copy more than size bytes the copy still might overlap past size.
-  // E.g. if src and dst appear consecutively in memory (src + size == dst).
+  // E.g. if src and dst appear consecutively in memory (src + size >= dst).
+  // TODO: Investigate wider copies on other platforms.
+#if defined(__x86_64__) && defined(__AVX__)
+  assert(kShortMemCopy <= 32);
+  __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));
+  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);
+  // Profiling shows that nearly all copies are short.
+  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+    data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
+  }
+#else
   std::memmove(dst, src, kShortMemCopy);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
     std::memmove(dst + kShortMemCopy,
                  static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 kLongMemCopy - kShortMemCopy);
+                 64 - kShortMemCopy);
   }
+#endif
 }
 
 void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
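
For reference, below is a minimal standalone sketch of the two-step 32-byte copy
pattern the new AVX branch relies on. It is illustrative only and not part of the
patch: it assumes an AVX-capable x86-64 target (e.g. built with -mavx), and the
function name CopyAtMost64 is hypothetical and does not appear in snappy.
MemCopy64 above additionally carries the overlap asserts and the
SNAPPY_PREDICT_FALSE branch hint shown in the diff.

// Illustrative sketch (hypothetical helper, not part of the patch). Assumes
// an AVX-capable x86-64 build; compile with e.g. -mavx.
#include <immintrin.h>

#include <cassert>
#include <cstddef>

void CopyAtMost64(char* dst, const char* src, size_t size) {
  assert(size <= 64);
  // Unconditionally copy the first 32 bytes with an unaligned 256-bit
  // load/store pair; _mm256_lddqu_si256/_mm256_storeu_si256 tolerate any
  // alignment.
  __m256i chunk = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src));
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), chunk);
  if (size > 32) {
    // Only touch the second 32-byte half when needed. As in the patched
    // MemCopy64, this may copy up to 64 bytes even when size < 64, so the
    // caller must guarantee that writing past size is safe.
    chunk = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src) + 1);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 1, chunk);
  }
}

Copying a fixed 32 bytes and branching only for the long case keeps the common
path free of data-dependent work, which is the rationale the patch's "nearly all
copies are short" comment points to.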