From 5ec5d16bd6896a8ca522a25b19d070e75ab9dd42 Mon Sep 17 00:00:00 2001
From: Mathias Stearn
Date: Thu, 18 Jan 2024 15:07:21 +0000
Subject: [PATCH] Perf tuning for gcc + aarch64

---
 snappy-internal.h | 24 ++++++++++----
 snappy.cc         | 79 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/snappy-internal.h b/snappy-internal.h
index 39fbda5..89deba2 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -233,9 +233,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
       uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
       // Ideally this would just be
       //
       // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
@@ -250,6 +248,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
           : "+r"(a2)
           : "r"(a3), "r"(xorval)
           : "cc");
+#elif defined(__aarch64__)
+      asm("cmp %w[xorval], 0\n\t"
+          "csel %x[a2], %[a3], %[a2], eq\n\t"
+          : [a2] "+r" (a2)
+          : [a3] "r" (a3) , [xorval] "r" (xorval)
+          : "cc");
+#else
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
       *data = a2 >> (shift & (3 * 8));
       return std::pair<size_t, bool>(matched_bytes, true);
@@ -276,14 +282,20 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
       uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
       asm("testl %k2, %k2\n\t"
           "cmovzq %1, %0\n\t"
           : "+r"(a2)
           : "r"(a3), "r"(xorval)
           : "cc");
+#elif defined(__aarch64__)
+      asm("cmp %w[xorval], 0\n\t"
+          "csel %x[a2], %[a3], %[a2], eq\n\t"
+          : [a2] "+r" (a2)
+          : [a3] "r" (a3) , [xorval] "r" (xorval)
+          : "cc");
+#else
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
       *data = a2 >> (shift & (3 * 8));
       matched += matched_bytes;
diff --git a/snappy.cc b/snappy.cc
index 6473123..d07fcb6 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -100,6 +100,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__ // Implies neon support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
@@ -1060,13 +1111,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const char*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<64 - kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const char*>(src) + kShortMemCopy);
   }
 #endif
 }
@@ -1102,14 +1158,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += (next_tag) + 1;
   return tag_type;
 }
 
@@ -2013,7 +2064,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2265,7 +2316,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
     char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
    *op_p = op_end;
     return true;
   }
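
A minimal standalone sketch (illustrative names only, not part of the patch) of the property every FixedSizeMemMove variant above relies on: all SIZE source bytes are read before any destination byte is written, either by staging through a local temporary as in the generic gcc path or by issuing all NEON loads before any store in the <32>/<64> specializations, so the helper keeps memmove semantics even when source and destination overlap.

  // Hypothetical sketch, not the snappy implementation.
  #include <cassert>
  #include <cstddef>
  #include <cstring>

  template <size_t SIZE>
  inline void FixedSizeMemMoveSketch(void* dest, const void* src) {
    // Read everything first, then write: overlap-safe like memmove.
    unsigned char tmp[SIZE];
    std::memcpy(tmp, src, SIZE);
    std::memcpy(dest, tmp, SIZE);
  }

  int main() {
    // Overlapping move: shift 32 bytes forward by 8 within one buffer.
    unsigned char buf[64];
    for (int i = 0; i < 64; ++i) buf[i] = static_cast<unsigned char>(i);
    FixedSizeMemMoveSketch<32>(buf + 8, buf);
    for (int i = 0; i < 32; ++i) assert(buf[8 + i] == i);
    return 0;
  }

This matters for the decoder call sites changed above: FixedSizeMemMove<kSlopBytes>(op, op - offset) copies from op - offset to op, and those ranges overlap whenever offset < kSlopBytes.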