From a9730ed50526b714eff8731066a70006cf816420 Mon Sep 17 00:00:00 2001
From: Shahriar Rouf
Date: Wed, 9 Dec 2020 02:27:22 +0000
Subject: [PATCH] Optimize zippy decompression by making IncrementalCopy faster.

When SSSE3 is available:
- Use PSHUFB (_mm_shuffle_epi8) to handle pattern size 1 to 15 (previously it
  handled size 1 to 7).
- This enables us to do 16 byte copies instead of 8 byte copies because we
  know that the pattern size >= 16.
- Use shuffle-reshuffle strategy to generate the next pattern after loading
  the initial pattern. This enables us to write 4 conditionals (similar to
  when pattern size >= 16) which would allow FDO to layout the code with
  respect to actual probabilities of each length.
- The PSHUFB masks are now generated programmatically at compile-time.

When SSSE3 is unavailable:
- No change.

In both cases:
- assert(op < op_limit) in IncrementalCopy so that we can check
  'op_limit <= buf_limit - 15' instead of 'op_limit <= buf_limit - 16'.
  All existing call sites of IncrementalCopy guarantee this.

'bin' case is notably >20% faster because it has many repeated character
patterns (i.e. pattern_size = 1).

PiperOrigin-RevId: 346454471
---
 snappy.cc          | 284 ++++++++++++++++++++++++++++++++++++---------
 snappy_unittest.cc |  41 +++++++
 2 files changed, 267 insertions(+), 58 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index e5e69ab..7578901 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include

 namespace snappy {
@@ -178,6 +179,16 @@ void UnalignedCopy128(const void* src, void* dst) {
   std::memcpy(dst, tmp, 16);
 }

+template <bool use_16bytes_chunk>
+inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
+  if (use_16bytes_chunk) {
+    UnalignedCopy128(src, dst);
+  } else {
+    UnalignedCopy64(src, dst);
+    UnalignedCopy64(src + 8, dst + 8);
+  }
+}
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
 // for handling COPY operations where the input and output regions may overlap.
 // For example, suppose:
@@ -205,36 +216,164 @@ inline char* IncrementalCopySlow(const char* src, char* op,

 #if SNAPPY_HAVE_SSSE3

-// This is a table of shuffle control masks that can be used as the source
+// Computes the bytes for shuffle control mask (please read comments on
+// 'pattern_generation_masks' as well) for the given index_offset and
+// pattern_size. For example, when the 'offset' is 6, it will generate a
+// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
+// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
+// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
+// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
+// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
+// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
+template <size_t... indexes>
+inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
+    int index_offset, int pattern_size, index_sequence<indexes...>) {
+  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
+}
+
+// Computes the shuffle control mask bytes array for given pattern-sizes and
+// returns an array.
+template <size_t... pattern_sizes_minus_one>
+inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                            sizeof...(pattern_sizes_minus_one)>
+MakePatternMaskBytesTable(int index_offset,
+                          index_sequence<pattern_sizes_minus_one...>) {
+  return {MakePatternMaskBytes(
+      index_offset, pattern_sizes_minus_one + 1,
+      make_index_sequence<sizeof(__m128i)>())...};
+}
+
+// This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) const char pshufb_fill_patterns[7][16] = {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
-    {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
-    {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
-    {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
-    {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
-};
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                                        16> pattern_generation_masks =
+    MakePatternMaskBytesTable(
+        /*index_offset=*/0,
+        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

-// j * (16 / j) for all j from 0 to 7. 0 is not actually used.
-const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
+// Similar to 'pattern_generation_masks', this table is used to "rotate" the
+// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
+// Basically, pattern_reshuffle_masks is a continuation of
+// pattern_generation_masks. It follows that pattern_reshuffle_masks is the
+// same as pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                                        16> pattern_reshuffle_masks =
+    MakePatternMaskBytesTable(
+        /*index_offset=*/16,
+        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline __m128i LoadPattern(const char* src, const size_t pattern_size) {
+  __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+      pattern_generation_masks[pattern_size - 1].data()));
+  // Uninitialized bytes are masked out by the shuffle mask.
+  // TODO: remove annotation and macro defs once MSan is fixed.
+  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
+  return _mm_shuffle_epi8(
+      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask);
+}
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */>
+LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
+  __m128i pattern = LoadPattern(src, pattern_size);
+
+  // This mask will generate the next 16 bytes in-place. Doing so enables us to
+  // write data by at most 4 _mm_storeu_si128.
+  //
+  // For example, suppose pattern is:        abcdefabcdefabcd
+  // Shuffling with this mask will generate: efabcdefabcdefab
+  // Shuffling again will generate:          cdefabcdefabcdef
+  __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+      pattern_reshuffle_masks[pattern_size - 1].data()));
+  return {pattern, reshuffle_mask};
+}

 #endif  // SNAPPY_HAVE_SSSE3

+// Fallback for when we need to copy while extending the pattern, for example
+// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
+//
+// REQUIRES: [dst - offset, dst + 64) is a valid address range.
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
+#if SNAPPY_HAVE_SSSE3
+  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
+    switch (offset) {
+      case 0:
+        return false;
+      case 1: {
+        std::memset(dst, dst[-1], 64);
+        return true;
+      }
+      case 2:
+      case 4:
+      case 8:
+      case 16: {
+        __m128i pattern = LoadPattern(dst - offset, offset);
+        for (int i = 0; i < 4; i++) {
+          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
+        }
+        return true;
+      }
+      default: {
+        auto pattern_and_reshuffle_mask =
+            LoadPatternAndReshuffleMask(dst - offset, offset);
+        __m128i pattern = pattern_and_reshuffle_mask.first;
+        __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+        for (int i = 0; i < 4; i++) {
+          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
+          pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        }
+        return true;
+      }
+    }
+  }
+#else
+  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
+    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
+    // Extend the pattern to the first 16 bytes.
+    for (int i = 0; i < 16; i++) dst[i] = dst[i - offset];
+    // Find a multiple of pattern >= 16.
+    static std::array<uint8_t, 16> pattern_sizes = []() {
+      std::array<uint8_t, 16> res;
+      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
+      return res;
+    }();
+    offset = pattern_sizes[offset];
+    for (int i = 1; i < 4; i++) {
+      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
+    }
+    return true;
+  }
+#endif  // SNAPPY_HAVE_SSSE3
+
+  // Very rare.
+  for (int i = 0; i < 4; i++) {
+    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
+  }
+  return true;
+}
+
 // Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
 inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                              char* const buf_limit) {
+#if SNAPPY_HAVE_SSSE3
+  constexpr int big_pattern_size_lower_bound = 16;
+#else
+  constexpr int big_pattern_size_lower_bound = 8;
+#endif
+
   // Terminology:
   //
   // slop = buf_limit - op
   // pat  = op - src
-  // len  = limit - op
+  // len  = op_limit - op
   assert(src < op);
-  assert(op <= op_limit);
+  assert(op < op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
   assert(op_limit - op <= 64);
@@ -265,11 +404,13 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // input. In general if we always predict len <= 16 it would be an ok
   // prediction.
   //
-  // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
-  // copying 2x 8 bytes at a time.
+  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
+  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
+  // time.

-  // Handle the uncommon case where pattern is less than 8 bytes.
-  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
+  // bytes.
+  if (pattern_size < big_pattern_size_lower_bound) {
 #if SNAPPY_HAVE_SSSE3
     // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
     // to permute the register's contents in-place into a repeating sequence of
@@ -283,24 +424,55 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // The non-SSE fallback implementation suffers from store-forwarding stalls
     // because its loads and stores partly overlap. By expanding the pattern
     // in-place, we avoid the penalty.
-    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
-      const __m128i shuffle_mask = _mm_load_si128(
-          reinterpret_cast<const __m128i*>(pshufb_fill_patterns) +
-          pattern_size - 1);
-      const __m128i pattern = _mm_shuffle_epi8(
-          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
-      // Uninitialized bytes are masked out by the shuffle mask.
-      // TODO: remove annotation and macro defs once MSan is fixed.
-      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
-      pattern_size = pattern_size_table[pattern_size];
-      char* op_end = std::min(op_limit, buf_limit - 15);
-      while (op < op_end) {
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-        op += pattern_size;
+
+    // Typically, the op_limit is the gating factor so try to simplify the loop
+    // based on that.
+    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+      auto pattern_and_reshuffle_mask =
+          LoadPatternAndReshuffleMask(src, pattern_size);
+      __m128i pattern = pattern_and_reshuffle_mask.first;
+      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+
+      // There is at least one, and at most four 16-byte blocks. Writing four
+      // conditionals instead of a loop allows FDO to layout the code with
+      // respect to the actual probabilities of each length.
+      // TODO: Replace with loop with trip count hint.
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+
+      if (op + 16 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
       }
-      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+      if (op + 32 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
+      }
+      if (op + 48 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
+      }
+      return op_limit;
     }
-    return IncrementalCopySlow(src, op, op_limit);
+    char* const op_end = buf_limit - 15;
+    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
+      auto pattern_and_reshuffle_mask =
+          LoadPatternAndReshuffleMask(src, pattern_size);
+      __m128i pattern = pattern_and_reshuffle_mask.first;
+      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+
+      // This code path is relatively cold however so we save code size
+      // by avoiding unrolling and vectorizing.
+      //
+      // TODO: Remove pragma when cold regions don't get
+      // vectorized or unrolled.
+#pragma nounroll
+      do {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        op += 16;
+      } while (SNAPPY_PREDICT_TRUE(op < op_end));
+    }
+    return IncrementalCopySlow(op - pattern_size, op, op_limit);
 #else   // !SNAPPY_HAVE_SSSE3
     // If plenty of buffer space remains, expand the pattern to at least 8
     // bytes. The way the following loop is written, we need 8 bytes of buffer
@@ -320,34 +492,30 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     }
 #endif  // SNAPPY_HAVE_SSSE3
   }
-  assert(pattern_size >= 8);
+  assert(pattern_size >= big_pattern_size_lower_bound);
+  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;

-  // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
-  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
-  // because expanding the pattern to at least 8 bytes guarantees that
-  // op - src >= 8.
+  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
+  // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
+  // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
+  // guarantees that op - src >= 8.
   //
   // Typically, the op_limit is the gating factor so try to simplify the loop
   // based on that.
-  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
     // There is at least one, and at most four 16-byte blocks. Writing four
     // conditionals instead of a loop allows FDO to layout the code with respect
     // to the actual probabilities of each length.
     // TODO: Replace with loop with trip count hint.
-    UnalignedCopy64(src, op);
-    UnalignedCopy64(src + 8, op + 8);
-
+    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
     if (op + 16 < op_limit) {
-      UnalignedCopy64(src + 16, op + 16);
-      UnalignedCopy64(src + 24, op + 24);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
     }
     if (op + 32 < op_limit) {
-      UnalignedCopy64(src + 32, op + 32);
-      UnalignedCopy64(src + 40, op + 40);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
     }
     if (op + 48 < op_limit) {
-      UnalignedCopy64(src + 48, op + 48);
-      UnalignedCopy64(src + 56, op + 56);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
     }
     return op_limit;
   }
@@ -358,12 +526,9 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   //
   // TODO: Remove pragma when when cold regions don't get vectorized
   // or unrolled.
-#ifdef __clang__
-#pragma clang loop unroll(disable)
-#endif
+#pragma nounroll
   for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
-    UnalignedCopy64(src, op);
-    UnalignedCopy64(src + 8, op + 8);
+    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
   }

   if (op >= op_limit) return op_limit;
@@ -894,10 +1059,10 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
       if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) {
         assert(tag_type != 0);
         // offset 0 is an error.
-        if (SNAPPY_PREDICT_FALSE(offset == 0)) break;
-        op = IncrementalCopy(op_base + delta, op_base + op, op_base + op + len,
-                             op_base + op_limit) -
-             op_base;
+        if (!Copy64BytesWithPatternExtension(op_base + op, offset)) {
+          break;
+        }
+        op += len;
         continue;
       }

@@ -1089,6 +1254,7 @@ class SnappyDecompressor {
           preload = LittleEndian::Load32(ip);
           const uint32_t trailer = ExtractLowBytes(preload, c & 3);
           const uint32_t length = entry & 0xff;
+          assert(length > 0);

           // copy_offset/256 is encoded in bits 8..10. By just fetching
           // those bits, we get copy_offset (since the bit-field starts at
@@ -1459,6 +1625,7 @@ class SnappyIOVecWriter {
       if (to_copy > len) {
         to_copy = len;
       }
+      assert(to_copy > 0);

       IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
                       curr_iov_output_, curr_iov_output_ + to_copy,
@@ -1552,6 +1719,7 @@ class SnappyArrayWriter {

   SNAPPY_ATTRIBUTE_ALWAYS_INLINE
   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    assert(len > 0);
     char* const op = *op_p;
     assert(op >= base_);
     char* const op_end = op + len;
diff --git a/snappy_unittest.cc b/snappy_unittest.cc
index 91270a5..0b568b2 100644
--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@@ -695,6 +695,41 @@ TEST(Snappy, SimpleTests) {
   Verify("abcaaaaaaa" + std::string(65536, 'b') + std::string("aaaaa") + "abc");
 }

+// Regression test for cr/345340892.
+TEST(Snappy, AppendSelfPatternExtensionEdgeCases) {
+  Verify("abcabcabcabcabcabcab");
+  Verify("abcabcabcabcabcabcab0123456789ABCDEF");
+
+  Verify("abcabcabcabcabcabcabcabcabcabcabcabc");
+  Verify("abcabcabcabcabcabcabcabcabcabcabcabc0123456789ABCDEF");
+}
+
+// Regression test for cr/345340892.
+TEST(Snappy, AppendSelfPatternExtensionEdgeCasesExhaustive) {
+  std::mt19937 rng;
+  std::uniform_int_distribution<int> uniform_byte(0, 255);
+  for (int pattern_size = 1; pattern_size <= 18; ++pattern_size) {
+    for (int length = 1; length <= 64; ++length) {
+      for (int extra_bytes_after_pattern : {0, 1, 15, 16, 128}) {
+        const int size = pattern_size + length + extra_bytes_after_pattern;
+        std::string input;
+        input.resize(size);
+        for (int i = 0; i < pattern_size; ++i) {
+          input[i] = 'a' + i;
+        }
+        for (int i = 0; i < length; ++i) {
+          input[pattern_size + i] = input[i];
+        }
+        for (int i = 0; i < extra_bytes_after_pattern; ++i) {
+          input[pattern_size + length + i] =
+              static_cast<char>(uniform_byte(rng));
+        }
+        Verify(input);
+      }
+    }
+  }
+}
+
 // Verify max blowup (lots of four-byte copies)
 TEST(Snappy, MaxBlowup) {
   std::mt19937 rng;
@@ -1285,6 +1320,12 @@ static struct {
   { "gaviota", "kppkn.gtb", 0 },
 };

+TEST(Snappy, TestBenchmarkFiles) {
+  for (int i = 0; i < ARRAYSIZE(files); ++i) {
+    Verify(ReadTestDataFile(files[i].filename, files[i].size_limit));
+  }
+}
+
 static void BM_UFlat(int iters, int arg) {
   StopBenchmarkTiming();