From 7b82423c597bfed223dff27f880fe91e169375b4 Mon Sep 17 00:00:00 2001 From: Snappy Team Date: Mon, 23 Jan 2023 09:50:52 -0800 Subject: [PATCH] The output buffer in DecompressBranchless is never read from and the source buffers are never written. This allows us to defer any writes to the output buffer for an arbitrary amount of time as long as the writes all occur in the proper order. When a MemCopy64 would have normally occurred, we save away the source address and length. Once we reach the location of the next write to the output buffer, we first perform the deferred copy. This gives time for the source address calculation and length to finish before the deferred copy. This change gives 1.84% on CLX and 0.97% on Milan. PiperOrigin-RevId: 504012310 --- snappy.cc | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/snappy.cc b/snappy.cc index 250e543..d414718 100644 --- a/snappy.cc +++ b/snappy.cc @@ -29,7 +29,6 @@ #include "snappy-internal.h" #include "snappy-sinksource.h" #include "snappy.h" - #if !defined(SNAPPY_HAVE_BMI2) // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2 // specifically, but it does define __AVX2__ when AVX2 support is available. 
@@ -1085,6 +1084,18 @@ void MemCopy64(ptrdiff_t dst, const void* src, size_t size) { (void)size; } +void ClearDeferred(const void** deferred_src, size_t* deferred_length, + uint8_t* safe_source) { + *deferred_src = safe_source; + *deferred_length = 0; +} + +void DeferMemCopy(const void** deferred_src, size_t* deferred_length, + const void* src, size_t length) { + *deferred_src = src; + *deferred_length = length; +} + SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) { const uint8_t*& ip = *ip_p; @@ -1189,6 +1200,12 @@ template std::pair DecompressBranchless( const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base, ptrdiff_t op_limit_min_slop) { + // If deferred_src is invalid point it here. + uint8_t safe_source[64]; + const void* deferred_src; + size_t deferred_length; + ClearDeferred(&deferred_src, &deferred_length, safe_source); + // We unroll the inner loop twice so we need twice the spare room. op_limit_min_slop -= kSlopBytes; if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) { @@ -1211,7 +1228,7 @@ std::pair DecompressBranchless( // twice reduces the amount of instructions checking limits and also // leads to reduced mov's. - SNAPPY_PREFETCH(ip+128); + SNAPPY_PREFETCH(ip + 128); for (int i = 0; i < 2; i++) { const uint8_t* old_ip = ip; assert(tag == ip[-1]); @@ -1238,23 +1255,29 @@ std::pair DecompressBranchless( } // Only copy-1 or copy-2 tags can get here. assert(tag_type == 1 || tag_type == 2); - std::ptrdiff_t delta = op + len_min_offset - len; + std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len; // Guard against copies before the buffer start. + // Execute any deferred MemCopy since we write to dst here. 
+ MemCopy64(op_base + op, deferred_src, deferred_length); + op += deferred_length; + ClearDeferred(&deferred_src, &deferred_length, safe_source); if (SNAPPY_PREDICT_FALSE(delta < 0 || !Copy64BytesWithPatternExtension( op_base + op, len - len_min_offset))) { goto break_loop; } + // We aren't deferring this copy so add length right away. op += len; continue; } - std::ptrdiff_t delta = op + len_min_offset - len; + std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len; if (SNAPPY_PREDICT_FALSE(delta < 0)) { // Due to the spurious offset in literals have this will trigger // at the start of a block when op is still smaller than 256. if (tag_type != 0) goto break_loop; - MemCopy64(op_base + op, old_ip, len); - op += len; + MemCopy64(op_base + op, deferred_src, deferred_length); + op += deferred_length; + DeferMemCopy(&deferred_src, &deferred_length, old_ip, len); continue; } @@ -1262,14 +1285,23 @@ std::pair DecompressBranchless( // we need to copy from ip instead of from the stream. const void* from = tag_type ? reinterpret_cast(op_base + delta) : old_ip; - MemCopy64(op_base + op, from, len); - op += len; + MemCopy64(op_base + op, deferred_src, deferred_length); + op += deferred_length; + DeferMemCopy(&deferred_src, &deferred_length, from, len); } - } while (ip < ip_limit_min_slop && op < op_limit_min_slop); + } while (ip < ip_limit_min_slop && + (op + deferred_length) < op_limit_min_slop); exit: ip--; assert(ip <= ip_limit); } + // If we deferred a copy then we can perform. If we are up to date then we + // might not have enough slop bytes and could run past the end. + if (deferred_length) { + MemCopy64(op_base + op, deferred_src, deferred_length); + op += deferred_length; + ClearDeferred(&deferred_src, &deferred_length, safe_source); + } return {ip, op}; }