The output buffer in DecompressBranchless is never read from and the source buffers are never written. This allows us to defer any writes to the output buffer for an arbitrary amount of time as long as the writes all occur in the proper order. When a MemCopy64 would have normally occurred we save away the source address and length. Once we reach the location of the next write to the output buffer first perform the deferred copy. This gives time for the source address calculation and length to finish before the deferred copy.

This change gives 1.84% on CLX and 0.97% Milan. PiperOrigin-RevId: 504012310
2023-01-23 09:50:52 -08:00 · 2023-01-23 09:50:52 -08:00 · 7b82423c59
parent 30326e5b8c
commit 7b82423c59
1 changed files with 41 additions and 9 deletions
--- a/snappy.cc
+++ b/snappy.cc
@ -29,7 +29,6 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 #include "snappy.h"
-
 #if !defined(SNAPPY_HAVE_BMI2)
 // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
 // specifically, but it does define __AVX2__ when AVX2 support is available.
@ -1085,6 +1084,18 @@ void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
  (void)size;
 }

+void ClearDeferred(const void** deferred_src, size_t* deferred_length,
+                   uint8_t* safe_source) {
+  *deferred_src = safe_source;
+  *deferred_length = 0;
+}
+
+void DeferMemCopy(const void** deferred_src, size_t* deferred_length,
+                  const void* src, size_t length) {
+  *deferred_src = src;
+  *deferred_length = length;
+}
+
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
 inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
@ -1189,6 +1200,12 @@ template <typename T>
 std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
    const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
    ptrdiff_t op_limit_min_slop) {
+  // If deferred_src is invalid point it here.
+  uint8_t safe_source[64];
+  const void* deferred_src;
+  size_t deferred_length;
+  ClearDeferred(&deferred_src, &deferred_length, safe_source);
+
  // We unroll the inner loop twice so we need twice the spare room.
  op_limit_min_slop -= kSlopBytes;
  if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
@ -1211,7 +1228,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
      // twice reduces the amount of instructions checking limits and also
      // leads to reduced mov's.

-      SNAPPY_PREFETCH(ip+128);
+      SNAPPY_PREFETCH(ip + 128);
      for (int i = 0; i < 2; i++) {
        const uint8_t* old_ip = ip;
        assert(tag == ip[-1]);
@ -1238,23 +1255,29 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
          }
          // Only copy-1 or copy-2 tags can get here.
          assert(tag_type == 1 || tag_type == 2);
-          std::ptrdiff_t delta = op + len_min_offset - len;
+          std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
          // Guard against copies before the buffer start.
+          // Execute any deferred MemCopy since we write to dst here.
+          MemCopy64(op_base + op, deferred_src, deferred_length);
+          op += deferred_length;
+          ClearDeferred(&deferred_src, &deferred_length, safe_source);
          if (SNAPPY_PREDICT_FALSE(delta < 0 ||
                                  !Copy64BytesWithPatternExtension(
                                      op_base + op, len - len_min_offset))) {
            goto break_loop;
          }
+          // We aren't deferring this copy so add length right away.
          op += len;
          continue;
        }
-        std::ptrdiff_t delta = op + len_min_offset - len;
+        std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
        if (SNAPPY_PREDICT_FALSE(delta < 0)) {
          // Due to the spurious offset in literals have this will trigger
          // at the start of a block when op is still smaller than 256.
          if (tag_type != 0) goto break_loop;
-          MemCopy64(op_base + op, old_ip, len);
-          op += len;
+          MemCopy64(op_base + op, deferred_src, deferred_length);
+          op += deferred_length;
+          DeferMemCopy(&deferred_src, &deferred_length, old_ip, len);
          continue;
        }

@ -1262,14 +1285,23 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
        // we need to copy from ip instead of from the stream.
        const void* from =
            tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
-        MemCopy64(op_base + op, from, len);
-        op += len;
+        MemCopy64(op_base + op, deferred_src, deferred_length);
+        op += deferred_length;
+        DeferMemCopy(&deferred_src, &deferred_length, from, len);
      }
-    } while (ip < ip_limit_min_slop && op < op_limit_min_slop);
+    } while (ip < ip_limit_min_slop &&
+             (op + deferred_length) < op_limit_min_slop);
  exit:
    ip--;
    assert(ip <= ip_limit);
  }
+  // If we deferred a copy then we can perform.  If we are up to date then we
+  // might not have enough slop bytes and could run past the end.
+  if (deferred_length) {
+    MemCopy64(op_base + op, deferred_src, deferred_length);
+    op += deferred_length;
+    ClearDeferred(&deferred_src, &deferred_length, safe_source);
+  }
  return {ip, op};
 }