From 3c77e0145993725d79ede2f4c1fe4962860b6f12 Mon Sep 17 00:00:00 2001
From: Snappy Team
Date: Wed, 12 Feb 2020 18:04:58 +0000
Subject: [PATCH] 1) Make the output pointer a local variable such that it
 doesn't need a load and a store on its loop-carried dependency chain.

2) Reduce the input pointer's loop-carried dependency chain from 7 cycles to 4
   cycles by using pre-loading. This is a very subtle point.
3) Just brutally copy 64 bytes, which removes a difficult-to-predict branch
   from the innermost loop. There is enough bandwidth to do so in the
   intrinsic cycles of the loop.
4) Implement limit pointers that include the slop region. This removes
   unnecessary instructions from the hot path.
5) It seems the removal of the difficult-to-predict branch has removed the
   code's sensitivity to alignment, so remove the asm nops.

PiperOrigin-RevId: 294692928
---
 snappy.cc | 278 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 166 insertions(+), 112 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index 73d987e..5bca64f 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -76,6 +76,9 @@
 namespace snappy {
 
+// The amount of slop bytes writers are using for unconditional copies.
+constexpr int kSlopBytes = 64;
+
 using internal::char_table;
 using internal::COPY_1_BYTE_OFFSET;
 using internal::COPY_2_BYTE_OFFSET;
@@ -688,12 +691,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   // Called before decompression
 //   void SetExpectedLength(size_t length);
 //
+//   // For performance, a writer may choose to donate the cursor variable to
+//   // the decompression function. The decompression function will inject it
+//   // into all its function calls to the writer. Keeping the important output
+//   // cursor as a function-local stack variable allows the compiler to keep
+//   // it in a register, which greatly aids performance by avoiding loads and
+//   // stores of this variable in the fast path loop iterations.
+//   T GetOutputPtr() const;
+//
+//   // At the end of decompression the loop donates the ownership of the
+//   // cursor variable back to the writer by calling this function.
+//   void SetOutputPtr(T op);
+//
 //   // Called after decompression
 //   bool CheckLength() const;
 //
 //   // Called repeatedly during decompression
-//   bool Append(const char* ip, size_t length);
-//   bool AppendFromSelf(uint32 offset, size_t length);
+//   // Each function gets a pointer to the op (output pointer) that the writer
+//   // can use and update. Note it is important that these functions get fully
+//   // inlined so that no actual address of the local variable needs to be
+//   // taken.
+//   bool Append(const char* ip, size_t length, T* op);
+//   bool AppendFromSelf(uint32 offset, size_t length, T* op);
 //
 //   // The rules for how TryFastAppend differs from Append are somewhat
 //   // convoluted:
@@ -715,7 +734,7 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   // as it is unlikely that one would implement a fast path accepting
 //   // this much data.
 //   //
-//   bool TryFastAppend(const char* ip, size_t available, size_t length);
+//   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);
 // };
 
 static inline uint32 ExtractLowBytes(uint32 v, int n) {
@@ -747,6 +766,9 @@ class SnappyDecompressor {
   Source* reader_;         // Underlying source of bytes to decompress
   const char* ip_;         // Points to next buffered byte
   const char* ip_limit_;   // Points just past buffered bytes
+  // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaximumTagLength from
+  // the buffer.
+  const char* ip_limit_min_maxtaglen_;
   uint32 peeked_;                    // Bytes peeked from reader (need to skip)
   bool eof_;                         // Hit end of input without an error?
   char scratch_[kMaximumTagLength];  // See RefillTag().
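
To make the donated-cursor contract described in the Writer interface comment above concrete, here is a minimal, self-contained sketch. ToyWriter and CopyAll are illustrative names only and are not part of snappy or of this patch; they just show the GetOutputPtr()/SetOutputPtr() handshake and why the op argument is passed by pointer to fully inlined calls.

    #include <cstddef>
    #include <cstring>

    // Hypothetical flat-buffer writer following the donated-cursor contract.
    class ToyWriter {
     public:
      ToyWriter(char* dst, size_t cap) : base_(dst), op_(dst), limit_(dst + cap) {}

      // The decompression loop borrows the cursor once, at entry...
      char* GetOutputPtr() { return op_; }
      // ...and hands it back exactly once, at exit.
      void SetOutputPtr(char* op) { op_ = op; }

      // The cursor travels by pointer so an inlined call advances the caller's
      // register copy instead of loading and storing a member every iteration.
      bool Append(const char* ip, size_t len, char** op_p) {
        char* op = *op_p;
        if (static_cast<size_t>(limit_ - op) < len) return false;
        memcpy(op, ip, len);
        *op_p = op + len;
        return true;
      }

      size_t Produced() const { return op_ - base_; }

     private:
      char* base_;
      char* op_;
      char* limit_;
    };

    // Shape of a decode loop under this contract: the writer's member cursor is
    // touched only at the two ends of the loop.
    bool CopyAll(const char* in, size_t n, ToyWriter* writer) {
      char* op = writer->GetOutputPtr();     // donate
      bool ok = writer->Append(in, n, &op);  // hot loop would live here
      writer->SetOutputPtr(op);              // give it back
      return ok;
    }
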
@@ -758,6 +780,11 @@ class SnappyDecompressor {
   // Returns true on success, false on error or end of input.
   bool RefillTag();
 
+  void ResetLimit(const char* ip) {
+    ip_limit_min_maxtaglen_ =
+        ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);
+  }
+
  public:
   explicit SnappyDecompressor(Source* reader)
       : reader_(reader),
@@ -810,37 +837,29 @@ class SnappyDecompressor {
   __attribute__((aligned(32)))
 #endif
   void DecompressAllTags(Writer* writer) {
-    // In x86, pad the function body to start 16 bytes later. This function has
-    // a couple of hotspots that are highly sensitive to alignment: we have
-    // observed regressions by more than 20% in some metrics just by moving the
-    // exact same code to a different position in the benchmark binary.
-    //
-    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
-    // the "lucky" case consistently. Unfortunately, this is a very brittle
-    // workaround, and future differences in code generation may reintroduce
-    // this regression. If you experience a big, difficult to explain, benchmark
-    // performance regression here, first try removing this hack.
-#if defined(__GNUC__) && defined(__x86_64__)
-    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
-    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
-    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
-#endif
-
     const char* ip = ip_;
+    ResetLimit(ip);
+    auto op = writer->GetOutputPtr();
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the expression based on the local
     // context, which overall increases speed.
-    #define MAYBE_REFILL() \
-    if (ip_limit_ - ip < kMaximumTagLength) { \
-      ip_ = ip; \
-      if (!RefillTag()) return; \
-      ip = ip_; \
-    }
+#define MAYBE_REFILL()                                       \
+  if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) { \
+    ip_ = ip;                                                \
+    if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit;       \
+    ip = ip_;                                                \
+    ResetLimit(ip);                                          \
+  }                                                          \
+  preload = static_cast<uint8>(*ip)
 
+    // At the start of the for loop below the least significant byte of preload
+    // contains the tag.
+    uint32 preload;
     MAYBE_REFILL();
     for ( ;; ) {
-      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+      const uint8 c = static_cast<uint8>(preload);
+      ip++;
       // Ratio of iterations that have LITERAL vs non-LITERAL for different
       // inputs.
       //
@@ -856,12 +875,13 @@
       // bin 24% 76%
       if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
         size_t literal_length = (c >> 2) + 1u;
-        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) {
           assert(literal_length < 61);
           ip += literal_length;
           // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
           // will not return true unless there's already at least five spare
           // bytes in addition to the literal.
+          preload = static_cast<uint8>(*ip);
           continue;
         }
         if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
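
The ip_limit_min_maxtaglen_ bookkeeping introduced above turns the per-tag refill test `ip_limit_ - ip < kMaximumTagLength` into a single compare against a pointer that is only recomputed on refill. The following standalone sketch shows the equivalence of the two forms; Cursor and kMargin (standing in for kMaximumTagLength) are illustrative assumptions, not snappy code.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    constexpr ptrdiff_t kMargin = 5;  // stand-in for kMaximumTagLength

    struct Cursor {
      const char* ip;
      const char* ip_limit;
      const char* ip_limit_min_margin;  // ip_limit with the margin folded in

      void ResetLimit() {
        // Clamp so the precomputed limit never falls before the position at
        // which it was computed, even for buffers shorter than the margin.
        ip_limit_min_margin = ip_limit - std::min(ip_limit - ip, kMargin - 1);
      }

      // Old form: a subtraction and a comparison on every tag.
      bool NeedsRefillSlow() const { return ip_limit - ip < kMargin; }
      // New form: one comparison against a loop-invariant pointer.
      bool NeedsRefillFast() const { return ip >= ip_limit_min_margin; }
    };

    int main() {
      char buf[64];
      Cursor c{buf, buf + sizeof(buf), nullptr};
      c.ResetLimit();
      // Both formulations agree at every position of the buffer.
      for (; c.ip <= c.ip_limit; ++c.ip) {
        assert(c.NeedsRefillSlow() == c.NeedsRefillFast());
      }
      return 0;
    }
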
@@ -875,62 +895,51 @@ class SnappyDecompressor {
 
         size_t avail = ip_limit_ - ip;
         while (avail < literal_length) {
-          if (!writer->Append(ip, avail)) return;
+          if (!writer->Append(ip, avail, &op)) goto exit;
           literal_length -= avail;
           reader_->Skip(peeked_);
           size_t n;
           ip = reader_->Peek(&n);
           avail = n;
           peeked_ = avail;
-          if (avail == 0) return;  // Premature end of input
+          if (avail == 0) goto exit;
           ip_limit_ = ip + avail;
+          ResetLimit(ip);
         }
-        if (!writer->Append(ip, literal_length)) {
-          return;
-        }
+        if (!writer->Append(ip, literal_length, &op)) goto exit;
         ip += literal_length;
         MAYBE_REFILL();
       } else {
 
-        // If we ignore branch prediction misses, then the throughput of the
-        // loop over tags is limited by the data dependency chain carried by
-        // ip. The amount ip advances per iteration depends on the value c.
-        // For COPY_1 it's 2 bytes, for COPY_2 it's 3 bytes and for COPY_4 it's
-        // 5 bytes. We could be branch free and use information from the entry
-        // in char_table to advance ip, but that would lead to the following
-        // critical data dependency chain
-        //    ip -> c = *ip++ -> entry = char_table[c] ->
-        //    advance = entry >> 11 -> ip += advance
-        // which is 5 + 5 + 1 + 1 = 12 cycles of latency. Meaning the processor
-        // will never be able to execute the loop faster then 12 cycles per
-        // iteration. It's better to speculate it's not a COPY_4, which is an
-        // uncommon tag, in which case we get the data dependency chain
-        //    ip -> c = *ip++ -> c &= 3 -> ip += c
-        // which is 5 + 1 + 1 = 7 cycles of latency.
         if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) {
           const size_t copy_offset = LittleEndian::Load32(ip);
           const size_t length = (c >> 2) + 1;
           ip += 4;
-          if (!writer->AppendFromSelf(copy_offset, length)) return;
+
+          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
         } else {
-          const size_t entry = char_table[c];
-          const size_t trailer =
-              ExtractLowBytes(LittleEndian::Load32(ip), c & 3);
-          const size_t length = entry & 0xff;
-          ip += (c & 3);
+          const uint32 entry = char_table[c];
+          preload = LittleEndian::Load32(ip);
+          const uint32 trailer = ExtractLowBytes(preload, c & 3);
+          const uint32 length = entry & 0xff;
 
           // copy_offset/256 is encoded in bits 8..10. By just fetching
           // those bits, we get copy_offset (since the bit-field starts at
           // bit 8).
-          const size_t copy_offset = entry & 0x700;
-          if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
-            return;
-          }
+          const uint32 copy_offset = (entry & 0x700) + trailer;
+          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
+
+          ip += (c & 3);
+          // By using the result of the previous load we reduce the critical
+          // dependency chain of ip to 4 cycles.
+          preload >>= (c & 3) * 8;
+          if (ip < ip_limit_min_maxtaglen_) continue;
         }
         MAYBE_REFILL();
       }
     }
 
-    #undef MAYBE_REFILL
+  exit:
+    writer->SetOutputPtr(op);
   }
 };
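
The preload handling above is what shrinks the loop-carried dependency chain on ip: the 32-bit word loaded for the copy trailer also contains the bytes after the tag, so shifting it right by (c & 3) * 8 exposes the next tag byte without another load that depends on the freshly advanced ip. A standalone sketch of that arithmetic follows; Load32LE and the buffer contents are illustrative, and a little-endian host is assumed, mirroring LittleEndian::Load32.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Little-endian 32-bit load (stand-in for LittleEndian::Load32).
    static uint32_t Load32LE(const char* p) {
      uint32_t v;
      memcpy(&v, p, sizeof(v));
      return v;  // assumes a little-endian host for this illustration
    }

    int main() {
      // Pretend ip points just past a copy tag that is followed by
      // `advance` = (c & 3) extra tag bytes.
      const char buf[] = {'\x12', '\x34', '\x56', '\x78', '\x00'};
      const char* ip = buf;

      for (int advance = 1; advance <= 3; ++advance) {
        uint32_t preload = Load32LE(ip);  // one load serves two purposes:
        uint32_t trailer = preload & ((1u << (8 * advance)) - 1);  // offset bits
        uint32_t rest = preload >> (advance * 8);  // bytes after the tag

        // The low byte of `rest` is exactly the byte the next iteration would
        // otherwise have to fetch through the just-advanced ip.
        assert((rest & 0xff) == static_cast<uint8_t>(ip[advance]));
        (void)trailer;
      }
      return 0;
    }
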
@@ -1151,7 +1160,7 @@ class SnappyIOVecWriter {
     return total_written_ == output_limit_;
   }
 
-  inline bool Append(const char* ip, size_t len) {
+  inline bool Append(const char* ip, size_t len, char**) {
     if (total_written_ + len > output_limit_) {
       return false;
     }
@@ -1159,6 +1168,9 @@
     return AppendNoCheck(ip, len);
   }
 
+  char* GetOutputPtr() { return nullptr; }
+  void SetOutputPtr(char* op) {}
+
   inline bool AppendNoCheck(const char* ip, size_t len) {
     while (len > 0) {
       if (curr_iov_remaining_ == 0) {
@@ -1183,7 +1195,8 @@
     return true;
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
+                            char**) {
     const size_t space_left = output_limit_ - total_written_;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
         curr_iov_remaining_ >= 16) {
@@ -1198,7 +1211,7 @@
     return false;
   }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
+  inline bool AppendFromSelf(size_t offset, size_t len, char**) {
     // See SnappyArrayWriter::AppendFromSelf for an explanation of
     // the "offset - 1u" trick.
     if (offset - 1u >= total_written_) {
@@ -1296,59 +1309,68 @@ class SnappyArrayWriter {
   char* base_;
   char* op_;
   char* op_limit_;
+  // If op < op_limit_min_slop_ then it's safe to unconditionally write
+  // kSlopBytes starting at op.
+  char* op_limit_min_slop_;
 
  public:
   inline explicit SnappyArrayWriter(char* dst)
       : base_(dst),
        op_(dst),
-        op_limit_(dst) {
-  }
+        op_limit_(dst),
+        op_limit_min_slop_(dst) {}  // Safe default, see invariant.
 
   inline void SetExpectedLength(size_t len) {
     op_limit_ = op_ + len;
+    // Prevent pointer from being past the buffer.
+    op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len);
   }
 
   inline bool CheckLength() const {
     return op_ == op_limit_;
   }
 
-  inline bool Append(const char* ip, size_t len) {
-    char* op = op_;
+  char* GetOutputPtr() { return op_; }
+  void SetOutputPtr(char* op) { op_ = op; }
+
+  inline bool Append(const char* ip, size_t len, char** op_p) {
+    char* op = *op_p;
     const size_t space_left = op_limit_ - op;
-    if (space_left < len) {
-      return false;
-    }
+    if (space_left < len) return false;
     memcpy(op, ip, len);
-    op_ = op + len;
+    *op_p = op + len;
     return true;
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
-    char* op = op_;
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
+                            char** op_p) {
+    char* op = *op_p;
    const size_t space_left = op_limit_ - op;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
       UnalignedCopy128(ip, op);
-      op_ = op + len;
+      *op_p = op + len;
       return true;
     } else {
       return false;
     }
   }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* const op_end = op_ + len;
+  ABSL_ATTRIBUTE_ALWAYS_INLINE
+  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    char* const op = *op_p;
+    char* const op_end = op + len;
     // Check if we try to append from before the start of the buffer.
-    // Normally this would just be a check for "produced < offset",
-    // but "produced <= offset - 1u" is equivalent for every case
-    // except the one where offset==0, where the right side will wrap around
-    // to a very big number. This is convenient, as offset==0 is another
-    // invalid case that we also want to catch, so that we do not go
-    // into an infinite loop.
-    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
-    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
-
+    if (SNAPPY_PREDICT_FALSE(op - base_ < offset)) return false;
+    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
+                             op >= op_limit_min_slop_ || offset < len)) {
+      if (op_end > op_limit_ || offset == 0) return false;
+      *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
+      return true;
+    }
+    std::memmove(op, op - offset, kSlopBytes);
+    *op_p = op_end;
     return true;
   }
 
   inline size_t Produced() const {
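
SnappyArrayWriter::AppendFromSelf above folds the rare conditions (offset of zero, overlapping copies with offset < len, copies landing near the end of the buffer) into one predicted-false branch and then performs a fixed 64-byte memmove, relying on the kSlopBytes headroom below op_limit_min_slop_ to make the over-copy harmless. The sketch below shows the same idea against a plain buffer; ToyOutput and kSlop are illustrative, and snappy's IncrementalCopy slow path is replaced here by a simple byte-wise copy.

    #include <algorithm>
    #include <cstddef>
    #include <cstring>

    constexpr size_t kSlop = 64;  // stand-in for kSlopBytes

    struct ToyOutput {
      char* base;
      char* op;
      char* limit;           // logical end of the expected output
      char* limit_min_slop;  // above this, fall back to exact bounds checks

      // `storage` must provide at least len + kSlop bytes so the unconditional
      // 64-byte copy below can never escape the allocation.
      ToyOutput(char* storage, size_t len)
          : base(storage), op(storage), limit(storage + len),
            limit_min_slop(limit - std::min<size_t>(kSlop - 1, len)) {}

      bool AppendFromSelf(size_t offset, size_t len) {
        if (offset == 0 || static_cast<size_t>(op - base) < offset) return false;
        if (len > kSlop || op >= limit_min_slop || offset < len) {
          // Rare path: exact bounds check plus a byte-wise copy, which also
          // handles the overlapping (offset < len) case the format allows.
          if (op + len > limit) return false;
          const char* src = op - offset;
          for (size_t i = 0; i < len; ++i) op[i] = src[i];
          op += len;
          return true;
        }
        // Hot path: one fixed-size copy, no branch that depends on len.
        std::memmove(op, op - offset, kSlop);
        op += len;  // only len bytes are logically produced; the rest is slop
        return true;
      }
    };

    int main() {
      char storage[256 + kSlop] = {'a', 'b', 'c'};
      ToyOutput out(storage, 256);
      out.op += 3;  // pretend the literal "abc" has already been appended
      // Back-reference to the last three bytes: takes the 64-byte fast path.
      return out.AppendFromSelf(/*offset=*/3, /*len=*/3) ? 0 : 1;
    }
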
@@ -1393,22 +1415,25 @@ class SnappyDecompressionValidator {
   inline void SetExpectedLength(size_t len) {
     expected_ = len;
   }
+  size_t GetOutputPtr() { return produced_; }
+  void SetOutputPtr(size_t op) { produced_ = op; }
   inline bool CheckLength() const {
     return expected_ == produced_;
   }
-  inline bool Append(const char* ip, size_t len) {
-    produced_ += len;
-    return produced_ <= expected_;
+  inline bool Append(const char* ip, size_t len, size_t* produced) {
+    *produced += len;
+    return *produced <= expected_;
   }
-  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
+                            size_t* produced) {
     return false;
   }
-  inline bool AppendFromSelf(size_t offset, size_t len) {
+  inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) {
     // See SnappyArrayWriter::AppendFromSelf for an explanation of
     // the "offset - 1u" trick.
-    if (produced_ <= offset - 1u) return false;
-    produced_ += len;
-    return produced_ <= expected_;
+    if (*produced <= offset - 1u) return false;
+    *produced += len;
+    return *produced <= expected_;
   }
   inline void Flush() {}
 };
@@ -1472,6 +1497,9 @@ class SnappyScatteredWriter {
   char* op_base_;       // Base of output block
   char* op_ptr_;        // Pointer to next unfilled byte in block
   char* op_limit_;      // Pointer just past block
+  // If op < op_limit_min_slop_ then it's safe to unconditionally write
+  // kSlopBytes starting at op.
+  char* op_limit_min_slop_;
 
   inline size_t Size() const {
     return full_size_ + (op_ptr_ - op_base_);
@@ -1488,6 +1516,8 @@ class SnappyScatteredWriter {
         op_ptr_(NULL),
         op_limit_(NULL) {
   }
+  char* GetOutputPtr() { return op_ptr_; }
+  void SetOutputPtr(char* op) { op_ptr_ = op; }
 
   inline void SetExpectedLength(size_t len) {
     assert(blocks_.empty());
@@ -1503,43 +1533,59 @@ class SnappyScatteredWriter {
     return Size();
   }
 
-  inline bool Append(const char* ip, size_t len) {
-    size_t avail = op_limit_ - op_ptr_;
+  inline bool Append(const char* ip, size_t len, char** op_p) {
+    char* op = *op_p;
+    size_t avail = op_limit_ - op;
     if (len <= avail) {
       // Fast path
-      memcpy(op_ptr_, ip, len);
-      op_ptr_ += len;
+      memcpy(op, ip, len);
+      *op_p = op + len;
       return true;
     } else {
-      return SlowAppend(ip, len);
+      op_ptr_ = op;
+      bool res = SlowAppend(ip, len);
+      *op_p = op_ptr_;
+      return res;
    }
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
-    char* op = op_ptr_;
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
+                            char** op_p) {
+    char* op = *op_p;
     const int space_left = op_limit_ - op;
     if (length <= 16 && available >= 16 + kMaximumTagLength &&
         space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
       UnalignedCopy128(ip, op);
-      op_ptr_ = op + length;
+      *op_p = op + length;
       return true;
     } else {
       return false;
     }
   }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* const op_end = op_ptr_ + len;
-    // See SnappyArrayWriter::AppendFromSelf for an explanation of
-    // the "offset - 1u" trick.
-    if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
-                            op_end <= op_limit_)) {
-      // Fast path: src and dst in current block.
-      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    char* op = *op_p;
+    // Check if we try to append from before the start of the buffer.
+    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
+                             op - op_base_ < offset ||
+                             op >= op_limit_min_slop_ || offset < len)) {
+      if (offset == 0) return false;
+      char* const op_end = op + len;
+      if (SNAPPY_PREDICT_FALSE(op - op_base_ < offset || op_end > op_limit_)) {
+        op_ptr_ = op;
+        bool res = SlowAppendFromSelf(offset, len);
+        *op_p = op_ptr_;
+        return res;
+      }
+      *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    return SlowAppendFromSelf(offset, len);
+    // Fast path
+    char* const op_end = op + len;
+    std::memmove(op, op - offset, kSlopBytes);
+    *op_p = op_end;
+    return true;
   }
 
   // Called at the end of the decompress. We ask the allocator
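
Note that the donated cursor does not have to be a pointer: SnappyDecompressionValidator above hands out its produced_ byte count, and the loop's `auto op = writer->GetOutputPtr()` simply follows the writer's choice of type. The sketch below shows a loop templated on the writer picking up whichever cursor type is donated; ToyValidator and DrainInto are illustrative names, not snappy code.

    #include <cstddef>

    // A counting writer: the "cursor" it donates is a plain byte count.
    class ToyValidator {
     public:
      explicit ToyValidator(size_t expected) : expected_(expected), produced_(0) {}

      size_t GetOutputPtr() const { return produced_; }
      void SetOutputPtr(size_t op) { produced_ = op; }

      bool Append(const char* /*ip*/, size_t len, size_t* produced) {
        *produced += len;  // nothing is written, only counted
        return *produced <= expected_;
      }

      bool CheckLength() const { return produced_ == expected_; }

     private:
      size_t expected_;
      size_t produced_;
    };

    // A loop written against the contract works for any writer, whatever the
    // type of the cursor it hands out.
    template <typename Writer>
    bool DrainInto(const char* in, size_t n, Writer* writer) {
      auto op = writer->GetOutputPtr();
      bool ok = writer->Append(in, n, &op);
      writer->SetOutputPtr(op);
      return ok;
    }

    int main() {
      const char data[5] = {'a', 'b', 'c', 'd', 'e'};
      ToyValidator v(sizeof(data));
      return DrainInto(data, sizeof(data), &v) && v.CheckLength() ? 0 : 1;
    }
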
@@ -1560,15 +1606,15 @@ bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
     ip += avail;
 
     // Bounds check
-    if (full_size_ + len > expected_) {
-      return false;
-    }
+    if (full_size_ + len > expected_) return false;
 
     // Make new block
     size_t bsize = std::min(kBlockSize, expected_ - full_size_);
     op_base_ = allocator_.Allocate(bsize);
     op_ptr_ = op_base_;
     op_limit_ = op_base_ + bsize;
+    op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize);
+
     blocks_.push_back(op_base_);
     avail = bsize;
   }
@@ -1593,12 +1639,20 @@ bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
   // nice if we do not rely on that, since we can get better compression if we
   // allow cross-block copies and thus might want to change the compressor in
   // the future.
+  // TODO: Replace this with a properly optimized path. This is not
+  // triggered right now. But this is so super slow that it would regress
+  // performance unacceptably if triggered.
 
   size_t src = cur - offset;
+  char* op = op_ptr_;
   while (len-- > 0) {
     char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
-    Append(&c, 1);
+    if (!Append(&c, 1, &op)) {
+      op_ptr_ = op;
+      return false;
+    }
     src++;
   }
+  op_ptr_ = op;
   return true;
 }
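
For reference, the cross-block slow path above addresses a global byte position as blocks_[src >> kBlockLog][src & (kBlockSize - 1)]. The sketch below isolates that shift-and-mask addressing for back-references that straddle a block boundary; kLog, kBlock, ReadAt and CopyAcrossBlocks are illustrative and assume power-of-two blocks.

    #include <cstddef>
    #include <vector>

    constexpr size_t kLog = 12;                   // stand-in for kBlockLog
    constexpr size_t kBlock = size_t{1} << kLog;  // stand-in for kBlockSize

    // Split a global position into (block index, offset within block).
    char ReadAt(const std::vector<char*>& blocks, size_t pos) {
      return blocks[pos >> kLog][pos & (kBlock - 1)];
    }

    // A back-reference whose source straddles a block boundary has to be walked
    // a byte at a time, like the slow path above does.
    void CopyAcrossBlocks(const std::vector<char*>& blocks, size_t dst_pos,
                          size_t offset, size_t len, char* dst) {
      size_t src = dst_pos - offset;
      for (size_t i = 0; i < len; ++i) dst[i] = ReadAt(blocks, src + i);
    }

    int main() {
      std::vector<char> b0(kBlock, 'x'), b1(kBlock, 'y');
      std::vector<char*> blocks = {b0.data(), b1.data()};
      char out[8];
      // Length-8 back-reference at distance 8 landing 4 bytes into block 1,
      // so its source spans the boundary between the two blocks.
      CopyAcrossBlocks(blocks, /*dst_pos=*/kBlock + 4, /*offset=*/8, /*len=*/8, out);
      return (out[0] == 'x' && out[7] == 'y') ? 0 : 1;
    }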