In the fast path for decompressing literals, instead of checking

whether there are 16 bytes free and then checking right afterwards (after having subtracted the literal size) that there are now 5 bytes free, just check once for 21 bytes. This skips a compare and a branch; although the branch is easily predictable, it is still a few cycles on a fast path that we would like to get rid of. Benchmarking this yields very confusing results. On open-source GCC 4.8.1 on Haswell, we get exactly the expected results; the benchmarks where we hit the fast path for literals (in particular the two HTML benchmarks and the protobuf benchmark) give very nice speedups, and the others are not really affected. However, the picture from benchmarks with Google's GCC branch on other hardware is much less clear. It seems that we have a weak loss in some cases (and the wins for the “typical” win cases are not nearly as clear), but that it depends on microarchitecture and plain luck in how we run the benchmark. Looking at the generated assembly, it seems that the removal of the if causes other large-scale changes in how the function is laid out, which makes it likely that this is just bad luck. Thus, we should keep this change, even though its exact current impact is unclear; it's a sensible change per se, and dropping it on the basis of microoptimization for a given compiler (or even branch of a compiler) would seem like a bad strategy in the long run.
Microbenchmark results (all in 64-bit, opt mode):

Nehalem, Google GCC:

Benchmark             Base (ns) New (ns)                  Improvement
------------------------------------------------------------------------------
BM_UFlat/0                76747    75591     1.3GB/s  html      +1.5%
BM_UFlat/1               765756   757040   886.3MB/s  urls      +1.2%
BM_UFlat/2                10867    10893    10.9GB/s  jpg       -0.2%
BM_UFlat/3                  124      131     1.4GB/s  jpg_200   -5.3%
BM_UFlat/4                31663    31596     2.8GB/s  pdf       +0.2%
BM_UFlat/5               314162   308176     1.2GB/s  html4     +1.9%
BM_UFlat/6                29668    29746   790.6MB/s  cp        -0.3%
BM_UFlat/7                12958    13386   796.4MB/s  c         -3.2%
BM_UFlat/8                 3596     3682   966.0MB/s  lsp       -2.3%
BM_UFlat/9              1019193  1033493   953.3MB/s  xls       -1.4%
BM_UFlat/10                 239      247   775.3MB/s  xls_200   -3.2%
BM_UFlat/11              236411   240271   606.9MB/s  txt1      -1.6%
BM_UFlat/12              206639   209768   571.2MB/s  txt2      -1.5%
BM_UFlat/13              627803   635722   641.4MB/s  txt3      -1.2%
BM_UFlat/14              845932   857816   538.2MB/s  txt4      -1.4%
BM_UFlat/15              402107   391670     1.2GB/s  bin       +2.7%
BM_UFlat/16                 283      279   683.6MB/s  bin_200   +1.4%
BM_UFlat/17               46070    46815   781.5MB/s  sum       -1.6%
BM_UFlat/18                5053     5163   782.0MB/s  man       -2.1%
BM_UFlat/19               79721    76581     1.4GB/s  pb        +4.1%
BM_UFlat/20              251158   252330   697.5MB/s  gaviota   -0.5%
Sum of all benchmarks   4966150  4980396                        -0.3%

Sandy Bridge, Google GCC:

Benchmark             Base (ns) New (ns)                  Improvement
------------------------------------------------------------------------------
BM_UFlat/0                42850    42182     2.3GB/s  html      +1.6%
BM_UFlat/1               525660   515816     1.3GB/s  urls      +1.9%
BM_UFlat/2                 7173     7283    16.3GB/s  jpg       -1.5%
BM_UFlat/3                   92       91     2.1GB/s  jpg_200   +1.1%
BM_UFlat/4                15147    14872     5.9GB/s  pdf       +1.8%
BM_UFlat/5               199936   192116     2.0GB/s  html4     +4.1%
BM_UFlat/6                12796    12443     1.8GB/s  cp        +2.8%
BM_UFlat/7                 6588     6400     1.6GB/s  c         +2.9%
BM_UFlat/8                 2010     1951     1.8GB/s  lsp       +3.0%
BM_UFlat/9               761124   763049     1.3GB/s  xls       -0.3%
BM_UFlat/10                 186      189  1016.1MB/s  xls_200   -1.6%
BM_UFlat/11              159354   158460   918.6MB/s  txt1      +0.6%
BM_UFlat/12              139732   139950   856.1MB/s  txt2      -0.2%
BM_UFlat/13              429917   425027   961.7MB/s  txt3      +1.2%
BM_UFlat/14              585255   587324   785.8MB/s  txt4      -0.4%
BM_UFlat/15              276186   266173     1.8GB/s  bin       +3.8%
BM_UFlat/16                 205      207   925.5MB/s  bin_200   -1.0%
BM_UFlat/17               24925    24935     1.4GB/s  sum       -0.0%
BM_UFlat/18                2632     2576     1.5GB/s  man       +2.2%
BM_UFlat/19               40546    39108     2.8GB/s  pb        +3.7%
BM_UFlat/20              175803   168209  1048.9MB/s  gaviota   +4.5%
Sum of all benchmarks   3408117  3368361                        +1.2%

Haswell, upstream GCC 4.8.1:

Benchmark             Base (ns) New (ns)                  Improvement
------------------------------------------------------------------------------
BM_UFlat/0                46308    40641     2.3GB/s  html     +13.9%
BM_UFlat/1               513385   514706     1.3GB/s  urls      -0.3%
BM_UFlat/2                 6197     6151    19.2GB/s  jpg       +0.7%
BM_UFlat/3                   61       61     3.0GB/s  jpg_200   +0.0%
BM_UFlat/4                13551    13429     6.5GB/s  pdf       +0.9%
BM_UFlat/5               198317   190243     2.0GB/s  html4     +4.2%
BM_UFlat/6                14768    12560     1.8GB/s  cp       +17.6%
BM_UFlat/7                 6453     6447     1.6GB/s  c         +0.1%
BM_UFlat/8                 1991     1980     1.8GB/s  lsp       +0.6%
BM_UFlat/9               766947   770424     1.2GB/s  xls       -0.5%
BM_UFlat/10                 170      169     1.1GB/s  xls_200   +0.6%
BM_UFlat/11              164350   163554   888.7MB/s  txt1      +0.5%
BM_UFlat/12              145444   143830   832.1MB/s  txt2      +1.1%
BM_UFlat/13              437849   438413   929.2MB/s  txt3      -0.1%
BM_UFlat/14              603587   605309   759.8MB/s  txt4      -0.3%
BM_UFlat/15              249799   248067     1.9GB/s  bin       +0.7%
BM_UFlat/16                 191      188  1011.4MB/s  bin_200   +1.6%
BM_UFlat/17               26064    24778     1.4GB/s  sum       +5.2%
BM_UFlat/18                2620     2601     1.5GB/s  man       +0.7%
BM_UFlat/19               44551    37373     3.0GB/s  pb       +19.2%
BM_UFlat/20              165408   164584     1.0GB/s  gaviota   +0.5%
Sum of all benchmarks   3408011  3385508                        +0.7%

git-svn-id: https://snappy.googlecode.com/svn/trunk@78 03e5f5b5-db94-4691-08a0-1a8bf15f6143
2013-06-30 19:24:03 +00:00 · 2013-06-30 19:24:03 +00:00 · 2f0aaf8631
parent 062bf544a6
commit 2f0aaf8631
2 changed files with 27 additions and 20 deletions
--- a/snappy.cc
+++ b/snappy.cc
@ -82,6 +82,7 @@ enum {
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
 };
+static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

 // Copy "len" bytes from "src" to "op", one byte at a time.  Used for
 // handling COPY operations where the input and output regions may
@ -469,21 +470,26 @@ char* CompressFragment(const char* input,
 //   bool Append(const char* ip, size_t length);
 //   bool AppendFromSelf(uint32 offset, size_t length);
 //
-//   // The difference between TryFastAppend and Append is that TryFastAppend
-//   // is allowed to read up to <available> bytes from the input buffer,
-//   // whereas Append is allowed to read <length>.
+//   // The rules for how TryFastAppend differs from Append are somewhat
+//   // convoluted:
 //   //
-//   // Also, TryFastAppend is allowed to return false, declining the append,
-//   // without it being a fatal error -- just "return false" would be
-//   // a perfectly legal implementation of TryFastAppend. The intention
-//   // is for TryFastAppend to allow a fast path in the common case of
-//   // a small append.
+//   //  - TryFastAppend is allowed to decline (return false) at any
+//   //    time, for any reason -- just "return false" would be
+//   //    a perfectly legal implementation of TryFastAppend.
+//   //    The intention is for TryFastAppend to allow a fast path
+//   //    in the common case of a small append.
+//   //  - TryFastAppend is allowed to read up to <available> bytes
+//   //    from the input buffer, whereas Append is allowed to read
+//   //    <length>. However, if it returns true, it must leave
+//   //    at least five (kMaximumTagLength) bytes in the input buffer
+//   //    afterwards, so that there is always enough space to read the
+//   //    next tag without checking for a refill.
+//   //  - TryFastAppend must always decline (return false)
+//   //    if <length> is 61 or more, as in this case the literal length is not
+//   //    decoded fully. In practice, this should not be a big problem,
+//   //    as it is unlikely that one would implement a fast path accepting
+//   //    this much data.
 //   //
-//   // NOTE(user): TryFastAppend must always return decline (return false)
-//   // if <length> is 61 or more, as in this case the literal length is not
-//   // decoded fully. In practice, this should not be a big problem,
-//   // as it is unlikely that one would implement a fast path accepting
-//   // this much data.
 //   bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };

@ -652,7 +658,7 @@ class SnappyDecompressor {
  const char*   ip_limit_;       // Points just past buffered bytes
  uint32        peeked_;         // Bytes peeked from reader (need to skip)
  bool          eof_;            // Hit end of input without an error?
-  char          scratch_[5];     // Temporary buffer for PeekFast() boundaries
+  char          scratch_[kMaximumTagLength];  // See RefillTag().

  // Ensure that all of the tag metadata for the next tag is available
  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
@ -715,7 +721,7 @@ class SnappyDecompressor {
    // scope to optimize the <ip_limit_ - ip> expression based on the local
    // context, which overall increases speed.
    #define MAYBE_REFILL() \
-        if (ip_limit_ - ip < 5) { \
+        if (ip_limit_ - ip < kMaximumTagLength) { \
          ip_ = ip; \
          if (!RefillTag()) return; \
          ip = ip_; \
@ -730,7 +736,9 @@ class SnappyDecompressor {
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
          assert(literal_length < 61);
          ip += literal_length;
-          MAYBE_REFILL();
+          // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+          // will not return true unless there's already at least five spare
+          // bytes in addition to the literal.
          continue;
        }
        if (PREDICT_FALSE(literal_length >= 61)) {
@ -823,7 +831,7 @@ bool SnappyDecompressor::RefillTag() {
    assert(nbuf == needed);
    ip_ = scratch_;
    ip_limit_ = scratch_ + needed;
-  } else if (nbuf < 5) {
+  } else if (nbuf < kMaximumTagLength) {
    // Have enough bytes, but move into scratch_ so that we do not
    // read past end of input
    memmove(scratch_, ip, nbuf);
@ -1025,7 +1033,7 @@ class SnappyIOVecWriter {

  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
    const size_t space_left = output_limit_ - total_written_;
-    if (len <= 16 && available >= 16 && space_left >= 16 &&
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
@ -1162,7 +1170,7 @@ class SnappyArrayWriter {
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;
-    if (len <= 16 && available >= 16 && space_left >= 16) {
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      UnalignedCopy64(ip, op);
      UnalignedCopy64(ip + 8, op + 8);
--- a/snappy.h
+++ b/snappy.h
@ -178,7 +178,6 @@ namespace snappy {

  static const int kMaxHashTableBits = 14;
  static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
-
 }  // end namespace snappy