Improve zippy by 5-10%.

name              [input  ]               old speed               new speed    delta
BM_ZCord/0        [html   ]            1.26GB/s ± 0%           1.35GB/s ± 0%   +7.90%          (p=0.008 n=5+5)
BM_ZCord/1        [urls   ]             535MB/s ± 0%            562MB/s ± 0%   +5.05%          (p=0.008 n=5+5)
BM_ZCord/2        [jpg    ]            10.2GB/s ± 1%           10.2GB/s ± 0%     ~             (p=0.310 n=5+5)
BM_ZCord/3        [jpg_200]             841MB/s ± 1%            846MB/s ± 1%     ~             (p=0.421 n=5+5)
BM_ZCord/4        [pdf    ]            6.77GB/s ± 1%           7.06GB/s ± 1%   +4.28%          (p=0.008 n=5+5)
BM_ZCord/5        [html4  ]            1.00GB/s ± 0%           1.08GB/s ± 0%   +7.94%          (p=0.008 n=5+5)
BM_ZCord/6        [txt1   ]             391MB/s ± 0%            417MB/s ± 0%   +6.71%          (p=0.008 n=5+5)
BM_ZCord/7        [txt2   ]             363MB/s ± 0%            388MB/s ± 0%   +6.73%          (p=0.016 n=5+4)
BM_ZCord/8        [txt3   ]             400MB/s ± 0%            426MB/s ± 0%   +6.55%          (p=0.008 n=5+5)
BM_ZCord/9        [txt4   ]             328MB/s ± 0%            350MB/s ± 0%   +6.66%          (p=0.008 n=5+5)
BM_ZCord/10       [pb     ]            1.67GB/s ± 1%           1.80GB/s ± 0%   +7.52%          (p=0.008 n=5+5)

1) A key bottleneck in the data dependency chain is determining how many bytes matched and loading the data for the next hash value. The load-to-use latency is 5 cycles; in the previous cl/303353110 we removed that load in favor of a "shrd" that realigns the earlier loads. Unfortunately "shrd" with a variable count itself has a latency of 4 cycles; we'd prefer "shrx", which does variable shifts in 1 cycle (see the sketch after this list).
2) Maximally reuse data already computed. The trick above yields 5 bytes of useful data, so when we need to search for a new match we can use them for the first lookup, which starts one byte further along (a sketch of this appears after the diff).
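
A minimal sketch of idea 1 (not the actual patch): it assumes a little-endian host, GCC/Clang builtins (__builtin_ctzll standing in for Bits::FindLSBSetNonZero64), a hypothetical Load64LE helper in place of UNALIGNED_LOAD64, and at least 12 readable bytes at s2.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>

// Assumed little-endian unaligned load, standing in for UNALIGNED_LOAD64.
static inline uint64_t Load64LE(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Given a mismatch somewhere in the first 8 bytes of s1/s2, return how many
// bytes matched together with the bytes following the match in s2, computed
// only from loads that do not depend on s1.
static inline std::pair<size_t, uint64_t> MatchedBytesAndNextData(
    const char* s1, const char* s2) {
  uint64_t a1 = Load64LE(s1);
  uint64_t a2 = Load64LE(s2);
  uint64_t xorval = a1 ^ a2;
  assert(xorval != 0);                    // caller guarantees a mismatch
  int shift = __builtin_ctzll(xorval);    // position of first differing bit
  size_t matched_bytes = shift >> 3;      // 0..7
  uint64_t a3 = Load64LE(s2 + 4);
  // matched_bytes < 4 is exactly static_cast<uint32_t>(xorval) != 0, so this
  // select depends only on xorval and can be lowered to a conditional move.
  uint64_t base = (static_cast<uint32_t>(xorval) == 0) ? a3 : a2;
  uint64_t data = base >> (shift & (3 * 8));
  // Only the low 4 bytes are guaranteed to equal the straightforward load.
  assert(static_cast<uint32_t>(data) ==
         static_cast<uint32_t>(Load64LE(s2 + matched_bytes)));
  return {matched_bytes, data};
}

The straightforward *data = UNALIGNED_LOAD64(s2 + matched_bytes) puts a 5-cycle load behind matched_bytes, which itself depends on the candidate load; in the formulation above both loads can issue immediately, leaving only a conditional move and a 1-cycle shift on the dependency chain.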

PiperOrigin-RevId: 303875535
Snappy Team, 2020-03-31 02:46:46 +00:00 (committed by Victor Costan)
parent 4dfcad9f4e
commit d674348a0c
2 changed files with 90 additions and 31 deletions


@@ -89,7 +89,7 @@ char* CompressFragment(const char* input,
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition populate *data with the next 8 bytes from the end of the match.
// In addition populate *data with the next 5 bytes from the end of the match.
// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
// that on some architectures this can be done faster in this routine than subsequent
// loading from s2 + n.
@@ -113,23 +113,65 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
uint64 a1 = UNALIGNED_LOAD64(s1);
uint64 a2 = UNALIGNED_LOAD64(s2);
if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
// This code is critical for performance. The reason is that it determines
// how much to advance `ip` (s2). This obviously depends on both the loads
// from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
// depends on the advanced `ip` calculated here through a load, hash and
// new candidate hash lookup (a lot of cycles). This makes s1 (i.e.
// `candidate`) the variable that limits throughput. This is the reason we
// go through hoops to have this function update `data` for the next iter.
// The straightforward code would use *data, given by
//
// *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
//
// as input for the hash table lookup to find next candidate. However
// this forces the load on the data dependency chain of s1, because
// matched_bytes directly depends on s1. However matched_bytes is 0..7, so
// we can also calculate *data by
//
// *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
// matched_bytes);
//
// The loads do not depend on s1 anymore and are thus off the bottleneck.
// The straightforward implementation on x86_64 would be to use
//
// shrd rax, rdx, cl (cl being matched_bytes * 8)
//
// unfortunately shrd with a variable shift has a 4 cycle latency. So this
// only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
// shift instruction but can only shift 64 bits. If we focus on just
// obtaining the least significant 4 bytes, we can obtain this by
//
// *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
// UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8));
//
// Written like the above, this is not a big win: the conditional move would be
// a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
// However matched_bytes < 4 is equal to static_cast<uint32>(xorval) != 0.
// Written that way, the conditional move (2 cycles) can execute in parallel
// with FindLSBSetNonZero64 (tzcnt), which takes 3 cycles.
uint64 xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
// Unfortunately the compiler cannot find this using the obvious c++ code
// *data = shift == 0 ? a2 : (a2 >> shift) | (a3 << (64 - shift));
// the reason is that the above needs the conditional clause to guard
// against UB when shift == 0. The compiler doesn't realize the full
// expression can be lowered into a single "shrd" instruction and in
// effect the conditional can be ignored.
uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
asm ("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
*data = a2;
// Ideally this would just be
//
// a2 = static_cast<uint32>(xorval) == 0 ? a3 : a2;
//
// However clang correctly infers that the above statement participates in
// a critical data dependency chain and thus, unfortunately, refuses to
// use a conditional move (it's tuned to cut data dependencies). In this
// case there is a longer parallel chain anyway AND this will be fairly
// unpredictable.
uint64 a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
return std::pair<size_t, bool>(matched_bytes, true);
} else {
matched = 8;
@@ -154,11 +196,13 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
asm("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
*data = a2;
uint64 a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
matched += matched_bytes;
assert(matched >= 8);
return std::pair<size_t, bool>(matched, false);
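
An aside, not part of the patch: the single mask shift & (3 * 8) works on both sides of the cmov because for matched_bytes in 0..3 it equals matched_bytes * 8 relative to the load at s2, while for matched_bytes in 4..7 it equals (matched_bytes - 4) * 8 relative to the load at s2 + 4. A throwaway check of that arithmetic:

#include <cassert>

int main() {
  for (int shift = 0; shift < 64; ++shift) {
    int matched_bytes = shift >> 3;
    int offset_in_chosen_load =
        (matched_bytes < 4 ? matched_bytes : matched_bytes - 4) * 8;
    assert((shift & (3 * 8)) == offset_in_chosen_load);
  }
  return 0;
}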


@@ -516,16 +516,16 @@ char* CompressFragment(const char* input,
assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
const char* ip_end = input + input_size;
const char* base_ip = ip;
// Bytes in [next_emit, ip) will be emitted as literal bytes. Or
// [next_emit, ip_end) after the main loop.
const char* next_emit = ip;
const size_t kInputMarginBytes = 15;
if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
const char* ip_limit = input + input_size - kInputMarginBytes;
for (uint64 data = LittleEndian::Load64(++ip);;) {
assert(next_emit < ip);
for (uint32 preload = LittleEndian::Load32(ip + 1);;) {
// Bytes in [next_emit, ip) will be emitted as literal bytes. Or
// [next_emit, ip_end) after the main loop.
const char* next_emit = ip++;
uint64 data = LittleEndian::Load64(ip);
// The body of this loop calls EmitLiteral once and then EmitCopy one or
// more times. (The exception is that when we're close to exhausting
// the input we goto emit_remainder.)
@@ -559,14 +559,17 @@ char* CompressFragment(const char* input,
for (int j = 0; j < 4; j++) {
for (int k = 0; k < 4; k++) {
int i = 4 * j + k;
assert(static_cast<uint32>(data) == LittleEndian::Load32(ip + i));
uint32 hash = HashBytes(data, shift);
// These for-loops are meant to be unrolled. So we can freely
// special case the first iteration to use the value already
// loaded in preload.
uint32 dword = i == 0 ? preload : data;
assert(dword == LittleEndian::Load32(ip + i));
uint32 hash = HashBytes(dword, shift);
candidate = base_ip + table[hash];
assert(candidate >= base_ip);
assert(candidate < ip + i);
table[hash] = delta + i;
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) ==
static_cast<uint32>(data))) {
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
*op = LITERAL | (i << 2);
UnalignedCopy128(next_emit, op + 1);
ip += i;
@@ -587,6 +590,7 @@ char* CompressFragment(const char* input,
skip += bytes_between_hash_lookups;
const char* next_ip = ip + bytes_between_hash_lookups;
if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
ip = next_emit;
goto emit_remainder;
}
candidate = base_ip + table[hash];
@@ -632,11 +636,12 @@ char* CompressFragment(const char* input,
} else {
op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
}
next_emit = ip;
if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
assert(LittleEndian::Load64(ip) == data);
// Expect 5 bytes to match
assert((data & 0xFFFFFFFFFF) ==
(LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
// We are now looking for a 4-byte match again. We read
// table[Hash(ip, shift)] for that. To improve compression,
// we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
@@ -645,17 +650,27 @@ char* CompressFragment(const char* input,
uint32 hash = HashBytes(data, shift);
candidate = base_ip + table[hash];
table[hash] = ip - base_ip;
// Measurements on the benchmarks have shown the following probabilities
// for the loop to exit (i.e. the avg. number of iterations is the reciprocal).
// BM_Flat/6 txt1 p = 0.3-0.4
// BM_Flat/7 txt2 p = 0.35
// BM_Flat/8 txt3 p = 0.3-0.4
// BM_Flat/9 txt4 p = 0.34-0.4
// BM_Flat/10 pb p = 0.4
// BM_Flat/11 gaviota p = 0.1
// BM_Flat/12 cp p = 0.5
// BM_Flat/13 c p = 0.3
} while (static_cast<uint32>(data) == LittleEndian::Load32(candidate));
++ip;
data = LittleEndian::Load64(ip);
// Because the least significant 5 bytes matched, we can utilize data
// for the next iteration.
preload = data >> 8;
}
}
emit_remainder:
// Emit the remaining bytes as a literal
if (next_emit < ip_end) {
op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
ip_end - next_emit);
if (ip < ip_end) {
op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
}
return op;
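
A minimal sketch of idea 2, with assumed Load64LE/Load32LE helpers rather than the real LittleEndian API. In the patch, data is produced by FindMatchLength instead of being loaded explicitly; the sketch loads it only to stay self-contained. Its low 5 bytes are known to describe ip[0..4], so the 4-byte hash input for the search that restarts at ip + 1 is already available as data >> 8, with no new load on the critical path.

#include <cassert>
#include <cstdint>
#include <cstring>

// Assumed little-endian loads, standing in for LittleEndian::Load64/Load32.
static inline uint64_t Load64LE(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}
static inline uint32_t Load32LE(const char* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Reusing bytes 1..4 of the 8 bytes already known at ip gives the hash input
// for the next iteration (which starts at ip + 1) without another load.
uint32_t PreloadForNextSearch(const char* ip) {
  uint64_t data = Load64LE(ip);  // in the patch this value is already in hand
  uint32_t preload = static_cast<uint32_t>(data >> 8);
  assert(preload == Load32LE(ip + 1));  // what the loop would otherwise load
  return preload;
}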