Avoid store-forwarding stalls in Zippy's IncrementalCopy

NEW: Annotate `pattern` as initialized, for MSan. Snappy's IncrementalCopy routine optimizes for speed by reading and writing memory in blocks of eight or sixteen bytes. If the gap between the source and destination pointers is smaller than eight bytes, snappy's strategy is to expand the gap by issuing a series of partly-overlapping eight-byte loads+stores. Because the range of each load partly overlaps that of the store which preceded it, the store buffer cannot be forwarded to the load, and the load stalls while it waits for the store to retire. This is called a store-forwarding stall. We can use fewer loads and avoid most of the stalls by loading the first eight bytes into an 128-bit XMM register, then using PSHUFB to permute the register's contents in-place into the desired repeating sequence of bytes. When falling back to IncrementalCopySlow, use memset if the pattern size == 1. This eliminates around 60% of the stalls. name old time/op new time/op delta BM_UFlat/0 [html] 48.6µs ± 0% 48.2µs ± 0% -0.92% (p=0.000 n=19+18) BM_UFlat/1 [urls] 589µs ± 0% 576µs ± 0% -2.17% (p=0.000 n=19+18) BM_UFlat/2 [jpg] 7.12µs ± 0% 7.10µs ± 0% ~ (p=0.071 n=19+18) BM_UFlat/3 [jpg_200] 162ns ± 0% 151ns ± 0% -7.06% (p=0.000 n=19+18) BM_UFlat/4 [pdf] 8.25µs ± 0% 8.19µs ± 0% -0.74% (p=0.000 n=19+18) BM_UFlat/5 [html4] 218µs ± 0% 218µs ± 0% +0.09% (p=0.000 n=17+18) BM_UFlat/6 [txt1] 191µs ± 0% 189µs ± 0% -1.12% (p=0.000 n=19+18) BM_UFlat/7 [txt2] 168µs ± 0% 167µs ± 0% -1.01% (p=0.000 n=19+18) BM_UFlat/8 [txt3] 502µs ± 0% 499µs ± 0% -0.52% (p=0.000 n=19+18) BM_UFlat/9 [txt4] 704µs ± 0% 695µs ± 0% -1.26% (p=0.000 n=19+18) BM_UFlat/10 [pb] 45.6µs ± 0% 44.2µs ± 0% -3.13% (p=0.000 n=19+15) BM_UFlat/11 [gaviota] 188µs ± 0% 194µs ± 0% +3.06% (p=0.000 n=15+18) BM_UFlat/12 [cp] 15.1µs ± 2% 14.7µs ± 1% -2.09% (p=0.000 n=18+18) BM_UFlat/13 [c] 7.38µs ± 0% 7.36µs ± 0% -0.28% (p=0.000 n=16+18) BM_UFlat/14 [lsp] 2.31µs ± 0% 2.37µs ± 0% +2.64% (p=0.000 n=19+18) BM_UFlat/15 [xls] 984µs ± 0% 909µs ± 0% -7.59% (p=0.000 n=19+18) BM_UFlat/16 [xls_200] 215ns ± 0% 217ns ± 0% +0.71% (p=0.000 n=19+15) BM_UFlat/17 [bin] 289µs ± 0% 287µs ± 0% -0.71% (p=0.000 n=19+18) BM_UFlat/18 [bin_200] 161ns ± 0% 116ns ± 0% -28.09% (p=0.000 n=19+16) BM_UFlat/19 [sum] 31.9µs ± 0% 29.2µs ± 0% -8.37% (p=0.000 n=19+18) BM_UFlat/20 [man] 3.13µs ± 1% 3.07µs ± 0% -1.79% (p=0.000 n=19+18) name old allocs/op new allocs/op delta BM_UFlat/0 [html] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/1 [urls] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/2 [jpg] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/3 [jpg_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/4 [pdf] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/5 [html4] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/6 [txt1] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/7 [txt2] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/8 [txt3] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/9 [txt4] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/10 [pb] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/11 [gaviota] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/12 [cp] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/13 [c] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/14 [lsp] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/15 [xls] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/16 [xls_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/17 [bin] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/18 [bin_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/19 [sum] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/20 [man] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) name old speed new speed delta BM_UFlat/0 [html] 2.11GB/s ± 0% 2.13GB/s ± 0% +0.92% (p=0.000 n=19+18) BM_UFlat/1 [urls] 1.19GB/s ± 0% 1.22GB/s ± 0% +2.22% (p=0.000 n=16+17) BM_UFlat/2 [jpg] 17.3GB/s ± 0% 17.3GB/s ± 0% ~ (p=0.074 n=19+18) BM_UFlat/3 [jpg_200] 1.23GB/s ± 0% 1.33GB/s ± 0% +7.58% (p=0.000 n=19+18) BM_UFlat/4 [pdf] 12.4GB/s ± 0% 12.5GB/s ± 0% +0.74% (p=0.000 n=19+18) BM_UFlat/5 [html4] 1.88GB/s ± 0% 1.88GB/s ± 0% -0.09% (p=0.000 n=18+18) BM_UFlat/6 [txt1] 798MB/s ± 0% 807MB/s ± 0% +1.13% (p=0.000 n=19+18) BM_UFlat/7 [txt2] 743MB/s ± 0% 751MB/s ± 0% +1.02% (p=0.000 n=19+18) BM_UFlat/8 [txt3] 850MB/s ± 0% 855MB/s ± 0% +0.52% (p=0.000 n=19+18) BM_UFlat/9 [txt4] 684MB/s ± 0% 693MB/s ± 0% +1.28% (p=0.000 n=19+18) BM_UFlat/10 [pb] 2.60GB/s ± 0% 2.69GB/s ± 0% +3.25% (p=0.000 n=19+16) BM_UFlat/11 [gaviota] 979MB/s ± 0% 950MB/s ± 0% -2.97% (p=0.000 n=15+18) BM_UFlat/12 [cp] 1.63GB/s ± 2% 1.67GB/s ± 1% +2.13% (p=0.000 n=18+18) BM_UFlat/13 [c] 1.51GB/s ± 0% 1.52GB/s ± 0% +0.29% (p=0.000 n=16+18) BM_UFlat/14 [lsp] 1.61GB/s ± 1% 1.57GB/s ± 0% -2.57% (p=0.000 n=19+18) BM_UFlat/15 [xls] 1.05GB/s ± 0% 1.13GB/s ± 0% +8.22% (p=0.000 n=19+18) BM_UFlat/16 [xls_200] 928MB/s ± 0% 921MB/s ± 0% -0.81% (p=0.000 n=19+17) BM_UFlat/17 [bin] 1.78GB/s ± 0% 1.79GB/s ± 0% +0.71% (p=0.000 n=19+18) BM_UFlat/18 [bin_200] 1.24GB/s ± 0% 1.72GB/s ± 0% +38.92% (p=0.000 n=19+18) BM_UFlat/19 [sum] 1.20GB/s ± 0% 1.31GB/s ± 0% +9.15% (p=0.000 n=19+18) BM_UFlat/20 [man] 1.35GB/s ± 1% 1.38GB/s ± 0% +1.84% (p=0.000 n=19+18)
2018-03-26 21:55:23 -07:00 · 2018-03-26 21:55:23 -07:00 · 8f469d97e2
parent 4f7bd2dbfd
commit 8f469d97e2
3 changed files with 58 additions and 1 deletions
--- a/snappy-internal.h
+++ b/snappy-internal.h
@ -218,6 +218,19 @@ static const uint16 char_table[256] = {
  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
 };

+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) static const char pshufb_fill_patterns[7][16] = {
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+  {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+  {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
 }  // end namespace internal
 }  // end namespace snappy

--- a/snappy-stubs-internal.h
+++ b/snappy-stubs-internal.h
@ -53,6 +53,18 @@
 #include <intrin.h>
 #endif  // defined(_MSC_VER)

+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \
+    __msan_unpoison((address), (size))
+#else
+#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) /* empty */
+#endif  // __has_feature(memory_sanitizer)
+
 #include "snappy-stubs-public.h"

 #if defined(__x86_64__)
--- a/snappy.cc
+++ b/snappy.cc
@ -40,7 +40,7 @@
 #endif

 #if SNAPPY_HAVE_SSE2
-#include <emmintrin.h>
+#include <x86intrin.h>
 #endif
 #include <stdio.h>

@ -56,6 +56,7 @@ using internal::COPY_2_BYTE_OFFSET;
 using internal::LITERAL;
 using internal::char_table;
 using internal::kMaximumTagLength;
+using internal::pshufb_fill_patterns;

 // Any hash function will produce a valid compressed bitstream, but a good
 // hash function reduces the number of collisions and thus yields better
@ -182,6 +183,36 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,

  // Handle the uncommon case where pattern is less than 8 bytes.
  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+#if SNAPPY_HAVE_SSE2
+    // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+    // to permute the register's contents in-place into a repeating sequence of
+    // the first "pattern_size" bytes.
+    // For example, suppose:
+    //    src       == "abc"
+    //    op        == op + 3
+    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+    // followed by one byte of slop: abcabcabcabcabca.
+    //
+    // The non-SSE fallback implementation suffers from store-forwarding stalls
+    // because its loads and stores partly overlap. By expanding the pattern
+    // in-place, we avoid the penalty.
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+      const __m128i shuffle_mask = _mm_load_si128(
+          reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+          + pattern_size - 1);
+      const __m128i pattern = _mm_shuffle_epi8(
+          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+      // Uninitialized bytes are masked out by the shuffle mask.
+      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+      pattern_size *= 16 / pattern_size;
+      while (op < op_limit && op <= buf_limit - 16) {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        op += pattern_size;
+      }
+      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+    }
+    return IncrementalCopySlow(src, op, op_limit);
+#else
    // If plenty of buffer space remains, expand the pattern to at least 8
    // bytes. The way the following loop is written, we need 8 bytes of buffer
    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
@ -198,6 +229,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
    } else {
      return IncrementalCopySlow(src, op, op_limit);
    }
+#endif
  }
  assert(pattern_size >= 8);