Avoid store-forwarding stalls in Zippy's IncrementalCopy

NEW: Annotate `pattern` as initialized, for MSan.

Snappy's IncrementalCopy routine optimizes for speed by reading and writing
memory in blocks of eight or sixteen bytes. If the gap between the source
and destination pointers is smaller than eight bytes, snappy's strategy is
to expand the gap by issuing a series of partly-overlapping eight-byte
loads+stores. Because the range of each load partly overlaps that of the
store which preceded it, the store buffer cannot be forwarded to the load,
and the load stalls while it waits for the store to retire. This is called a
store-forwarding stall.

We can use fewer loads and avoid most of the stalls by loading the first
eight bytes into an 128-bit XMM register, then using PSHUFB to permute the
register's contents in-place into the desired repeating sequence of bytes.
When falling back to IncrementalCopySlow, use memset if the pattern size == 1.
This eliminates around 60% of the stalls.

name                       old time/op    new time/op    delta
BM_UFlat/0 [html]        48.6µs ± 0%    48.2µs ± 0%   -0.92%        (p=0.000 n=19+18)
BM_UFlat/1 [urls]         589µs ± 0%     576µs ± 0%   -2.17%        (p=0.000 n=19+18)
BM_UFlat/2 [jpg]         7.12µs ± 0%    7.10µs ± 0%     ~           (p=0.071 n=19+18)
BM_UFlat/3 [jpg_200]      162ns ± 0%     151ns ± 0%   -7.06%        (p=0.000 n=19+18)
BM_UFlat/4 [pdf]         8.25µs ± 0%    8.19µs ± 0%   -0.74%        (p=0.000 n=19+18)
BM_UFlat/5 [html4]        218µs ± 0%     218µs ± 0%   +0.09%        (p=0.000 n=17+18)
BM_UFlat/6 [txt1]         191µs ± 0%     189µs ± 0%   -1.12%        (p=0.000 n=19+18)
BM_UFlat/7 [txt2]         168µs ± 0%     167µs ± 0%   -1.01%        (p=0.000 n=19+18)
BM_UFlat/8 [txt3]         502µs ± 0%     499µs ± 0%   -0.52%        (p=0.000 n=19+18)
BM_UFlat/9 [txt4]         704µs ± 0%     695µs ± 0%   -1.26%        (p=0.000 n=19+18)
BM_UFlat/10 [pb]         45.6µs ± 0%    44.2µs ± 0%   -3.13%        (p=0.000 n=19+15)
BM_UFlat/11 [gaviota]     188µs ± 0%     194µs ± 0%   +3.06%        (p=0.000 n=15+18)
BM_UFlat/12 [cp]         15.1µs ± 2%    14.7µs ± 1%   -2.09%        (p=0.000 n=18+18)
BM_UFlat/13 [c]          7.38µs ± 0%    7.36µs ± 0%   -0.28%        (p=0.000 n=16+18)
BM_UFlat/14 [lsp]        2.31µs ± 0%    2.37µs ± 0%   +2.64%        (p=0.000 n=19+18)
BM_UFlat/15 [xls]         984µs ± 0%     909µs ± 0%   -7.59%        (p=0.000 n=19+18)
BM_UFlat/16 [xls_200]     215ns ± 0%     217ns ± 0%   +0.71%        (p=0.000 n=19+15)
BM_UFlat/17 [bin]         289µs ± 0%     287µs ± 0%   -0.71%        (p=0.000 n=19+18)
BM_UFlat/18 [bin_200]     161ns ± 0%     116ns ± 0%  -28.09%        (p=0.000 n=19+16)
BM_UFlat/19 [sum]        31.9µs ± 0%    29.2µs ± 0%   -8.37%        (p=0.000 n=19+18)
BM_UFlat/20 [man]        3.13µs ± 1%    3.07µs ± 0%   -1.79%        (p=0.000 n=19+18)

name                       old allocs/op  new allocs/op  delta
BM_UFlat/0 [html]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/1 [urls]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/2 [jpg]          0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/3 [jpg_200]      0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/4 [pdf]          0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/5 [html4]        0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/6 [txt1]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/7 [txt2]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/8 [txt3]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/9 [txt4]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/10 [pb]          0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/11 [gaviota]     0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/12 [cp]          0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/13 [c]           0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/14 [lsp]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/15 [xls]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/16 [xls_200]     0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/17 [bin]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/18 [bin_200]     0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/19 [sum]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)
BM_UFlat/20 [man]         0.00 ±NaN%     0.00 ±NaN%     ~     (all samples are equal)

name                       old speed      new speed      delta
BM_UFlat/0 [html]      2.11GB/s ± 0%  2.13GB/s ± 0%   +0.92%        (p=0.000 n=19+18)
BM_UFlat/1 [urls]      1.19GB/s ± 0%  1.22GB/s ± 0%   +2.22%        (p=0.000 n=16+17)
BM_UFlat/2 [jpg]       17.3GB/s ± 0%  17.3GB/s ± 0%     ~           (p=0.074 n=19+18)
BM_UFlat/3 [jpg_200]   1.23GB/s ± 0%  1.33GB/s ± 0%   +7.58%        (p=0.000 n=19+18)
BM_UFlat/4 [pdf]       12.4GB/s ± 0%  12.5GB/s ± 0%   +0.74%        (p=0.000 n=19+18)
BM_UFlat/5 [html4]     1.88GB/s ± 0%  1.88GB/s ± 0%   -0.09%        (p=0.000 n=18+18)
BM_UFlat/6 [txt1]       798MB/s ± 0%   807MB/s ± 0%   +1.13%        (p=0.000 n=19+18)
BM_UFlat/7 [txt2]       743MB/s ± 0%   751MB/s ± 0%   +1.02%        (p=0.000 n=19+18)
BM_UFlat/8 [txt3]       850MB/s ± 0%   855MB/s ± 0%   +0.52%        (p=0.000 n=19+18)
BM_UFlat/9 [txt4]       684MB/s ± 0%   693MB/s ± 0%   +1.28%        (p=0.000 n=19+18)
BM_UFlat/10 [pb]       2.60GB/s ± 0%  2.69GB/s ± 0%   +3.25%        (p=0.000 n=19+16)
BM_UFlat/11 [gaviota]   979MB/s ± 0%   950MB/s ± 0%   -2.97%        (p=0.000 n=15+18)
BM_UFlat/12 [cp]       1.63GB/s ± 2%  1.67GB/s ± 1%   +2.13%        (p=0.000 n=18+18)
BM_UFlat/13 [c]        1.51GB/s ± 0%  1.52GB/s ± 0%   +0.29%        (p=0.000 n=16+18)
BM_UFlat/14 [lsp]      1.61GB/s ± 1%  1.57GB/s ± 0%   -2.57%        (p=0.000 n=19+18)
BM_UFlat/15 [xls]      1.05GB/s ± 0%  1.13GB/s ± 0%   +8.22%        (p=0.000 n=19+18)
BM_UFlat/16 [xls_200]   928MB/s ± 0%   921MB/s ± 0%   -0.81%        (p=0.000 n=19+17)
BM_UFlat/17 [bin]      1.78GB/s ± 0%  1.79GB/s ± 0%   +0.71%        (p=0.000 n=19+18)
BM_UFlat/18 [bin_200]  1.24GB/s ± 0%  1.72GB/s ± 0%  +38.92%        (p=0.000 n=19+18)
BM_UFlat/19 [sum]      1.20GB/s ± 0%  1.31GB/s ± 0%   +9.15%        (p=0.000 n=19+18)
BM_UFlat/20 [man]      1.35GB/s ± 1%  1.38GB/s ± 0%   +1.84%        (p=0.000 n=19+18)
This commit is contained in:
atdt 2018-03-26 21:55:23 -07:00 committed by Victor Costan
parent 4f7bd2dbfd
commit 8f469d97e2
3 changed files with 58 additions and 1 deletions

View File

@ -218,6 +218,19 @@ static const uint16 char_table[256] = {
0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
}; };
// This is a table of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
alignas(16) static const char pshufb_fill_patterns[7][16] = {
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
{0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
{0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
{0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
};
} // end namespace internal } // end namespace internal
} // end namespace snappy } // end namespace snappy

View File

@ -53,6 +53,18 @@
#include <intrin.h> #include <intrin.h>
#endif // defined(_MSC_VER) #endif // defined(_MSC_VER)
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \
__msan_unpoison((address), (size))
#else
#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) /* empty */
#endif // __has_feature(memory_sanitizer)
#include "snappy-stubs-public.h" #include "snappy-stubs-public.h"
#if defined(__x86_64__) #if defined(__x86_64__)

View File

@ -40,7 +40,7 @@
#endif #endif
#if SNAPPY_HAVE_SSE2 #if SNAPPY_HAVE_SSE2
#include <emmintrin.h> #include <x86intrin.h>
#endif #endif
#include <stdio.h> #include <stdio.h>
@ -56,6 +56,7 @@ using internal::COPY_2_BYTE_OFFSET;
using internal::LITERAL; using internal::LITERAL;
using internal::char_table; using internal::char_table;
using internal::kMaximumTagLength; using internal::kMaximumTagLength;
using internal::pshufb_fill_patterns;
// Any hash function will produce a valid compressed bitstream, but a good // Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better // hash function reduces the number of collisions and thus yields better
@ -182,6 +183,36 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// Handle the uncommon case where pattern is less than 8 bytes. // Handle the uncommon case where pattern is less than 8 bytes.
if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) { if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
#if SNAPPY_HAVE_SSE2
// Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
// to permute the register's contents in-place into a repeating sequence of
// the first "pattern_size" bytes.
// For example, suppose:
// src == "abc"
// op == op + 3
// After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
// followed by one byte of slop: abcabcabcabcabca.
//
// The non-SSE fallback implementation suffers from store-forwarding stalls
// because its loads and stores partly overlap. By expanding the pattern
// in-place, we avoid the penalty.
if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
const __m128i shuffle_mask = _mm_load_si128(
reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ pattern_size - 1);
const __m128i pattern = _mm_shuffle_epi8(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
// Uninitialized bytes are masked out by the shuffle mask.
SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
pattern_size *= 16 / pattern_size;
while (op < op_limit && op <= buf_limit - 16) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
op += pattern_size;
}
if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
}
return IncrementalCopySlow(src, op, op_limit);
#else
// If plenty of buffer space remains, expand the pattern to at least 8 // If plenty of buffer space remains, expand the pattern to at least 8
// bytes. The way the following loop is written, we need 8 bytes of buffer // bytes. The way the following loop is written, we need 8 bytes of buffer
// space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10 // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
@ -198,6 +229,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
} else { } else {
return IncrementalCopySlow(src, op, op_limit); return IncrementalCopySlow(src, op, op_limit);
} }
#endif
} }
assert(pattern_size >= 8); assert(pattern_size >= 8);