Optimize zippy decompression by making IncrementalCopy faster.
When SSSE3 is available:
- Use PSHUFB (_mm_shuffle_epi8) to handle pattern sizes 1 to 15 (previously it handled sizes 1 to 7).
- This enables 16-byte copies instead of 8-byte copies, because we know the expanded pattern is at least 16 bytes.
- Use a shuffle-reshuffle strategy to generate the next pattern after loading the initial pattern. This enables us to write 4 conditionals (similar to when pattern size >= 16), which allows FDO to lay out the code with respect to the actual probabilities of each length.
- The PSHUFB masks are now generated programmatically at compile time.

When SSSE3 is unavailable:
- No change.

In both cases:
- assert(op < op_limit) in IncrementalCopy so that we can check 'op_limit <= buf_limit - 15' instead of 'op_limit <= buf_limit - 16'. All existing call sites of IncrementalCopy guarantee this.

The 'bin' case is notably >20% faster because it has many repeated single-character patterns (i.e. pattern_size = 1).

PiperOrigin-RevId: 346454471
This commit is contained in:
parent 56c2c247d0
commit a9730ed505

snappy.cc | 284
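The shuffle-reshuffle idea above can be modeled without SSE. The following scalar sketch is illustrative only: MakeMask and ShuffleBytes are hypothetical stand-ins for the patch's constexpr mask tables and _mm_shuffle_epi8, but the byte-index arithmetic, (index_offset + i) % pattern_size, is the same one the new code uses.

// Illustrative scalar model of the shuffle-reshuffle strategy; MakeMask and
// ShuffleBytes are hypothetical names, not part of the patch.
#include <array>
#include <cstdio>

using Block = std::array<char, 16>;

// Byte-index mask: entry i selects source byte (index_offset + i) % pattern_size,
// mirroring MakePatternMaskBytes in the diff below.
Block MakeMask(int index_offset, int pattern_size) {
  Block mask{};
  for (int i = 0; i < 16; ++i) {
    mask[i] = static_cast<char>((index_offset + i) % pattern_size);
  }
  return mask;
}

// Scalar stand-in for PSHUFB: out[i] = src[mask[i]].
Block ShuffleBytes(const Block& src, const Block& mask) {
  Block out{};
  for (int i = 0; i < 16; ++i) out[i] = src[static_cast<int>(mask[i])];
  return out;
}

int main() {
  const int pattern_size = 6;                  // e.g. a copy with offset 6
  Block src = {'a', 'b', 'c', 'd', 'e', 'f'};  // only the first 6 bytes matter

  // One shuffle turns the 6-byte pattern into a full 16-byte block; each
  // reshuffle produces the next 16 bytes of the repeating pattern in-place.
  Block pattern = ShuffleBytes(src, MakeMask(0, pattern_size));    // abcdefabcdefabcd
  Block next = ShuffleBytes(pattern, MakeMask(16, pattern_size));  // efabcdefabcdefab

  std::printf("%.16s\n%.16s\n", pattern.data(), next.data());
  return 0;
}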
--- a/snappy.cc
+++ b/snappy.cc
@@ -74,6 +74,7 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
+#include <utility>
 #include <vector>

 namespace snappy {
@@ -178,6 +179,16 @@ void UnalignedCopy128(const void* src, void* dst) {
   std::memcpy(dst, tmp, 16);
 }

+template <bool use_16bytes_chunk>
+inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
+  if (use_16bytes_chunk) {
+    UnalignedCopy128(src, dst);
+  } else {
+    UnalignedCopy64(src, dst);
+    UnalignedCopy64(src + 8, dst + 8);
+  }
+}
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
 // for handling COPY operations where the input and output regions may overlap.
 // For example, suppose:
@@ -205,36 +216,164 @@ inline char* IncrementalCopySlow(const char* src, char* op,

 #if SNAPPY_HAVE_SSSE3

-// This is a table of shuffle control masks that can be used as the source
+// Computes the bytes for shuffle control mask (please read comments on
+// 'pattern_generation_masks' as well) for the given index_offset and
+// pattern_size. For example, when the 'offset' is 6, it will generate a
+// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
+// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
+// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
+// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
+// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
+// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
+template <size_t... indexes>
+inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
+    int index_offset, int pattern_size, index_sequence<indexes...>) {
+  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
+}
+
+// Computes the shuffle control mask bytes array for given pattern-sizes and
+// returns an array.
+template <size_t... pattern_sizes_minus_one>
+inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                            sizeof...(pattern_sizes_minus_one)>
+MakePatternMaskBytesTable(int index_offset,
+                          index_sequence<pattern_sizes_minus_one...>) {
+  return {MakePatternMaskBytes(
+      index_offset, pattern_sizes_minus_one + 1,
+      make_index_sequence</*indexes=*/sizeof(__m128i)>())...};
+}
+
+// This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) const char pshufb_fill_patterns[7][16] = {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
-    {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
-    {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
-    {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
-    {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
-};
-
-// j * (16 / j) for all j from 0 to 7. 0 is not actually used.
-const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                                        16> pattern_generation_masks =
+    MakePatternMaskBytesTable(
+        /*index_offset=*/0,
+        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
+
+// Similar to 'pattern_generation_masks', this table is used to "rotate" the
+// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
+// Basically, pattern_reshuffle_masks is a continuation of
+// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
+// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+                                        16> pattern_reshuffle_masks =
+    MakePatternMaskBytesTable(
+        /*index_offset=*/16,
+        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline __m128i LoadPattern(const char* src, const size_t pattern_size) {
+  __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+      pattern_generation_masks[pattern_size - 1].data()));
+  // Uninitialized bytes are masked out by the shuffle mask.
+  // TODO: remove annotation and macro defs once MSan is fixed.
+  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
+  return _mm_shuffle_epi8(
+      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask);
+}
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */>
+LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
+  __m128i pattern = LoadPattern(src, pattern_size);
+
+  // This mask will generate the next 16 bytes in-place. Doing so enables us to
+  // write data by at most 4 _mm_storeu_si128.
+  //
+  // For example, suppose pattern is:        abcdefabcdefabcd
+  // Shuffling with this mask will generate: efabcdefabcdefab
+  // Shuffling again will generate:          cdefabcdefabcdef
+  __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+      pattern_reshuffle_masks[pattern_size - 1].data()));
+  return {pattern, reshuffle_mask};
+}

 #endif  // SNAPPY_HAVE_SSSE3

+// Fallback for when we need to copy while extending the pattern, for example
+// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
+//
+// REQUIRES: [dst - offset, dst + 64) is a valid address range.
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
+#if SNAPPY_HAVE_SSSE3
+  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
+    switch (offset) {
+      case 0:
+        return false;
+      case 1: {
+        std::memset(dst, dst[-1], 64);
+        return true;
+      }
+      case 2:
+      case 4:
+      case 8:
+      case 16: {
+        __m128i pattern = LoadPattern(dst - offset, offset);
+        for (int i = 0; i < 4; i++) {
+          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
+        }
+        return true;
+      }
+      default: {
+        auto pattern_and_reshuffle_mask =
+            LoadPatternAndReshuffleMask(dst - offset, offset);
+        __m128i pattern = pattern_and_reshuffle_mask.first;
+        __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+        for (int i = 0; i < 4; i++) {
+          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
+          pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        }
+        return true;
+      }
+    }
+  }
+#else
+  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
+    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
+    // Extend the pattern to the first 16 bytes.
+    for (int i = 0; i < 16; i++) dst[i] = dst[i - offset];
+    // Find a multiple of pattern >= 16.
+    static std::array<uint8_t, 16> pattern_sizes = []() {
+      std::array<uint8_t, 16> res;
+      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
+      return res;
+    }();
+    offset = pattern_sizes[offset];
+    for (int i = 1; i < 4; i++) {
+      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
+    }
+    return true;
+  }
+#endif  // SNAPPY_HAVE_SSSE3
+
+  // Very rare.
+  for (int i = 0; i < 4; i++) {
+    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
+  }
+  return true;
+}
+
 // Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
 inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                              char* const buf_limit) {
+#if SNAPPY_HAVE_SSSE3
+  constexpr int big_pattern_size_lower_bound = 16;
+#else
+  constexpr int big_pattern_size_lower_bound = 8;
+#endif
+
   // Terminology:
   //
   // slop = buf_limit - op
   // pat = op - src
-  // len = limit - op
+  // len = op_limit - op
   assert(src < op);
-  assert(op <= op_limit);
+  assert(op < op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
   assert(op_limit - op <= 64);
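Since the mask tables in the hunk above are constexpr, the documented size-6 example can be checked at compile time. The sketch below is self-contained and hedged: it re-declares a MakeMaskBytes helper using std::index_sequence (C++14) instead of snappy's internal index_sequence alias, so it only mirrors MakePatternMaskBytes rather than reusing it.

// Compile-time check of the mask contents (illustrative only; assumes C++14).
#include <array>
#include <cstddef>
#include <utility>

template <std::size_t... indexes>
constexpr std::array<char, sizeof...(indexes)> MakeMaskBytes(
    int index_offset, int pattern_size, std::index_sequence<indexes...>) {
  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
}

// Generation mask for pattern_size 6: 0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3.
constexpr auto kGen6 = MakeMaskBytes(0, 6, std::make_index_sequence<16>());
// Reshuffle mask for pattern_size 6 continues where generation left off:
// 4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1.
constexpr auto kReshuffle6 = MakeMaskBytes(16, 6, std::make_index_sequence<16>());

static_assert(kGen6[6] == 0 && kGen6[15] == 3, "generation mask repeats 0..5");
static_assert(kReshuffle6[0] == 4 && kReshuffle6[2] == 0,
              "reshuffle mask starts at byte 4 of the pattern");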
@@ -265,11 +404,13 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // input. In general if we always predict len <= 16 it would be an ok
   // prediction.
   //
-  // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
-  // copying 2x 8 bytes at a time.
+  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
+  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
+  // time.

-  // Handle the uncommon case where pattern is less than 8 bytes.
-  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
+  // bytes.
+  if (pattern_size < big_pattern_size_lower_bound) {
 #if SNAPPY_HAVE_SSSE3
     // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
     // to permute the register's contents in-place into a repeating sequence of
@@ -283,24 +424,55 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // The non-SSE fallback implementation suffers from store-forwarding stalls
     // because its loads and stores partly overlap. By expanding the pattern
     // in-place, we avoid the penalty.
-    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
-      const __m128i shuffle_mask = _mm_load_si128(
-          reinterpret_cast<const __m128i*>(pshufb_fill_patterns) +
-          pattern_size - 1);
-      const __m128i pattern = _mm_shuffle_epi8(
-          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
-      // Uninitialized bytes are masked out by the shuffle mask.
-      // TODO: remove annotation and macro defs once MSan is fixed.
-      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
-      pattern_size = pattern_size_table[pattern_size];
-      char* op_end = std::min(op_limit, buf_limit - 15);
-      while (op < op_end) {
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-        op += pattern_size;
-      }
-      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
-    }
-    return IncrementalCopySlow(src, op, op_limit);
+
+    // Typically, the op_limit is the gating factor so try to simplify the loop
+    // based on that.
+    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+      auto pattern_and_reshuffle_mask =
+          LoadPatternAndReshuffleMask(src, pattern_size);
+      __m128i pattern = pattern_and_reshuffle_mask.first;
+      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+
+      // There is at least one, and at most four 16-byte blocks. Writing four
+      // conditionals instead of a loop allows FDO to layout the code with
+      // respect to the actual probabilities of each length.
+      // TODO: Replace with loop with trip count hint.
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+
+      if (op + 16 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
+      }
+      if (op + 32 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
+      }
+      if (op + 48 < op_limit) {
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
+      }
+      return op_limit;
+    }
+    char* const op_end = buf_limit - 15;
+    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
+      auto pattern_and_reshuffle_mask =
+          LoadPatternAndReshuffleMask(src, pattern_size);
+      __m128i pattern = pattern_and_reshuffle_mask.first;
+      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+
+      // This code path is relatively cold however so we save code size
+      // by avoiding unrolling and vectorizing.
+      //
+      // TODO: Remove pragma when when cold regions don't get
+      // vectorized or unrolled.
+#pragma nounroll
+      do {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        op += 16;
+      } while (SNAPPY_PREDICT_TRUE(op < op_end));
+    }
+    return IncrementalCopySlow(op - pattern_size, op, op_limit);
 #else  // !SNAPPY_HAVE_SSSE3
     // If plenty of buffer space remains, expand the pattern to at least 8
     // bytes. The way the following loop is written, we need 8 bytes of buffer
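A note on why 'op_limit <= buf_limit - 15' is enough headroom for the 16-byte stores in the fast path above: combined with the new assert(op < op_limit), every store that executes begins strictly below op_limit, so its last byte lands at or before buf_limit. The snippet below is an illustrative stand-alone sketch of that arithmetic (CheckStoreBounds is not part of the patch).

// Illustrative bounds check for the fast path: every executed 16-byte store
// starts strictly below op_limit, so
// p + 16 <= (op_limit - 1) + 16 <= (buf_limit - 15) - 1 + 16 = buf_limit.
#include <cassert>
#include <cstddef>

void CheckStoreBounds(std::ptrdiff_t op, std::ptrdiff_t op_limit,
                      std::ptrdiff_t buf_limit) {
  assert(op < op_limit);               // new IncrementalCopy precondition
  assert(op_limit <= buf_limit - 15);  // fast-path guard
  const std::ptrdiff_t starts[] = {op, op + 16, op + 32, op + 48};
  for (std::ptrdiff_t p : starts) {
    if (p < op_limit) {                // mirrors "if (op + k < op_limit)"
      assert(p + 16 <= buf_limit);     // the 16-byte store stays in bounds
    }
  }
}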
@@ -320,34 +492,30 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     }
 #endif  // SNAPPY_HAVE_SSSE3
   }
-  assert(pattern_size >= 8);
+  assert(pattern_size >= big_pattern_size_lower_bound);
+  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;

-  // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
-  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
-  // because expanding the pattern to at least 8 bytes guarantees that
-  // op - src >= 8.
+  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
+  // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
+  // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
+  // guarantees that op - src >= 8.
   //
   // Typically, the op_limit is the gating factor so try to simplify the loop
   // based on that.
-  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
     // There is at least one, and at most four 16-byte blocks. Writing four
     // conditionals instead of a loop allows FDO to layout the code with respect
     // to the actual probabilities of each length.
     // TODO: Replace with loop with trip count hint.
-    UnalignedCopy64(src, op);
-    UnalignedCopy64(src + 8, op + 8);
+    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
+
     if (op + 16 < op_limit) {
-      UnalignedCopy64(src + 16, op + 16);
-      UnalignedCopy64(src + 24, op + 24);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
     }
     if (op + 32 < op_limit) {
-      UnalignedCopy64(src + 32, op + 32);
-      UnalignedCopy64(src + 40, op + 40);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
     }
     if (op + 48 < op_limit) {
-      UnalignedCopy64(src + 48, op + 48);
-      UnalignedCopy64(src + 56, op + 56);
+      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
     }
     return op_limit;
   }
@@ -358,12 +526,9 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   //
   // TODO: Remove pragma when when cold regions don't get vectorized
   // or unrolled.
-#ifdef __clang__
-#pragma clang loop unroll(disable)
-#endif
+#pragma nounroll
   for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
-    UnalignedCopy64(src, op);
-    UnalignedCopy64(src + 8, op + 8);
+    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
   }
   if (op >= op_limit) return op_limit;

@@ -894,10 +1059,10 @@ std::pair<const uint8_t*, char*> DecompressBranchless(
         if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) {
           assert(tag_type != 0);
           // offset 0 is an error.
-          if (SNAPPY_PREDICT_FALSE(offset == 0)) break;
-          op = IncrementalCopy(op_base + delta, op_base + op, op_base + op + len,
-                               op_base + op_limit) -
-               op_base;
+          if (!Copy64BytesWithPatternExtension(op_base + op, offset)) {
+            break;
+          }
+          op += len;
           continue;
         }

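DecompressBranchless now calls Copy64BytesWithPatternExtension whenever the copy offset is smaller than the length, then simply advances op by len; the routine always materializes a full 64 bytes of pattern, which is why it requires [dst - offset, dst + 64) to be writable. A scalar model of that contract (PatternExtendScalar and Example are hypothetical names, not in the patch):

// Scalar model of what Copy64BytesWithPatternExtension guarantees (assumes
// offset > 0 and that [dst - offset, dst + 64) is writable, per the REQUIRES
// comment in the patch). By the time dst[i] is written, dst[i - offset]
// already holds the correct pattern byte, so e.g. "abc" at offset 3 grows
// into "abcabcabc...".
void PatternExtendScalar(char* dst, int offset) {
  for (int i = 0; i < 64; ++i) dst[i] = dst[i - offset];
}

// Usage sketch: buf starts with "abc"; the call fills the next 64 bytes with
// the repeated pattern, even if a caller only needs, say, 10 of them.
void Example() {
  char buf[3 + 64] = "abc";
  PatternExtendScalar(buf + 3, 3);  // buf now begins "abcabcabca..."
}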
@@ -1089,6 +1254,7 @@ class SnappyDecompressor {
       preload = LittleEndian::Load32(ip);
       const uint32_t trailer = ExtractLowBytes(preload, c & 3);
       const uint32_t length = entry & 0xff;
+      assert(length > 0);

       // copy_offset/256 is encoded in bits 8..10. By just fetching
       // those bits, we get copy_offset (since the bit-field starts at
@@ -1459,6 +1625,7 @@ class SnappyIOVecWriter {
     if (to_copy > len) {
       to_copy = len;
     }
+    assert(to_copy > 0);

     IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
                     curr_iov_output_, curr_iov_output_ + to_copy,
@@ -1552,6 +1719,7 @@ class SnappyArrayWriter {

   SNAPPY_ATTRIBUTE_ALWAYS_INLINE
   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    assert(len > 0);
     char* const op = *op_p;
     assert(op >= base_);
     char* const op_end = op + len;
--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@@ -695,6 +695,41 @@ TEST(Snappy, SimpleTests) {
   Verify("abcaaaaaaa" + std::string(65536, 'b') + std::string("aaaaa") + "abc");
 }

+// Regression test for cr/345340892.
+TEST(Snappy, AppendSelfPatternExtensionEdgeCases) {
+  Verify("abcabcabcabcabcabcab");
+  Verify("abcabcabcabcabcabcab0123456789ABCDEF");
+
+  Verify("abcabcabcabcabcabcabcabcabcabcabcabc");
+  Verify("abcabcabcabcabcabcabcabcabcabcabcabc0123456789ABCDEF");
+}
+
+// Regression test for cr/345340892.
+TEST(Snappy, AppendSelfPatternExtensionEdgeCasesExhaustive) {
+  std::mt19937 rng;
+  std::uniform_int_distribution<int> uniform_byte(0, 255);
+  for (int pattern_size = 1; pattern_size <= 18; ++pattern_size) {
+    for (int length = 1; length <= 64; ++length) {
+      for (int extra_bytes_after_pattern : {0, 1, 15, 16, 128}) {
+        const int size = pattern_size + length + extra_bytes_after_pattern;
+        std::string input;
+        input.resize(size);
+        for (int i = 0; i < pattern_size; ++i) {
+          input[i] = 'a' + i;
+        }
+        for (int i = 0; i < length; ++i) {
+          input[pattern_size + i] = input[i];
+        }
+        for (int i = 0; i < extra_bytes_after_pattern; ++i) {
+          input[pattern_size + length + i] =
+              static_cast<char>(uniform_byte(rng));
+        }
+        Verify(input);
+      }
+    }
+  }
+}
+
 // Verify max blowup (lots of four-byte copies)
 TEST(Snappy, MaxBlowup) {
   std::mt19937 rng;
@@ -1285,6 +1320,12 @@ static struct {
   { "gaviota", "kppkn.gtb", 0 },
 };

+TEST(Snappy, TestBenchmarkFiles) {
+  for (int i = 0; i < ARRAYSIZE(files); ++i) {
+    Verify(ReadTestDataFile(files[i].filename, files[i].size_limit));
+  }
+}
+
 static void BM_UFlat(int iters, int arg) {
   StopBenchmarkTiming();
