mirror of https://github.com/google/snappy.git
Enable vector byte shuffle optimizations on ARM NEON
The SSSE3 intrinsics we use have their direct analogues in NEON, so making this
optimization portable requires a very thin translation layer.

PiperOrigin-RevId: 381280165
parent b638ebe5d9
commit b3fb0b5b4b
@@ -36,6 +36,56 @@
 namespace snappy {
 namespace internal {
 
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#if SNAPPY_HAVE_SSSE3
+using V128 = __m128i;
+#else
+using V128 = uint8x16_t;
+#endif
+
+// Load 128 bits of integer data. `src` must be 16-byte aligned.
+inline V128 V128_Load(const V128* src);
+
+// Load 128 bits of integer data. `src` does not need to be aligned.
+inline V128 V128_LoadU(const V128* src);
+
+// Store 128 bits of integer data. `dst` does not need to be aligned.
+inline void V128_StoreU(V128* dst, V128 val);
+
+// Shuffle packed 8-bit integers using a shuffle mask.
+// Each packed integer in the shuffle mask must be in [0,16).
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+
+#if SNAPPY_HAVE_SSSE3
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return _mm_shuffle_epi8(input, shuffle_mask);
+}
+#else
+inline V128 V128_Load(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
+  return vqtbl1q_u8(input, shuffle_mask);
+}
+#endif
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+
 // Working memory performs a single allocation to hold all scratch space
 // required for compression.
 class WorkingMemory {
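For context on what the commit message calls a "very thin translation layer", here is a minimal standalone sketch (not part of the diff): code written against the V128 helpers stays identical on both targets, while the shuffle lowers to PSHUFB (_mm_shuffle_epi8) under SSSE3 and to TBL (vqtbl1q_u8) under NEON. The helper bodies mirror the hunk above; main(), the hard-coded 3-byte mask, and the target checks are illustrative assumptions only.

// Sketch only: repeats the 3-byte pattern "abc" across 16 bytes, the same
// trick the pattern tables in snappy.cc automate for every pattern size.
#include <cstdint>
#include <cstdio>

#if defined(__SSSE3__)
#include <tmmintrin.h>
using V128 = __m128i;
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return _mm_shuffle_epi8(input, mask);
}
#elif defined(__ARM_NEON)
#include <arm_neon.h>
using V128 = uint8x16_t;
inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}
inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return vqtbl1q_u8(input, mask);
}
#else
#error "This sketch assumes SSSE3 or NEON."
#endif

int main() {
  char src[16] = "abc";  // only the first 3 bytes matter; the rest is zero
  const char mask_bytes[16] = {0, 1, 2, 0, 1, 2, 0, 1,
                               2, 0, 1, 2, 0, 1, 2, 0};
  V128 data = V128_LoadU(reinterpret_cast<const V128*>(src));
  V128 mask = V128_LoadU(reinterpret_cast<const V128*>(mask_bytes));
  char out[17] = {};
  V128_StoreU(reinterpret_cast<V128*>(out), V128_Shuffle(data, mask));
  std::printf("%.16s\n", out);  // prints "abcabcabcabcabca"
  return 0;
}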
snappy.cc (108 lines changed)
@@ -30,17 +30,6 @@
 #include "snappy-sinksource.h"
 #include "snappy.h"
 
-#if !defined(SNAPPY_HAVE_SSSE3)
-// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
-// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
-// defines __AVX__ when AVX support is available.
-#if defined(__SSSE3__) || defined(__AVX__)
-#define SNAPPY_HAVE_SSSE3 1
-#else
-#define SNAPPY_HAVE_SSSE3 0
-#endif
-#endif  // !defined(SNAPPY_HAVE_SSSE3)
-
 #if !defined(SNAPPY_HAVE_BMI2)
 // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
 // specifically, but it does define __AVX2__ when AVX2 support is available.
@@ -56,12 +45,6 @@
 #endif
 #endif  // !defined(SNAPPY_HAVE_BMI2)
 
-#if SNAPPY_HAVE_SSSE3
-// Please do not replace with <x86intrin.h>. or with headers that assume more
-// advanced SSE versions without checking with all the OWNERS.
-#include <tmmintrin.h>
-#endif
-
 #if SNAPPY_HAVE_BMI2
 // Please do not replace with <x86intrin.h>. or with headers that assume more
 // advanced SSE versions without checking with all the OWNERS.
@@ -91,6 +74,13 @@ using internal::COPY_2_BYTE_OFFSET;
 using internal::COPY_4_BYTE_OFFSET;
 using internal::kMaximumTagLength;
 using internal::LITERAL;
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+using internal::V128;
+using internal::V128_Load;
+using internal::V128_LoadU;
+using internal::V128_Shuffle;
+using internal::V128_StoreU;
+#endif
 
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
@@ -228,7 +218,7 @@ inline char* IncrementalCopySlow(const char* src, char* op,
   return op_limit;
 }
 
-#if SNAPPY_HAVE_SSSE3
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
 // Computes the bytes for shuffle control mask (please read comments on
 // 'pattern_generation_masks' as well) for the given index_offset and
@@ -248,19 +238,19 @@ inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
 // Computes the shuffle control mask bytes array for given pattern-sizes and
 // returns an array.
 template <size_t... pattern_sizes_minus_one>
-inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+inline constexpr std::array<std::array<char, sizeof(V128)>,
                             sizeof...(pattern_sizes_minus_one)>
 MakePatternMaskBytesTable(int index_offset,
                           index_sequence<pattern_sizes_minus_one...>) {
-  return {MakePatternMaskBytes(
-      index_offset, pattern_sizes_minus_one + 1,
-      make_index_sequence</*indexes=*/sizeof(__m128i)>())...};
+  return {
+      MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
+                           make_index_sequence</*indexes=*/sizeof(V128)>())...};
 }
 
 // This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) constexpr std::array<std::array<char, sizeof(__m128i)>,
+alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
                                  16> pattern_generation_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/0,
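To make the mask tables concrete: per the comments above, each control byte for a given index_offset and pattern size is presumably (index_offset + i) % pattern_size, so for pattern_size = 3 the generation mask is {0, 1, 2, 0, 1, 2, ...}, which repeats the first three loaded bytes across the whole register. A plain scalar sketch of that computation follows; PatternMaskBytes and the formula are assumptions for illustration, not code from the diff.

// Scalar sketch: builds the 16 control bytes the diff would index as
// pattern_generation_masks[pattern_size - 1] (under the assumed formula).
#include <array>
#include <cstdio>

std::array<char, 16> PatternMaskBytes(int index_offset, int pattern_size) {
  std::array<char, 16> mask{};
  for (int i = 0; i < 16; ++i) {
    mask[i] = static_cast<char>((index_offset + i) % pattern_size);
  }
  return mask;
}

int main() {
  auto gen = PatternMaskBytes(/*index_offset=*/0, /*pattern_size=*/3);
  for (char b : gen) std::printf("%d ", b);  // 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0
  std::printf("\n");
  return 0;
}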
@@ -271,40 +261,40 @@ alignas(16) constexpr std::array<std::array<char, sizeof(__m128i)>,
 // Basically, pattern_reshuffle_masks is a continuation of
 // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
 // pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
-alignas(16) constexpr std::array<std::array<char, sizeof(__m128i)>,
+alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
                                  16> pattern_reshuffle_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/16,
         /*pattern_sizes_minus_one=*/make_index_sequence<16>());
 
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-static inline __m128i LoadPattern(const char* src, const size_t pattern_size) {
-  __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
+  V128 generation_mask = V128_Load(reinterpret_cast<const V128*>(
       pattern_generation_masks[pattern_size - 1].data()));
   // Uninitialized bytes are masked out by the shuffle mask.
   // TODO: remove annotation and macro defs once MSan is fixed.
   SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
-  return _mm_shuffle_epi8(
-      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask);
+  return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
+                      generation_mask);
 }
 
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */>
+static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
 LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
-  __m128i pattern = LoadPattern(src, pattern_size);
+  V128 pattern = LoadPattern(src, pattern_size);
 
   // This mask will generate the next 16 bytes in-place. Doing so enables us to
-  // write data by at most 4 _mm_storeu_si128.
+  // write data by at most 4 V128_StoreU.
   //
   // For example, suppose pattern is:        abcdefabcdefabcd
   // Shuffling with this mask will generate: efabcdefabcdefab
   // Shuffling again will generate:          cdefabcdefabcdef
-  __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
+  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(
       pattern_reshuffle_masks[pattern_size - 1].data()));
   return {pattern, reshuffle_mask};
 }
 
-#endif  // SNAPPY_HAVE_SSSE3
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
 // Fallback for when we need to copy while extending the pattern, for example
 // copying 10 bytes from 3 positions back abc -> abcabcabcabca.
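The abcdef example in the LoadPatternAndReshuffleMask comment above can be modeled without intrinsics: shuffling with the index_offset = 16 mask advances the repeating pattern by 16 positions modulo the pattern size, which is why one load plus repeated V128_Shuffle calls can emit an arbitrarily long repetition. The scalar sketch below reproduces the three lines from the comment; Block and Shuffle are illustrative stand-ins, not the diff's types.

// Scalar model of the reshuffle trick: applying the (16 + i) % pattern_size
// mask rotates the 16-byte block to the phase of the next 16 output bytes.
#include <array>
#include <cstdio>
#include <string>

using Block = std::array<char, 16>;

Block Shuffle(const Block& input, const std::array<int, 16>& mask) {
  Block out{};
  for (int i = 0; i < 16; ++i) out[i] = input[mask[i]];
  return out;
}

int main() {
  const std::string pattern = "abcdef";  // pattern_size = 6
  Block block{};
  for (int i = 0; i < 16; ++i) block[i] = pattern[i % 6];  // abcdefabcdefabcd
  std::array<int, 16> reshuffle{};
  for (int i = 0; i < 16; ++i) reshuffle[i] = (16 + i) % 6;
  for (int step = 0; step < 3; ++step) {
    std::printf("%.16s\n", block.data());
    block = Shuffle(block, reshuffle);
  }
  // Prints: abcdefabcdefabcd, efabcdefabcdefab, cdefabcdefabcdef
  return 0;
}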
@@ -312,7 +302,7 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
 // REQUIRES: [dst - offset, dst + 64) is a valid address range.
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
 static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
-#if SNAPPY_HAVE_SSSE3
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
   if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
     switch (offset) {
       case 0:
@@ -325,20 +315,20 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
       case 4:
       case 8:
       case 16: {
-        __m128i pattern = LoadPattern(dst - offset, offset);
+        V128 pattern = LoadPattern(dst - offset, offset);
         for (int i = 0; i < 4; i++) {
-          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
+          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
         }
         return true;
       }
       default: {
         auto pattern_and_reshuffle_mask =
             LoadPatternAndReshuffleMask(dst - offset, offset);
-        __m128i pattern = pattern_and_reshuffle_mask.first;
-        __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+        V128 pattern = pattern_and_reshuffle_mask.first;
+        V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
         for (int i = 0; i < 4; i++) {
-          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
-          pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
+          pattern = V128_Shuffle(pattern, reshuffle_mask);
         }
         return true;
       }
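The split between the case 4/8/16 branch and the default branch above comes down to divisibility: when 16 % offset == 0, one loaded 16-byte block ends exactly on a pattern boundary, so the same register can be stored four times unchanged; for other offsets each subsequent 16-byte block starts at a different phase of the pattern and must first be rotated with the reshuffle mask. A tiny check of that property, illustrative only:

// Lists which offsets let the copy store one unchanged 16-byte block four
// times: exactly those where 16 is a multiple of the offset (1, 2, 4, 8, 16).
#include <cstdio>

int main() {
  for (int offset = 1; offset <= 16; ++offset) {
    bool phase_preserved = (16 % offset) == 0;
    std::printf("offset %2d: %s\n", offset,
                phase_preserved ? "store same block 4x" : "needs reshuffle");
  }
  return 0;
}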
@@ -361,7 +351,7 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
     }
     return true;
   }
-#endif  // SNAPPY_HAVE_SSSE3
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
   // Very rare.
   for (int i = 0; i < 4; i++) {
@@ -375,7 +365,7 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
 // region of the buffer.
 inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                              char* const buf_limit) {
-#if SNAPPY_HAVE_SSSE3
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
   constexpr int big_pattern_size_lower_bound = 16;
 #else
   constexpr int big_pattern_size_lower_bound = 8;
@@ -425,14 +415,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
   // bytes.
   if (pattern_size < big_pattern_size_lower_bound) {
-#if SNAPPY_HAVE_SSSE3
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
     // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
     // to permute the register's contents in-place into a repeating sequence of
     // the first "pattern_size" bytes.
     // For example, suppose:
     //    src == "abc"
     //    op == op + 3
-    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+    // After V128_Shuffle(), "pattern" will have five copies of "abc"
     // followed by one byte of slop: abcabcabcabcabca.
     //
     // The non-SSE fallback implementation suffers from store-forwarding stalls
@@ -444,26 +434,26 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
-      __m128i pattern = pattern_and_reshuffle_mask.first;
-      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+      V128 pattern = pattern_and_reshuffle_mask.first;
+      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
 
       // There is at least one, and at most four 16-byte blocks. Writing four
       // conditionals instead of a loop allows FDO to layout the code with
       // respect to the actual probabilities of each length.
       // TODO: Replace with loop with trip count hint.
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+      V128_StoreU(reinterpret_cast<V128*>(op), pattern);
 
       if (op + 16 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
+        pattern = V128_Shuffle(pattern, reshuffle_mask);
+        V128_StoreU(reinterpret_cast<V128*>(op + 16), pattern);
       }
       if (op + 32 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
+        pattern = V128_Shuffle(pattern, reshuffle_mask);
+        V128_StoreU(reinterpret_cast<V128*>(op + 32), pattern);
       }
       if (op + 48 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
+        pattern = V128_Shuffle(pattern, reshuffle_mask);
+        V128_StoreU(reinterpret_cast<V128*>(op + 48), pattern);
       }
       return op_limit;
     }
@@ -471,8 +461,8 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     if (SNAPPY_PREDICT_TRUE(op < op_end)) {
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
-      __m128i pattern = pattern_and_reshuffle_mask.first;
-      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
+      V128 pattern = pattern_and_reshuffle_mask.first;
+      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
 
       // This code path is relatively cold however so we save code size
       // by avoiding unrolling and vectorizing.
@@ -483,13 +473,13 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 #pragma clang loop unroll(disable)
 #endif
       do {
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
+        V128_StoreU(reinterpret_cast<V128*>(op), pattern);
+        pattern = V128_Shuffle(pattern, reshuffle_mask);
         op += 16;
       } while (SNAPPY_PREDICT_TRUE(op < op_end));
     }
     return IncrementalCopySlow(op - pattern_size, op, op_limit);
-#else  // !SNAPPY_HAVE_SSSE3
+#else  // !SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
     // If plenty of buffer space remains, expand the pattern to at least 8
     // bytes. The way the following loop is written, we need 8 bytes of buffer
     // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
@@ -506,7 +496,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     } else {
       return IncrementalCopySlow(src, op, op_limit);
     }
-#endif  // SNAPPY_HAVE_SSSE3
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
   }
   assert(pattern_size >= big_pattern_size_lower_bound);
   constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;