mirror of https://github.com/google/snappy.git
Replace the 64-byte memset with explicit SIMD stores because compilers consistently generate bad code for it (observed with clang for ARM and gcc for x86): https://gcc.godbolt.org/z/oxeGG7aEx
PiperOrigin-RevId: 383467656
This commit is contained in:
parent
b4888f7616
commit
9cc3689b21
|
@ -56,6 +56,9 @@ inline void V128_StoreU(V128* dst, V128 val);
|
|||
// Each packed integer in the shuffle mask must be in [0,16).
|
||||
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
|
||||
|
||||
// Constructs V128 with 16 chars |c|.
|
||||
inline V128 V128_DupChar(char c);
|
||||
|
||||
#if SNAPPY_HAVE_SSSE3
|
||||
// SSSE3 path: loads one V128 from |src| with _mm_load_si128, the aligned-load
// intrinsic, so |src| must be 16-byte aligned.
inline V128 V128_Load(const V128* src) {
  const V128 loaded = _mm_load_si128(src);
  return loaded;
}
|
||||
|
||||
|
@ -66,6 +69,9 @@ inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
|
|||
// SSSE3 path: rearranges the bytes of |input| as selected by |shuffle_mask|
// by forwarding both operands to _mm_shuffle_epi8.
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
|
||||
return _mm_shuffle_epi8(input, shuffle_mask);
|
||||
}
|
||||
|
||||
// SSSE3 path: builds a V128 whose 16 byte lanes all hold |c|.
inline V128 V128_DupChar(char c) {
  const V128 splat = _mm_set1_epi8(c);
  return splat;
}
|
||||
|
||||
#else
|
||||
inline V128 V128_Load(const V128* src) {
|
||||
return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
|
||||
|
@ -83,6 +89,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
|
|||
assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
|
||||
return vqtbl1q_u8(input, shuffle_mask);
|
||||
}
|
||||
|
||||
// NEON path: builds a V128 whose 16 byte lanes all hold |c|.
inline V128 V128_DupChar(char c) {
  // vdupq_n_u8 takes an unsigned byte; spell out the char -> uint8_t
  // conversion that the call would otherwise perform implicitly.
  const V128 splat = vdupq_n_u8(static_cast<uint8_t>(c));
  return splat;
}
|
||||
#endif
|
||||
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
|
||||
|
||||
|
|
|
@ -80,6 +80,7 @@ using internal::V128_Load;
|
|||
using internal::V128_LoadU;
|
||||
using internal::V128_Shuffle;
|
||||
using internal::V128_StoreU;
|
||||
using internal::V128_DupChar;
|
||||
#endif
|
||||
|
||||
// We translate the information encoded in a tag through a lookup table to a
|
||||
|
@ -308,7 +309,12 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
|
|||
case 0:
|
||||
return false;
|
||||
case 1: {
|
||||
std::memset(dst, dst[-1], 64);
|
||||
// TODO: Ideally we should memset, move back once the
|
||||
// codegen issues are fixed.
|
||||
V128 pattern = V128_DupChar(dst[-1]);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case 2:
|
||||
|
|
Loading…
Reference in New Issue