From 9cc3689b2120bd25a74f264f05ed8692c9e22dfe Mon Sep 17 00:00:00 2001 From: Snappy Team Date: Wed, 7 Jul 2021 19:22:28 +0000 Subject: [PATCH] Optimize memset to pure SIMD because compilers generate consistently bad code. clang for ARM and gcc for x86 https://gcc.godbolt.org/z/oxeGG7aEx PiperOrigin-RevId: 383467656 --- snappy-internal.h | 8 ++++++++ snappy.cc | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/snappy-internal.h b/snappy-internal.h index ad2b36a..f1aafa9 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -56,6 +56,9 @@ inline void V128_StoreU(V128* dst, V128 val); // Each packed integer in the shuffle mask must be in [0,16). inline V128 V128_Shuffle(V128 input, V128 shuffle_mask); +// Constructs V128 with 16 chars |c|. +inline V128 V128_DupChar(char c); + #if SNAPPY_HAVE_SSSE3 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } @@ -66,6 +69,9 @@ inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); } inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { return _mm_shuffle_epi8(input, shuffle_mask); } + +inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); } + #else inline V128 V128_Load(const V128* src) { return vld1q_u8(reinterpret_cast<const uint8_t*>(src)); @@ -83,6 +89,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15); return vqtbl1q_u8(input, shuffle_mask); } + +inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); } #endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE diff --git a/snappy.cc b/snappy.cc index 7d0ff71..632ab85 100644 --- a/snappy.cc +++ b/snappy.cc @@ -80,6 +80,7 @@ using internal::V128_Load; using internal::V128_LoadU; using internal::V128_Shuffle; using internal::V128_StoreU; +using internal::V128_DupChar; #endif // We translate the information encoded in a tag through a lookup table to a @@ -308,7 +309,12 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t 
offset) { case 0: return false; case 1: { - std::memset(dst, dst[-1], 64); + // TODO: Ideally we should memset, move back once the + // codegen issues are fixed. + V128 pattern = V128_DupChar(dst[-1]); + for (int i = 0; i < 4; i++) { + V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern); + } return true; } case 2: