Optimize Zippy compression for ARM by 5-10% by choosing csel instructions

PiperOrigin-RevId: 444863689
Authored by Snappy Team on 2022-04-27 15:16:35 +00:00; committed by Victor Costan
parent 8dd58a519f
commit 6a2b78a379
1 changed file with 7 additions and 7 deletions
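
What the change does, in a nutshell: the portable (non-x86) path used to issue a second load, UNALIGNED_LOAD64(s2 + matched_bytes), whose address depends on the just-computed match length. The new code hoists the load of s2 + 4 out of the #ifdef so both candidate words are in flight early, then picks between them with a flag-based select and finishes with a byte-aligned shift. On AArch64 the ternary lowers to a single csel, which is where the commit title's 5-10% comes from. Below is a minimal sketch of the pattern, not the snappy source; Load64, TailOld, and TailNew are hypothetical stand-ins (Load64 plays the role of UNALIGNED_LOAD64):

#include <cstdint>
#include <cstring>

// Stand-in for UNALIGNED_LOAD64: a plain unaligned 8-byte load. The sketch
// assumes little-endian byte order, as the CTZ-based byte math does.
static inline uint64_t Load64(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Old portable path: a reload whose address depends on matched_bytes,
// lengthening the critical dependency chain.
uint64_t TailOld(const char* s2, size_t matched_bytes) {
  return Load64(s2 + matched_bytes);
}

// New portable path: select between the two words already loaded (from s2
// and s2 + 4), then shift down to the first mismatching byte. Only the low
// bytes of the result are meaningful, which appears to be all the caller
// consumes.
uint64_t TailNew(uint64_t a2, uint64_t a3, uint64_t xorval, int shift) {
  a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;  // csel on AArch64
  return a2 >> (shift & (3 * 8));  // round shift down to a byte boundary
}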

snappy-internal.h

@@ -230,8 +230,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       uint64_t xorval = a1 ^ a2;
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
 #ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #else
       // Ideally this would just be
       //
@@ -242,13 +243,12 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       // use a conditional move (it's tuned to cut data dependencies). In this
       // case there is a longer parallel chain anyway AND this will be fairly
       // unpredictable.
-      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
       asm("testl %k2, %k2\n\t"
           "cmovzq %1, %0\n\t"
           : "+r"(a2)
           : "r"(a3), "r"(xorval));
-      *data = a2 >> (shift & (3 * 8));
 #endif
+      *data = a2 >> (shift & (3 * 8));
       return std::pair<size_t, bool>(matched_bytes, true);
     } else {
       matched = 8;
@@ -270,16 +270,16 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       uint64_t xorval = a1 ^ a2;
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
-#ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
-#else
       uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
+#ifndef __x86_64__
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
+#else
       asm("testl %k2, %k2\n\t"
           "cmovzq %1, %0\n\t"
           : "+r"(a2)
           : "r"(a3), "r"(xorval));
-      *data = a2 >> (shift & (3 * 8));
 #endif
+      *data = a2 >> (shift & (3 * 8));
       matched += matched_bytes;
       assert(matched >= 8);
       return std::pair<size_t, bool>(matched, false);
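
Why the x86-64 branch keeps the inline asm instead of the same ternary: per the comment preserved in the diff, Clang treats the select as part of a critical data-dependency chain and refuses to emit cmov for it, so the testl/cmovzq pair pins the conditional move by hand; every other target now gets the plain ternary and, on AArch64, a csel. Below is a hypothetical self-check (not part of the commit) that the select-and-shift agrees with the old dependent reload on the bytes that matter. Load64 is the stand-in defined in the sketch above, and __builtin_ctzll (GCC/Clang only) stands in for Bits::FindLSBSetNonZero64:

#include <cassert>
#include <cstdint>
#include <cstring>

static inline uint64_t Load64(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

int main() {
  // Mismatch at byte 7: the first seven bytes agree, so matched_bytes == 7
  // and, because the low 32 bits of xorval are zero, the select must pick
  // the word loaded at s2 + 4.
  const char s1[] = "abcdefgh12345678";
  const char s2[] = "abcdefgX12345678";
  uint64_t a1 = Load64(s1);
  uint64_t a2 = Load64(s2);
  uint64_t xorval = a1 ^ a2;
  int shift = __builtin_ctzll(xorval);  // Bits::FindLSBSetNonZero64(xorval)
  size_t matched_bytes = shift >> 3;
  assert(matched_bytes == 7);

  uint64_t a3 = Load64(s2 + 4);
  uint64_t sel = (static_cast<uint32_t>(xorval) == 0 ? a3 : a2)
                 >> (shift & (3 * 8));
  uint64_t ref = Load64(s2 + matched_bytes);  // the old dependent load

  // a3 covers s2[4..11]; shifted to start at s2[7], it holds 5 valid low
  // bytes. Those are the ones that must match the dependent reload.
  uint64_t mask = (uint64_t{1} << (5 * 8)) - 1;
  assert((sel & mask) == (ref & mask));
  return 0;
}

Building TailNew from the earlier sketch with clang++ -O2 -S --target=aarch64-linux-gnu should show the csel the commit title is after; the same ternary compiled for x86-64 is exactly the case the inline asm guards against.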