assertion failure on darwin_x86_64, have to investigate

PiperOrigin-RevId: 303346402
Snappy Team 2020-03-27 16:35:10 +00:00 committed by Victor Costan
parent 0faf56378e
commit e19178748f
3 changed files with 21 additions and 67 deletions

snappy-internal.h

@@ -89,18 +89,12 @@ char* CompressFragment(const char* input,
 // Does not read *(s1 + (s2_limit - s2)) or beyond.
 // Requires that s2_limit >= s2.
 //
-// In addition populate *data with the next 8 bytes from the end of the match.
-// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
-// that on some arch's this can be done faster in this routine than subsequent
-// loading from s2 + n.
-//
 // Separate implementation for 64-bit, little-endian cpus.
 #if !defined(SNAPPY_IS_BIG_ENDIAN) && \
     (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
-                                                      const char* s2_limit,
-                                                      uint64* data) {
+                                                      const char* s2_limit) {
   assert(s2_limit >= s2);
   size_t matched = 0;
@@ -109,28 +103,12 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
   // uncommon code paths that determine, without extra effort, whether the match
   // length is less than 8. In short, we are hoping to avoid a conditional
   // branch, and perhaps get better code layout from the C++ compiler.
-  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
+  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
     uint64 a1 = UNALIGNED_LOAD64(s1);
     uint64 a2 = UNALIGNED_LOAD64(s2);
-    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
-      uint64 xorval = a1 ^ a2;
-      int shift = Bits::FindLSBSetNonZero64(xorval);
-      size_t matched_bytes = shift >> 3;
-#ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
-#else
-      // Unfortunately the compiler cannot find this using the obvious c++ code
-      // *data = shift == 0 ? a2 : (a2 >> shift) | (a3 << (64 - shift);
-      // the reason is that the above needs the conditional clause to guard
-      // against UB when shift == 0. The compiler doesn't realize the full
-      // expression can be lowered into a single "shrd" instruction and in
-      // effect the conditional can be ignored.
-      uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
-      asm ("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
-      *data = a2;
-#endif
-      assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
-      return std::pair<size_t, bool>(matched_bytes, true);
+    if (a1 != a2) {
+      return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
+                                     true);
     } else {
       matched = 8;
       s2 += 8;
@@ -141,25 +119,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
   // time until we find a 64-bit block that doesn't match; then we find
   // the first non-matching bit and use that to calculate the total
   // length of the match.
-  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
-    uint64 a1 = UNALIGNED_LOAD64(s1 + matched);
-    uint64 a2 = UNALIGNED_LOAD64(s2);
-    if (a1 == a2) {
+  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
+    if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
       s2 += 8;
       matched += 8;
     } else {
-      uint64 xorval = a1 ^ a2;
-      int shift = Bits::FindLSBSetNonZero64(xorval);
-      size_t matched_bytes = shift >> 3;
-#ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
-#else
-      uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
-      asm("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
-      *data = a2;
-#endif
-      assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
-      matched += matched_bytes;
+      uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
+      int matching_bits = Bits::FindLSBSetNonZero64(x);
+      matched += matching_bits >> 3;
       assert(matched >= 8);
       return std::pair<size_t, bool>(matched, false);
     }
@@ -169,9 +136,6 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       ++s2;
       ++matched;
     } else {
-      if (s2 <= s2_limit - 8) {
-        *data = UNALIGNED_LOAD64(s2);
-      }
       return std::pair<size_t, bool>(matched, matched < 8);
     }
   }
@@ -180,8 +144,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
 #else
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
-                                                      const char* s2_limit,
-                                                      uint64* data) {
+                                                      const char* s2_limit) {
   // Implementation based on the x86-64 version, above.
   assert(s2_limit >= s2);
   int matched = 0;
@@ -201,7 +164,6 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       ++matched;
     }
   }
-  *data = UNALIGNED_LOAD64(s2 + matched);
   return std::pair<size_t, bool>(matched, matched < 8);
 }
 #endif
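Editor's note on the rolled-back x86-64 fast path above: the inline `shrdq` computed the 8 bytes following the match end from the two 8-byte words already in registers. The equivalent C++ needs a guard against shifting by 64 (undefined behavior), and, per the removed comment, compilers fail to fold the guarded form back into a single `shrd`. A minimal stand-alone sketch of the same computation (my illustration, not code from the commit):

    #include <cstdint>

    // Portable version of the removed asm: extract 8 bytes of the 16-byte
    // window a3:a2 starting at byte offset shift/8, where shift is the bit
    // index of the first mismatch (as from FindLSBSetNonZero64).
    inline uint64_t BytesAfterMatch(uint64_t a2, uint64_t a3, int shift) {
      shift &= -8;  // round down to a byte boundary, like the asm's "shift & -8"
      // The shift == 0 guard avoids the undefined behavior of a3 << 64.
      return shift == 0 ? a2 : (a2 >> shift) | (a3 << (64 - shift));
    }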

snappy.cc

@@ -385,18 +385,11 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
   assert(len >= 4);
   assert(offset < 65536);
   assert(len_less_than_12 == (len < 12));
-  if (len_less_than_12) {
-    uint32 u = (len << 2) + (offset << 8);
-    uint32 copy1 = COPY_1_BYTE_OFFSET - (4 << 2) + ((offset >> 3) & 0xe0);
-    uint32 copy2 = COPY_2_BYTE_OFFSET - (1 << 2);
-    // It turns out that offset < 2048 is a difficult to predict branch.
-    // `perf record` shows this is the highest percentage of branch misses in
-    // benchmarks. This code produces branch free code, the data dependency
-    // chain that bottlenecks the throughput is so long that a few extra
-    // instructions are completely free (IPC << 6 because of data deps).
-    u += offset < 2048 ? copy1 : copy2;
-    LittleEndian::Store32(op, u);
-    op += offset < 2048 ? 2 : 3;
+  if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
+    // offset fits in 11 bits. The 3 highest go in the top of the first byte,
+    // and the rest go in the second byte.
+    *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
+    *op++ = offset & 0xff;
   } else {
     // Write 4 bytes, though we only care about 3 of them. The output buffer
     // is required to have some slack, so the extra byte won't overrun it.
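For reference, the restored branch emits the Snappy "copy with 1-byte offset" element: two output bytes covering lengths 4 through 11 and offsets below 2048. A self-contained sketch of that encoding (illustrative, with COPY_1_BYTE_OFFSET spelled out as its value 1):

    #include <cstdint>

    // Tag layout: bits 0-1 = 01 (copy, 1-byte offset), bits 2-4 = len - 4,
    // bits 5-7 = offset bits 8-10; the second byte holds offset bits 0-7.
    inline char* EmitCopy1ByteOffset(char* op, uint32_t offset, uint32_t len) {
      // Caller guarantees 4 <= len < 12 and offset < 2048.
      *op++ = static_cast<char>(1 + ((len - 4) << 2) + ((offset >> 3) & 0xe0));
      *op++ = static_cast<char>(offset & 0xff);
      return op;
    }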
@@ -622,7 +615,7 @@ char* CompressFragment(const char* input,
         // "literal bytes" prior to ip.
         const char* base = ip;
         std::pair<size_t, bool> p =
-            FindMatchLength(candidate + 4, ip + 4, ip_end, &data);
+            FindMatchLength(candidate + 4, ip + 4, ip_end);
         size_t matched = 4 + p.first;
         ip += matched;
         size_t offset = base - candidate;
@@ -636,12 +629,12 @@ char* CompressFragment(const char* input,
         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
         }
-        assert(LittleEndian::Load64(ip) == data);
         // We are now looking for a 4-byte match again. We read
         // table[Hash(ip, shift)] for that. To improve compression,
         // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
-        table[HashBytes(LittleEndian::Load32(ip - 1), shift)] =
-            ip - base_ip - 1;
+        data = LittleEndian::Load64(ip - 1);
+        table[HashBytes(data, shift)] = ip - base_ip - 1;
+        data >>= 8;
         uint32 hash = HashBytes(data, shift);
         candidate = base_ip + table[hash];
         table[hash] = ip - base_ip;
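The restored table update reuses one 8-byte load for two hash keys: HashBytes consumes only the low 32 bits of its argument, so on a little-endian machine `Load64(ip - 1) >> 8` has the same low bits as `Load32(ip)`. A runnable check of that identity (stand-alone sketch; Load64LE/Load32LE are my stand-ins for LittleEndian::Load64/Load32):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Assumes a little-endian host, as the surrounding #if guard does.
    static uint64_t Load64LE(const char* p) {
      uint64_t v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }
    static uint32_t Load32LE(const char* p) {
      uint32_t v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }

    int main() {
      const char buf[] = "0123456789abcdef";
      const char* ip = buf + 1;
      uint64_t data = Load64LE(ip - 1);  // bytes ip-1 .. ip+6
      data >>= 8;                        // low 32 bits now hold bytes ip .. ip+3
      assert(static_cast<uint32_t>(data) == Load32LE(ip));
      return 0;
    }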

snappy_unittest.cc

@@ -957,9 +957,8 @@ TEST(Snappy, ZeroOffsetCopyValidation) {
 namespace {

 int TestFindMatchLength(const char* s1, const char *s2, unsigned length) {
-  uint64 data;
   std::pair<size_t, bool> p =
-      snappy::internal::FindMatchLength(s1, s2, s2 + length, &data);
+      snappy::internal::FindMatchLength(s1, s2, s2 + length);
   CHECK_EQ(p.first < 8, p.second);
   return p.first;
 }
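The helper above returns only the match length, after CHECK_EQ verifies that the pair's bool agrees with `first < 8`. Typical assertions built on it look like the following (illustrative values consistent with the function's contract, not copied from the test file):

    CHECK_EQ(0, TestFindMatchLength("abcd", "efgh", 4));  // no common prefix
    CHECK_EQ(2, TestFindMatchLength("abcd", "abef", 4));  // "ab" matches
    CHECK_EQ(4, TestFindMatchLength("abcd", "abcd", 4));  // full 4-byte match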