Make zippy decompression branchless

PiperOrigin-RevId: 342423961
Snappy Team 2020-11-14 15:27:36 +00:00 committed by Victor Costan
parent 3bfa265a04
commit 289c8a3c0a
2 changed files with 210 additions and 6 deletions

snappy-internal.h

@@ -274,7 +274,8 @@ static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.
 // because of efficiency reasons:
 //  (1) Extracting a byte is faster than a bit-field
 //  (2) It properly aligns copy offset so we do not need a <<8
-static const uint16_t char_table[256] = {
+static constexpr uint16_t char_table[256] = {
+    // clang-format off
     0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
     0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
     0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
@@ -306,7 +307,8 @@ static const uint16_t char_table[256] = {
     0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
     0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
     0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
-    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
+    // clang-format on
 };
 }  // end namespace internal

snappy.cc

@@ -86,6 +86,45 @@ using internal::COPY_4_BYTE_OFFSET;
 using internal::kMaximumTagLength;
 using internal::LITERAL;
 
+// We translate the information encoded in a tag through a lookup table to a
+// format that requires fewer instructions to decode.
+// The returned format encodes the offset in the high byte and the length
+// in the low byte. Because a length is never 0, we use zero as an indicator
+// for an exceptional value (a copy-4 tag or a literal > 60 bytes).
+constexpr size_t kLiteralOffset = 256;
+
+inline constexpr uint16_t OffsetAndLength(uint8_t tag) {
+  switch (tag & 3) {
+    case 0: {
+      if (tag >= 60 * 4) {
+        return 0;  // Literals longer than 60 bytes are handled in the fallback.
+      }
+      int len = (tag >> 2) + 1;
+      // We include a spurious offset for literals; see the decompression loop.
+      return len | kLiteralOffset;
+    }
+    case 1: {
+      int len = ((tag >> 2) & 7) + 4;
+      int off = tag >> 5;
+      return len | (off << 8);
+    }
+    case 2: {
+      int len = (tag >> 2) + 1;
+      return len;
+    }
+    default:
+      return 0;  // Copy-4 tags (tag type 3) are handled in the fallback.
+  }
+}
+
+inline constexpr std::array<uint16_t, 256> OffsetAndLengthTable() {
+  std::array<uint16_t, 256> arr{};
+  for (int i = 0; i < 256; i++) arr[i] = OffsetAndLength(i);
+  return arr;
+}
+
+alignas(64) const std::array<uint16_t, 256> offset_and_length_table =
+    OffsetAndLengthTable();
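
To pin down the packed format concretely, a couple of compile-time checks (illustrative only, not part of this commit) can be written against the builder above:

    // Tag 0xED = 0b111'011'01 is a copy-1 tag: length ((0xED >> 2) & 7) + 4 = 7
    // and high offset bits 0xED >> 5 = 7, so the entry packs to 0x0707.
    static_assert(OffsetAndLength(0xED) == 0x0707, "copy-1 entry");
    // Tag 0x10 is a short literal of length (0x10 >> 2) + 1 = 5; its entry is
    // the length plus the spurious kLiteralOffset marker in the high byte.
    static_assert(OffsetAndLength(0x10) == (5 | kLiteralOffset), "literal entry");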
 // Any hash function will produce a valid compressed bitstream, but a good
 // hash function reduces the number of collisions and thus yields better
 // compression for compressible input, and more speed for incompressible
@@ -182,7 +221,7 @@ const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
 #endif  // SNAPPY_HAVE_SSSE3
 
-// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
+// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
 inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
@@ -761,6 +800,124 @@ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
   return (value & masks[shift]) != 0;
 }
 
+// Core decompression loop, when there is enough data available.
+// Decompresses the input buffer [ip, ip_limit) into the output buffer
+// [op, op_limit_min_slop). It returns when it gets too close to the end of
+// the input buffer, when it exceeds op_limit_min_slop, or when an exceptional
+// tag is encountered (a literal of length > 60 or a copy-4 tag).
+// Returns {ip, op} at the point it stopped decoding.
+// TODO This function probably does not need to be inlined, as it
+// should decode large chunks at a time. This allows runtime dispatch to
+// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
+std::pair<const uint8_t*, char*> DecompressBranchless(
+    const uint8_t* ip, const uint8_t* ip_limit, char* op_ptr, char* op_base,
+    char* op_limit_min_slop_ptr) {
+  constexpr size_t kSlopBytes = 64;
+  std::ptrdiff_t op = op_ptr - op_base;
+  std::ptrdiff_t op_limit_min_slop = op_limit_min_slop_ptr - op_base;
+  std::ptrdiff_t op_limit = op_limit_min_slop + kSlopBytes - 1;
+  if (kSlopBytes < ip_limit - ip && op < op_limit_min_slop) {
+    const uint8_t* const ip_limit_min_slop = ip_limit - kSlopBytes + 1;
+    ip++;
+    // ip points just past the tag, and each iteration touches at most
+    // kSlopBytes.
+    do {
+      const uint8_t* old_ip = ip;
+      const uint64_t tag = ip[-1];
+      uint32_t offset_and_length = offset_and_length_table[tag];
+      if (SNAPPY_PREDICT_FALSE(offset_and_length == 0)) {
+        // Exceptional case (long literal or copy-4).
+        if ((tag & 3) == 3) {
+        break_loop:
+          ip = old_ip;
+          break;
+        }
+        assert((tag & 3) == 0);  // This is a literal...
+        assert(tag >= 60 * 4);   // ...and it's a long one.
+        uint32_t next = LittleEndian::Load32(ip);
+        uint32_t length_bytes = (tag >> 2) - 59;
+        uint32_t literal_len = ExtractLowBytes(next, length_bytes) + 1;
+        ip += length_bytes;
+        // Note we use >= instead of >: after the copy we must also increment
+        // ip so that it is left pointing just past the next tag.
+        if (literal_len >= ip_limit - ip) goto break_loop;
+        if (literal_len > op_limit - op) goto break_loop;
+        memcpy(op_base + op, ip, literal_len);
+        ip += literal_len;
+        op += literal_len;
+        assert(ip < ip_limit);  // Guaranteed by the test above.
+        ip++;
+        continue;
+      }
+      size_t tag_type;
+      {
+        // This section is crucial for the throughput of the decompression
+        // loop. The latency of an iteration is fundamentally constrained by
+        // the following data chain on ip:
+        //   ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
+        //                         ip2 = ip + 2 + (c >> 2)
+        // This amounts to 8 cycles:
+        //   5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
+        size_t literal_len = tag >> 2;
+#if defined(__GNUC__) && defined(__x86_64__)
+        // TODO
+        // clang misses the fact that the (c & 3) already correctly sets
+        // the zero flag.
+        tag_type = tag;
+        bool is_literal;
+        asm("and $3, %0\n\t" : "+r"(tag_type), "=@ccz"(is_literal));
+        bool is_copy = !is_literal;
+#else
+        tag_type = tag & 3;
+        bool is_copy = (tag_type != 0);
+#endif
+        const uint8_t* ip_copy = ip + 1 + tag_type;
+        const uint8_t* ip_literal = ip + 2 + literal_len;
+        ip = is_copy ? ip_copy : ip_literal;
+      }
+      uint32_t next = LittleEndian::Load32(old_ip);
+      // For literals tag_type = 0, hence we will always obtain 0 from
+      // ExtractLowBytes; the offset for literals is thus always
+      // kLiteralOffset.
+      std::ptrdiff_t offset =
+          (offset_and_length & 0x700) + ExtractLowBytes(next, tag_type);
+      size_t len = offset_and_length & 0xFF;
+      std::ptrdiff_t delta = op - offset;
+      if (SNAPPY_PREDICT_FALSE(delta < 0)) {
+        if (tag_type != 0) goto break_loop;
+        std::memcpy(op_base + op, old_ip, 64);
+        op += len;
+        continue;
+      }
+      // Because literals were assigned the offset kLiteralOffset (256) and
+      // their length is at most 60, this test never fires for literals; only
+      // genuinely overlapping copies take the IncrementalCopy path.
+      if (SNAPPY_PREDICT_FALSE(offset < len)) {
+        assert(tag_type != 0);
+        op = IncrementalCopy(op_base + delta, op_base + op, op_base + op + len,
+                             op_base + op_limit) -
+             op_base;
+        continue;
+      }
+      const uint8_t* from =
+          tag_type ? reinterpret_cast<const uint8_t*>(op_base + delta) : old_ip;
+      // For literals we need to copy from the input (old_ip, which already
+      // points just past the tag) instead of from the output stream; this
+      // compensates for the spurious offset.
+      std::memmove(op_base + op, from, 64);
+      op += len;
+    } while (ip < ip_limit_min_slop && op < op_limit_min_slop);
+    ip--;  // Undo the preemptive increment done before the loop.
+    assert(ip <= ip_limit);
+  }
+  return {ip, op_base + op};
+}
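+
+// Overload for writers whose output position is not a char* into a flat
+// buffer (e.g. the decompression validator below, which only counts bytes).
+// It returns its inputs unchanged, so the caller falls through to the
+// regular tag-by-tag loop.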
+template <typename T>
+std::pair<const uint8_t*, T> DecompressBranchless(const uint8_t* ip,
+                                                  const uint8_t*, T op, char*,
+                                                  char*) {
+  return {ip, op};
+}
+
 // Helper class for decompression
 class SnappyDecompressor {
  private:
@@ -853,6 +1010,19 @@ class SnappyDecompressor {
     uint32_t preload;
     MAYBE_REFILL();
     for (;;) {
+      {
+        // Run the branchless bulk loop first; it returns when it hits an
+        // exceptional tag or runs low on input or output slop.
+        char* op_limit_min_slop;
+        auto op_base = writer->GetBase(&op_limit_min_slop);
+        if (op_base) {
+          auto res =
+              DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),
+                                   reinterpret_cast<const uint8_t*>(ip_limit_),
+                                   op, op_base, op_limit_min_slop);
+          ip = reinterpret_cast<const char*>(res.first);
+          op = res.second;
+          MAYBE_REFILL();
+        }
+      }
       const uint8_t c = static_cast<uint8_t>(preload);
       ip++;
@@ -912,7 +1082,7 @@ class SnappyDecompressor {
           if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
         } else {
-          const uint32_t entry = char_table[c];
+          const uint32_t entry = offset_and_length_table[c];
           preload = LittleEndian::Load32(ip);
           const uint32_t trailer = ExtractLowBytes(preload, c & 3);
           const uint32_t length = entry & 0xff;
@@ -938,6 +1108,23 @@ class SnappyDecompressor {
   }
 };
 
+constexpr uint32_t CalculateNeeded(uint8_t tag) {
+  uint32_t needed = (0x05030201 >> ((tag * 8) & 31)) & 0xFF;
+  if ((tag & 3) == 0 && tag >= (60 * 4)) needed = (tag >> 2) - 58;
+  return needed;
+}
+
+constexpr bool VerifyCalculateNeeded() {
+  for (int i = 0; i < 256; i++) {
+    if (CalculateNeeded(i) != (char_table[i] >> 11) + 1) return false;
+  }
+  return true;
+}
+
+// Make sure CalculateNeeded is correct by verifying it against char_table,
+// the established table encoding the number of extra bytes needed per tag.
+static_assert(VerifyCalculateNeeded(), "");
+
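
The magic constant in CalculateNeeded is a packed table of byte counts: (tag * 8) & 31 selects bit offset 0, 8, 16, or 24 according to tag & 3, so the shift extracts one byte of 0x05030201. A few compile-time checks (illustrative only, not part of this commit) make the trick concrete:

    // Tag byte plus 0, 1, 2 or 4 offset bytes for the four tag types,
    // except long literals, which carry 1...4 extra length bytes instead.
    static_assert(CalculateNeeded(0x01) == 2, "copy-1: tag + 1 offset byte");
    static_assert(CalculateNeeded(0x02) == 3, "copy-2: tag + 2 offset bytes");
    static_assert(CalculateNeeded(0x03) == 5, "copy-4: tag + 4 offset bytes");
    static_assert(CalculateNeeded(0xF0) == 2, "60-literal: tag + 1 length byte");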
 bool SnappyDecompressor::RefillTag() {
   const char* ip = ip_;
   if (ip == ip_limit_) {
@@ -954,8 +1141,13 @@ bool SnappyDecompressor::RefillTag() {
   // Read the tag character
   assert(ip < ip_limit_);
   const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
-  const uint32_t entry = char_table[c];
-  const uint32_t needed = (entry >> 11) + 1;  // +1 byte for 'c'
+  // At this point make sure that the data for the next tag is consecutive:
+  // for copy 1 this means the next 2 bytes (tag and 1 offset byte),
+  // for copy 2 the next 3 bytes (tag and 2 offset bytes),
+  // for copy 4 the next 5 bytes (tag and 4 offset bytes).
+  // For all small literals we only need 1 byte, but for literals 60...63 the
+  // length is encoded in 1...4 extra bytes.
+  const uint32_t needed = CalculateNeeded(c);
   assert(needed <= sizeof(scratch_));
 
   // Read more bytes from reader if needed
@@ -1160,6 +1352,7 @@ class SnappyIOVecWriter {
   }
 
   char* GetOutputPtr() { return nullptr; }
+  char* GetBase(char** op_limit_min_slop) { return nullptr; }
   void SetOutputPtr(char* op) {
     // TODO: Switch to [[maybe_unused]] when we can assume C++17.
     (void)op;
@@ -1323,6 +1516,10 @@ class SnappyArrayWriter {
   inline bool CheckLength() const { return op_ == op_limit_; }
 
   char* GetOutputPtr() { return op_; }
+  char* GetBase(char** op_limit_min_slop) {
+    *op_limit_min_slop = op_limit_min_slop_;
+    return base_;
+  }
   void SetOutputPtr(char* op) { op_ = op; }
 
   inline bool Append(const char* ip, size_t len, char** op_p) {
@@ -1412,6 +1609,7 @@ class SnappyDecompressionValidator {
   inline SnappyDecompressionValidator() : expected_(0), produced_(0) {}
   inline void SetExpectedLength(size_t len) { expected_ = len; }
   size_t GetOutputPtr() { return produced_; }
+  char* GetBase(char** op_limit_min_slop) { return nullptr; }
   void SetOutputPtr(size_t op) { produced_ = op; }
   inline bool CheckLength() const { return expected_ == produced_; }
   inline bool Append(const char* ip, size_t len, size_t* produced) {
@@ -1516,6 +1714,10 @@ class SnappyScatteredWriter {
         op_limit_(NULL),
         op_limit_min_slop_(NULL) {}
   char* GetOutputPtr() { return op_ptr_; }
+  char* GetBase(char** op_limit_min_slop) {
+    *op_limit_min_slop = op_limit_min_slop_;
+    return op_base_;
+  }
   void SetOutputPtr(char* op) { op_ptr_ = op; }
   inline void SetExpectedLength(size_t len) {