diff --git a/snappy.cc b/snappy.cc
index 73d987e..5bca64f 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -76,6 +76,9 @@ namespace snappy {
 
+// The amount of slop bytes writers are using for unconditional copies.
+constexpr int kSlopBytes = 64;
+
 using internal::char_table;
 using internal::COPY_1_BYTE_OFFSET;
 using internal::COPY_2_BYTE_OFFSET;
@@ -688,12 +691,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //
 //   // Called before decompression
 //   void SetExpectedLength(size_t length);
 //
+//   // For performance a writer may choose to donate the cursor variable to the
+//   // decompression function. The decompression will inject it in all its
+//   // function calls to the writer. Keeping the important output cursor as a
+//   // function local stack variable allows the compiler to keep it in
+//   // register, which greatly aids performance by avoiding loads and stores of
+//   // this variable in the fast path loop iterations.
+//   T GetOutputPtr() const;
+//
+//   // At end of decompression the loop donates the ownership of the cursor
+//   // variable back to the writer by calling this function.
+//   void SetOutputPtr(T op);
+//
 //   // Called after decompression
 //   bool CheckLength() const;
 //
 //   // Called repeatedly during decompression
-//   bool Append(const char* ip, size_t length);
-//   bool AppendFromSelf(uint32 offset, size_t length);
+//   // Each function gets a pointer to the op (output pointer) that the writer
+//   // can use and update. Note it's important that these functions get fully
+//   // inlined so that no actual address of the local variable needs to be
+//   // taken.
+//   bool Append(const char* ip, size_t length, T* op);
+//   bool AppendFromSelf(uint32 offset, size_t length, T* op);
 //
 //   // The rules for how TryFastAppend differs from Append are somewhat
 //   // convoluted:
@@ -715,7 +734,7 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   //   as it is unlikely that one would implement a fast path accepting
 //   //   this much data.
 //   //
-//   bool TryFastAppend(const char* ip, size_t available, size_t length);
+//   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);
 // };
 
 static inline uint32 ExtractLowBytes(uint32 v, int n) {
@@ -747,6 +766,9 @@ class SnappyDecompressor {
   Source* reader_;        // Underlying source of bytes to decompress
   const char* ip_;        // Points to next buffered byte
   const char* ip_limit_;  // Points just past buffered bytes
+  // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaximumTagLength from
+  // the buffer.
+  const char* ip_limit_min_maxtaglen_;
   uint32 peeked_;         // Bytes peeked from reader (need to skip)
   bool eof_;              // Hit end of input without an error?
   char scratch_[kMaximumTagLength];  // See RefillTag().
@@ -758,6 +780,11 @@ class SnappyDecompressor {
   // Returns true on success, false on error or end of input.
   bool RefillTag();
 
+  void ResetLimit(const char* ip) {
+    ip_limit_min_maxtaglen_ =
+        ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);
+  }
+
  public:
   explicit SnappyDecompressor(Source* reader)
       : reader_(reader),
@@ -810,37 +837,29 @@ class SnappyDecompressor {
   __attribute__((aligned(32)))
 #endif
   void DecompressAllTags(Writer* writer) {
-    // In x86, pad the function body to start 16 bytes later. This function has
-    // a couple of hotspots that are highly sensitive to alignment: we have
-    // observed regressions by more than 20% in some metrics just by moving the
-    // exact same code to a different position in the benchmark binary.
-    //
-    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
-    // the "lucky" case consistently. Unfortunately, this is a very brittle
-    // workaround, and future differences in code generation may reintroduce
-    // this regression. If you experience a big, difficult to explain, benchmark
-    // performance regression here, first try removing this hack.
-#if defined(__GNUC__) && defined(__x86_64__)
-    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
-    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
-    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
-#endif
     const char* ip = ip_;
+    ResetLimit(ip);
+    auto op = writer->GetOutputPtr();
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the expression based on the local
     // context, which overall increases speed.
-    #define MAYBE_REFILL() \
-      if (ip_limit_ - ip < kMaximumTagLength) { \
-        ip_ = ip; \
-        if (!RefillTag()) return; \
-        ip = ip_; \
-      }
+#define MAYBE_REFILL()                                        \
+  if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) {  \
+    ip_ = ip;                                                 \
+    if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit;        \
+    ip = ip_;                                                 \
+    ResetLimit(ip);                                           \
+  }                                                           \
+  preload = static_cast<uint8>(*ip)
+
+    // At the start of the for loop below the least significant byte of preload
+    // contains the tag.
+    uint32 preload;
     MAYBE_REFILL();
     for ( ;; ) {
-      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+      const uint8 c = static_cast<uint8>(preload);
+      ip++;
 
       // Ratio of iterations that have LITERAL vs non-LITERAL for different
       // inputs.
@@ -856,12 +875,13 @@ class SnappyDecompressor {
       // bin            24%       76%
       if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
         size_t literal_length = (c >> 2) + 1u;
-        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) {
           assert(literal_length < 61);
           ip += literal_length;
           // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
           // will not return true unless there's already at least five spare
           // bytes in addition to the literal.
+          preload = static_cast<uint8>(*ip);
           continue;
         }
         if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
@@ -875,62 +895,51 @@ class SnappyDecompressor {
 
         size_t avail = ip_limit_ - ip;
         while (avail < literal_length) {
-          if (!writer->Append(ip, avail)) return;
+          if (!writer->Append(ip, avail, &op)) goto exit;
           literal_length -= avail;
           reader_->Skip(peeked_);
           size_t n;
           ip = reader_->Peek(&n);
           avail = n;
           peeked_ = avail;
-          if (avail == 0) return;  // Premature end of input
+          if (avail == 0) goto exit;
           ip_limit_ = ip + avail;
+          ResetLimit(ip);
         }
-        if (!writer->Append(ip, literal_length)) {
-          return;
-        }
+        if (!writer->Append(ip, literal_length, &op)) goto exit;
         ip += literal_length;
         MAYBE_REFILL();
       } else {
-        // If we ignore branch prediction misses, then the throughput of the
-        // loop over tags is limited by the data dependency chain carried by
-        // ip. The amount ip advances per iteration depends on the value c.
-        // For COPY_1 it's 2 bytes, for COPY_2 it's 3 bytes and for COPY_4 it's
-        // 5 bytes. We could be branch free and use information from the entry
-        // in char_table to advance ip, but that would lead to the following
-        // critical data dependency chain
-        //     ip -> c = *ip++ -> entry = char_table[c] ->
-        //     advance = entry >> 11 -> ip += advance
-        // which is 5 + 5 + 1 + 1 = 12 cycles of latency. Meaning the processor
-        // will never be able to execute the loop faster then 12 cycles per
-        // iteration. It's better to speculate it's not a COPY_4, which is an
-        // uncommon tag, in which case we get the data dependency chain
-        //     ip -> c = *ip++ -> c &= 3 -> ip += c
-        // which is 5 + 1 + 1 = 7 cycles of latency.
         if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) {
           const size_t copy_offset = LittleEndian::Load32(ip);
           const size_t length = (c >> 2) + 1;
           ip += 4;
-          if (!writer->AppendFromSelf(copy_offset, length)) return;
+
+          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
         } else {
-          const size_t entry = char_table[c];
-          const size_t trailer =
-              ExtractLowBytes(LittleEndian::Load32(ip), c & 3);
-          const size_t length = entry & 0xff;
-          ip += (c & 3);
+          const uint32 entry = char_table[c];
+          preload = LittleEndian::Load32(ip);
+          const uint32 trailer = ExtractLowBytes(preload, c & 3);
+          const uint32 length = entry & 0xff;
 
           // copy_offset/256 is encoded in bits 8..10.  By just fetching
           // those bits, we get copy_offset (since the bit-field starts at
           // bit 8).
-          const size_t copy_offset = entry & 0x700;
-          if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
-            return;
-          }
+          const uint32 copy_offset = (entry & 0x700) + trailer;
+          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
+
+          ip += (c & 3);
+          // By using the result of the previous load we reduce the critical
+          // dependency chain of ip to 4 cycles.
+          preload >>= (c & 3) * 8;
+          if (ip < ip_limit_min_maxtaglen_) continue;
         }
         MAYBE_REFILL();
       }
     }
-    #undef MAYBE_REFILL
+#undef MAYBE_REFILL
+  exit:
+    writer->SetOutputPtr(op);
   }
 };
@@ -1151,7 +1160,7 @@ class SnappyIOVecWriter {
     return total_written_ == output_limit_;
   }
 
-  inline bool Append(const char* ip, size_t len) {
+  inline bool Append(const char* ip, size_t len, char**) {
     if (total_written_ + len > output_limit_) {
       return false;
     }
@@ -1159,6 +1168,9 @@ class SnappyIOVecWriter {
     return AppendNoCheck(ip, len);
   }
 
+  char* GetOutputPtr() { return nullptr; }
+  void SetOutputPtr(char* op) {}
+
   inline bool AppendNoCheck(const char* ip, size_t len) {
     while (len > 0) {
       if (curr_iov_remaining_ == 0) {
@@ -1183,7 +1195,8 @@ class SnappyIOVecWriter {
     return true;
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
+                            char**) {
     const size_t space_left = output_limit_ - total_written_;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
         curr_iov_remaining_ >= 16) {
@@ -1198,7 +1211,7 @@ class SnappyIOVecWriter {
     return false;
   }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
+  inline bool AppendFromSelf(size_t offset, size_t len, char**) {
     // See SnappyArrayWriter::AppendFromSelf for an explanation of
     // the "offset - 1u" trick.
     if (offset - 1u >= total_written_) {
@@ -1296,59 +1309,68 @@ class SnappyArrayWriter {
   char* base_;
   char* op_;
   char* op_limit_;
+  // If op < op_limit_min_slop_ then it's safe to unconditionally write
+  // kSlopBytes starting at op.
+  char* op_limit_min_slop_;
 
  public:
   inline explicit SnappyArrayWriter(char* dst)
       : base_(dst),
         op_(dst),
-        op_limit_(dst) {
-  }
+        op_limit_(dst),
+        op_limit_min_slop_(dst) {}  // Safe default, see invariant.
 
   inline void SetExpectedLength(size_t len) {
     op_limit_ = op_ + len;
+    // Prevent the pointer from ending up past the buffer.
+    op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len);
   }
 
   inline bool CheckLength() const {
     return op_ == op_limit_;
   }
 
-  inline bool Append(const char* ip, size_t len) {
-    char* op = op_;
+  char* GetOutputPtr() { return op_; }
+  void SetOutputPtr(char* op) { op_ = op; }
+
+  inline bool Append(const char* ip, size_t len, char** op_p) {
+    char* op = *op_p;
     const size_t space_left = op_limit_ - op;
-    if (space_left < len) {
-      return false;
-    }
+    if (space_left < len) return false;
     memcpy(op, ip, len);
-    op_ = op + len;
+    *op_p = op + len;
     return true;
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
-    char* op = op_;
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
+                            char** op_p) {
+    char* op = *op_p;
     const size_t space_left = op_limit_ - op;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
       UnalignedCopy128(ip, op);
-      op_ = op + len;
+      *op_p = op + len;
       return true;
     } else {
       return false;
    }
   }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* const op_end = op_ + len;
+  ABSL_ATTRIBUTE_ALWAYS_INLINE
+  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    char* const op = *op_p;
+    char* const op_end = op + len;
 
     // Check if we try to append from before the start of the buffer.
-    // Normally this would just be a check for "produced < offset",
-    // but "produced <= offset - 1u" is equivalent for every case
-    // except the one where offset==0, where the right side will wrap around
-    // to a very big number. This is convenient, as offset==0 is another
-    // invalid case that we also want to catch, so that we do not go
-    // into an infinite loop.
-    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
-    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
-
+    if (SNAPPY_PREDICT_FALSE(op - base_ < offset)) return false;
+    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
+                             op >= op_limit_min_slop_ || offset < len)) {
+      if (op_end > op_limit_ || offset == 0) return false;
+      *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
+      return true;
+    }
+    std::memmove(op, op - offset, kSlopBytes);
+    *op_p = op_end;
     return true;
   }
   inline size_t Produced() const {
@@ -1393,22 +1415,25 @@ class SnappyDecompressionValidator {
   inline void SetExpectedLength(size_t len) {
     expected_ = len;
   }
+  size_t GetOutputPtr() { return produced_; }
+  void SetOutputPtr(size_t op) { produced_ = op; }
   inline bool CheckLength() const {
     return expected_ == produced_;
   }
-  inline bool Append(const char* ip, size_t len) {
-    produced_ += len;
-    return produced_ <= expected_;
+  inline bool Append(const char* ip, size_t len, size_t* produced) {
+    *produced += len;
+    return *produced <= expected_;
   }
-  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
+                            size_t* produced) {
     return false;
   }
-  inline bool AppendFromSelf(size_t offset, size_t len) {
+  inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) {
     // See SnappyArrayWriter::AppendFromSelf for an explanation of
     // the "offset - 1u" trick.
-    if (produced_ <= offset - 1u) return false;
-    produced_ += len;
-    return produced_ <= expected_;
+    if (*produced <= offset - 1u) return false;
+    *produced += len;
+    return *produced <= expected_;
   }
   inline void Flush() {}
 };
@@ -1472,6 +1497,9 @@ class SnappyScatteredWriter {
   char* op_base_;       // Base of output block
   char* op_ptr_;        // Pointer to next unfilled byte in block
   char* op_limit_;      // Pointer just past block
+  // If op < op_limit_min_slop_ then it's safe to unconditionally write
+  // kSlopBytes starting at op.
+  char* op_limit_min_slop_;
 
   inline size_t Size() const {
     return full_size_ + (op_ptr_ - op_base_);
@@ -1488,6 +1516,8 @@ class SnappyScatteredWriter {
         op_ptr_(NULL),
         op_limit_(NULL) {
   }
+  char* GetOutputPtr() { return op_ptr_; }
+  void SetOutputPtr(char* op) { op_ptr_ = op; }
 
   inline void SetExpectedLength(size_t len) {
     assert(blocks_.empty());
@@ -1503,43 +1533,59 @@ class SnappyScatteredWriter {
     return Size();
   }
 
-  inline bool Append(const char* ip, size_t len) {
-    size_t avail = op_limit_ - op_ptr_;
+  inline bool Append(const char* ip, size_t len, char** op_p) {
+    char* op = *op_p;
+    size_t avail = op_limit_ - op;
     if (len <= avail) {
       // Fast path
-      memcpy(op_ptr_, ip, len);
-      op_ptr_ += len;
+      memcpy(op, ip, len);
+      *op_p = op + len;
       return true;
     } else {
-      return SlowAppend(ip, len);
+      op_ptr_ = op;
+      bool res = SlowAppend(ip, len);
+      *op_p = op_ptr_;
+      return res;
     }
   }
 
-  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
-    char* op = op_ptr_;
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
+                            char** op_p) {
+    char* op = *op_p;
     const int space_left = op_limit_ - op;
     if (length <= 16 && available >= 16 + kMaximumTagLength &&
         space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
       UnalignedCopy128(ip, op);
-      op_ptr_ = op + length;
+      *op_p = op + length;
      return true;
    } else {
      return false;
    }
  }
 
-  inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* const op_end = op_ptr_ + len;
-    // See SnappyArrayWriter::AppendFromSelf for an explanation of
-    // the "offset - 1u" trick.
-    if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
-                            op_end <= op_limit_)) {
-      // Fast path: src and dst in current block.
-      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+    char* op = *op_p;
+    // Check if we try to append from before the start of the buffer.
+    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
+                             op - op_base_ < offset ||
+                             op >= op_limit_min_slop_ || offset < len)) {
+      if (offset == 0) return false;
+      char* const op_end = op + len;
+      if (SNAPPY_PREDICT_FALSE(op - op_base_ < offset || op_end > op_limit_)) {
+        op_ptr_ = op;
+        bool res = SlowAppendFromSelf(offset, len);
+        *op_p = op_ptr_;
+        return res;
+      }
+      *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    return SlowAppendFromSelf(offset, len);
+    // Fast path
+    char* const op_end = op + len;
+    std::memmove(op, op - offset, kSlopBytes);
+    *op_p = op_end;
+    return true;
   }
 
   // Called at the end of the decompress. We ask the allocator
@@ -1560,15 +1606,15 @@ bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
       ip += avail;
 
       // Bounds check
-      if (full_size_ + len > expected_) {
-        return false;
-      }
+      if (full_size_ + len > expected_) return false;
 
       // Make new block
       size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
       op_base_ = allocator_.Allocate(bsize);
       op_ptr_ = op_base_;
       op_limit_ = op_base_ + bsize;
+      op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize);
+
       blocks_.push_back(op_base_);
       avail = bsize;
     }
@@ -1593,12 +1639,20 @@ bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
   // nice if we do not rely on that, since we can get better compression if we
   // allow cross-block copies and thus might want to change the compressor in
   // the future.
+  // TODO: Replace this with a properly optimized path. This is not
+  // triggered right now, but it is so slow that it would regress
+  // performance unacceptably if triggered.
   size_t src = cur - offset;
+  char* op = op_ptr_;
   while (len-- > 0) {
     char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
-    Append(&c, 1);
+    if (!Append(&c, 1, &op)) {
+      op_ptr_ = op;
+      return false;
+    }
     src++;
   }
+  op_ptr_ = op;
   return true;
 }
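
The patch's central idea is the "donated cursor": the decompression loop borrows the writer's output pointer via GetOutputPtr(), threads it through every Append/AppendFromSelf/TryFastAppend call as a T*, and hands it back with SetOutputPtr() at the single exit label, so the hot loop can keep the cursor in a register instead of loading and storing a member field. The following is a minimal, self-contained sketch of that pattern under assumed names (ToyWriter and DecodeLoop are hypothetical; they are not the snappy classes above):

// Sketch of the donated-cursor writer pattern; all names are illustrative.
#include <cassert>
#include <cstddef>
#include <cstring>

class ToyWriter {
  char* base_;
  char* op_;       // Authoritative cursor only while the writer owns it.
  char* op_limit_;

 public:
  ToyWriter(char* dst, size_t n) : base_(dst), op_(dst), op_limit_(dst + n) {}

  // Donate the cursor to the decode loop so it can live in a register.
  char* GetOutputPtr() { return op_; }
  // Take ownership of the cursor back at the end of decompression.
  void SetOutputPtr(char* op) { op_ = op; }

  // Operates on *op_p, never on op_; must inline for the pattern to pay off.
  bool Append(const char* ip, size_t len, char** op_p) {
    char* op = *op_p;
    if (static_cast<size_t>(op_limit_ - op) < len) return false;
    std::memcpy(op, ip, len);
    *op_p = op + len;
    return true;
  }

  size_t Produced() const { return op_ - base_; }
};

// The loop keeps `op` as a local; the writer's op_ is stale until SetOutputPtr.
bool DecodeLoop(const char* in, size_t n, ToyWriter* writer) {
  char* op = writer->GetOutputPtr();
  bool ok = true;
  for (size_t i = 0; ok && i < n; i++) {
    ok = writer->Append(in + i, 1, &op);  // Every call threads &op through.
  }
  writer->SetOutputPtr(op);  // Donate the cursor back, even on failure paths.
  return ok;
}

int main() {
  char buf[16];
  ToyWriter w(buf, sizeof(buf));
  assert(DecodeLoop("hello", 5, &w));
  assert(w.Produced() == 5);
}

This mirrors why the patch routes even the error paths through a goto exit: the writer's member cursor is stale while the loop runs, so every way out must pass through SetOutputPtr.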
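
The other recurring device is the kSlopBytes fast path in the AppendFromSelf implementations: when the cursor is below op_limit_min_slop_ (so at least kSlopBytes - 1 bytes of the buffer remain writable) and the back-reference does not self-overlap within the copied range (offset >= len), a single fixed-size memmove of kSlopBytes replaces the length-dependent IncrementalCopy. A standalone sketch under assumed buffer sizes (SlopCopy and the literals are made up; the real guards are the SNAPPY_PREDICT_FALSE conditions above):

// Illustrative sketch of the slop-bytes copy; not the snappy implementation.
#include <cassert>
#include <cstddef>
#include <cstring>

constexpr int kSlopBytes = 64;

// Copies len <= kSlopBytes bytes from op - offset. Requires offset >= len
// (no self-overlap inside the copied range) and kSlopBytes writable bytes
// at op; callers guarantee the latter by checking op < op_limit_min_slop.
char* SlopCopy(char* op, size_t offset, size_t len) {
  assert(offset >= len && len <= static_cast<size_t>(kSlopBytes));
  // Move a full kSlopBytes block unconditionally: the size is a compile-time
  // constant, so the compiler emits a few wide loads/stores with no
  // length-dependent branching. Bytes past len land in the slop region and
  // are either overwritten by later appends or ignored.
  std::memmove(op, op - offset, kSlopBytes);
  return op + len;
}

int main() {
  char buf[128] = "0123456789";
  char* op = buf + 10;
  char* op_limit = buf + sizeof(buf);
  char* op_limit_min_slop = op_limit - (kSlopBytes - 1);
  size_t offset = 10, len = 5;  // Back-reference to "01234".
  if (op < op_limit_min_slop && offset >= len) {
    op = SlopCopy(op, offset, len);
  }
  assert(std::memcmp(buf, "012345678901234", 15) == 0);
}

The offset < len case (a repeating pattern shorter than the copy) still falls back to IncrementalCopy in the patch, since a flat memmove cannot replicate a pattern into itself.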