diff --git a/HISTORY.md b/HISTORY.md index 7d485a2766..301c186186 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -30,6 +30,9 @@ * Generate file checksum in SstFileWriter if Options.file_checksum_gen_factory is set. The checksum and checksum function name are stored in ExternalSstFileInfo after the sst file write is finished. * Add a value_size_soft_limit in read options which limits the cumulative value size of keys read in batches in MultiGet. Once the cumulative value size of found keys exceeds read_options.value_size_soft_limit, all the remaining keys are returned with status Abort without further finding their values. By default the value_size_soft_limit is std::numeric_limits::max(). +### Performance Improvements +* Eliminate redundant key comparisons during random access in block-based tables. + ## 6.10 (5/2/2020) ### Bug Fixes * Fix wrong result being read from ingested file. May happen when a key in the file happen to be prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced. diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 064cd132c8..8bf7e11d49 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -128,17 +128,14 @@ struct DecodeKeyV4 { }; void DataBlockIter::Next() { - assert(Valid()); ParseNextDataKey(); } void DataBlockIter::NextOrReport() { - assert(Valid()); ParseNextDataKey(); } void IndexBlockIter::Next() { - assert(Valid()); ParseNextIndexKey(); } @@ -248,18 +245,14 @@ void DataBlockIter::Seek(const Slice& target) { return; } uint32_t index = 0; + bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + &skip_linear_scan, comparator_); if (!ok) { return; } - SeekToRestartPoint(index); - - // Linear search (within restart block) for first key >= target - while (ParseNextDataKey() && - comparator_->Compare(applied_key_.UpdateAndGetKey(), seek_key) < 0) { - } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); } // Optimized Seek for point lookup for an internal key `target` @@ -393,6 +386,7 @@ void IndexBlockIter::Seek(const Slice& target) { } status_ = Status::OK(); uint32_t index = 0; + bool skip_linear_scan = false; bool ok = false; if (prefix_index_) { bool prefix_may_exist = true; @@ -404,23 +398,21 @@ void IndexBlockIter::Seek(const Slice& target) { current_ = restarts_; status_ = Status::NotFound(); } + // restart interval must be one when hash search is enabled so the binary + // search simply lands at the right place. + skip_linear_scan = true; } else if (value_delta_encoded_) { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + &skip_linear_scan, comparator_); } else { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + &skip_linear_scan, comparator_); } if (!ok) { return; } - SeekToRestartPoint(index); - - // Linear search (within restart block) for first key >= target - while (ParseNextIndexKey() && - comparator_->Compare(applied_key_.UpdateAndGetKey(), seek_key) < 0) { - } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); } void DataBlockIter::SeekForPrev(const Slice& target) { @@ -430,18 +422,15 @@ void DataBlockIter::SeekForPrev(const Slice& target) { return; } uint32_t index = 0; + bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + &skip_linear_scan, comparator_); if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); - // Linear search (within restart block) for first key >= seek_key - while (ParseNextDataKey() && - comparator_->Compare(applied_key_.UpdateAndGetKey(), seek_key) < 0) { - } if (!Valid()) { SeekToLast(); } else { @@ -659,17 +648,70 @@ void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { } } -// Binary search in restart array to find the first restart point that -// is either the last restart point with a key less than target, -// which means the key of next restart point is larger than target, or -// the first restart point with a key = target +template +void BlockIter::FindKeyAfterBinarySeek(const Slice& target, + uint32_t index, + bool skip_linear_scan, + const Comparator* comp) { + // SeekToRestartPoint() only does the lookup in the restart block. We need + // to follow it up with Next() to position the iterator at the restart key. + SeekToRestartPoint(index); + Next(); + + if (!skip_linear_scan) { + // Linear search (within restart block) for first key >= target + uint32_t max_offset; + if (index + 1 < num_restarts_) { + // We are in a non-last restart interval. Since `BinarySeek()` guarantees + // the next restart key is strictly greater than `target`, we can + // terminate upon reaching it without any additional key comparison. + max_offset = GetRestartPoint(index + 1); + } else { + // We are in the last restart interval. The while-loop will terminate by + // `Valid()` returning false upon advancing past the block's last key. + max_offset = port::kMaxUint32; + } + while (true) { + Next(); + if (!Valid()) { + break; + } + if (current_ == max_offset) { + assert(comp->Compare(applied_key_.UpdateAndGetKey(), target) > 0); + break; + } else if (comp->Compare(applied_key_.UpdateAndGetKey(), target) >= 0) { + break; + } + } + } +} + +// Binary searches in restart array to find the starting restart point for the +// linear scan, and stores it in `*index`. Assumes restart array does not +// contain duplicate keys. It is guaranteed that the restart key at `*index + 1` +// is strictly greater than `target` or does not exist (this can be used to +// elide a comparison when linear scan reaches all the way to the next restart +// key). Furthermore, `*skip_linear_scan` is set to indicate whether the +// `*index`th restart key is the final result so that key does not need to be +// compared again later. template template bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, uint32_t* index, + bool* skip_linear_scan, const Comparator* comp) { assert(left <= right); + if (restarts_ == 0) { + // SST files dedicated to range tombstones are written with index blocks + // that have no keys while also having `num_restarts_ == 1`. This would + // cause a problem for `BinarySeek()` as it'd try to access the first key + // which does not exist. We identify such blocks by the offset at which + // their restarts are stored, and return false to prevent any attempted + // key accesses. + return false; + } + *skip_linear_scan = false; while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); @@ -692,11 +734,32 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, // after "mid" are uninteresting. right = mid - 1; } else { + *skip_linear_scan = true; left = right = mid; } } + assert(left == right); *index = left; + if (*index == 0) { + // Special case as we land at zero as long as restart key at index 1 is > + // "target". We need to compare the restart key at index 0 so we can set + // `*skip_linear_scan` when the 0th restart key is >= "target". + // + // GetRestartPoint() is always zero for restart key zero; skip the restart + // block access. + uint32_t shared, non_shared; + const char* key_ptr = + DecodeKeyFunc()(data_, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice first_key(key_ptr, non_shared); + raw_key_.SetKey(first_key, false /* copy */); + int cmp = comp->Compare(applied_key_.UpdateAndGetKey(), target); + *skip_linear_scan = cmp >= 0; + } return true; } diff --git a/table/block_based/block.h b/table/block_based/block.h index a24005f25a..a7378949a6 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -343,6 +343,8 @@ class BlockIter : public InternalIteratorBase { Cache::Handle* cache_handle() { return cache_handle_; } + virtual void Next() override = 0; + protected: // Note: The type could be changed to InternalKeyComparator but we see a weird // performance drop by that. @@ -403,9 +405,14 @@ class BlockIter : public InternalIteratorBase { void CorruptionError(); + protected: template inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + uint32_t* index, bool* is_index_key_result, + const Comparator* comp); + + void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, + bool is_index_key_result, const Comparator* comp); }; class DataBlockIter final : public BlockIter {