From 25d81e4577c30f1da7fe6631f4123a5897de4f98 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 May 2019 12:04:56 -0700 Subject: [PATCH] DBIter::Next() can skip user key checking if previous entry's seqnum is 0 (#5244) Summary: Right now, DBIter::Next() always checks whether an entry is for the same user key as the previous entry to see whether the key should be hidden to the user. However, if previous entry's sequence number is 0, the check is not needed because 0 is the oldest possible sequence number. We could extend it from seqnum 0 case to simply prev_seqno >= current_seqno. However, it is less robust with bug or unexpected situations, while the gain is relatively low. We can always extend it later when needed. In a readseq benchmark with full formed LSM-tree, number of key comparisons called is reduced from 2.981 to 2.165. readseq against a fully compacted DB, no key comparison is called. Performance in this benchmark didn't show obvious improvement, which is expected because key comparisons only takes small percentage of CPU. But it may show up to be more effective if users have an expensive customized comparator. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5244 Differential Revision: D15067257 Pulled By: siying fbshipit-source-id: b7e1ef3ec4fa928cba509683d2b3246e35d270d9 --- HISTORY.md | 5 ++++- db/db_iter.cc | 29 +++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 65d64d2360..fb1db417ec 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,9 +4,12 @@ * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. ### New Features -* Reduce binary search when iterator reseek into the same data block. * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. +### Performance Improvements +* Reduce binary search when iterator reseek into the same data block. +* DBIter::Next() can skip user key checking if previous entry's seqnum is 0. + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_iter.cc b/db/db_iter.cc index 43a56af78c..1d8ccf9adb 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -133,6 +133,7 @@ class DBIter final: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), + is_key_seqnum_zero_(false), prefix_same_as_start_(read_options.prefix_same_as_start), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), @@ -333,6 +334,10 @@ class DBIter final: public Iterator { Direction direction_; bool valid_; bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This information is used as that the next entry will be for another + // user key. + bool is_key_seqnum_zero_; const bool prefix_same_as_start_; // Means that we will pin all data blocks we read as long the Iterator // is not deleted, will be true if ReadOptions::pin_data is true @@ -381,6 +386,7 @@ void DBIter::Next() { num_internal_keys_skipped_ = 0; bool ok = true; if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; if (!ReverseToForward()) { ok = false; } @@ -400,6 +406,7 @@ void DBIter::Next() { FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_); } else { + is_key_seqnum_zero_ = false; valid_ = false; } if (statistics_ != nullptr && valid_) { @@ -450,10 +457,16 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_blob_ = false; do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. + bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; return false; } + is_key_seqnum_zero_ = (ikey_.sequence == 0); + if (iterate_upper_bound_ != nullptr && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; @@ -470,11 +483,18 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) } if (IsVisible(ikey_.sequence)) { - if (skipping && user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) <= 0) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because it will be more + // prone to bugs causing the same user key with the same sequence number. + if (!is_prev_key_seqnum_zero && skipping && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { + assert(!skipping || user_comparator_.Compare( + ikey_.user_key, saved_key_.GetUserKey()) > 0); num_skipped = 0; switch (ikey_.type) { case kTypeDeletion: @@ -595,6 +615,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. if (num_skipped > max_skip_ && CanReseekToSkip()) { + is_key_seqnum_zero_ = false; num_skipped = 0; std::string last_key; if (skipping) { @@ -1265,6 +1286,7 @@ void DBIter::Seek(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; SequenceNumber seq = sequence_; saved_key_.Clear(); @@ -1323,6 +1345,7 @@ void DBIter::SeekForPrev(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; saved_key_.Clear(); // now saved_key is used to store internal key. saved_key_.SetInternalKey(target, 0 /* sequence_number */, @@ -1390,6 +1413,7 @@ void DBIter::SeekToFirst() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); @@ -1442,6 +1466,7 @@ void DBIter::SeekToLast() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time);