From 662ce6204406f4377044e9fd34fb8dc502ca4df7 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 2 Jul 2019 11:45:32 -0700 Subject: [PATCH] Reduce iterator key comparison for upper/lower bound check (2nd attempt) (#5468) Summary: This is a second attempt for https://github.com/facebook/rocksdb/issues/5111, with the fix to redo iterate bounds check after `SeekXXX()`. This is because MyRocks may change iterate bounds between seek. See https://github.com/facebook/rocksdb/issues/5111 for original benchmark result and discussion. Closes https://github.com/facebook/rocksdb/issues/5463. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5468 Test Plan: Existing rocksdb tests, plus myrocks test `rocksdb.optimizer_loose_index_scans` and `rocksdb.group_min_max`. Differential Revision: D15863332 fbshipit-source-id: ab4aba5899838591806b8673899bd465f3f53e18 --- HISTORY.md | 1 + db/db_iter.cc | 9 ++- db/db_iterator_test.cc | 62 +++++++++++++++++++ db/version_set.cc | 48 +++++++++++--- table/block_based/block_based_table_reader.cc | 30 ++++++--- table/block_based/block_based_table_reader.h | 15 ++++- table/internal_iterator.h | 25 +++++++- table/iterator_wrapper.h | 22 +++++-- table/merging_iterator.cc | 24 +++++++ 9 files changed, 212 insertions(+), 24 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 2c8dc8c3ab..c3af6ba06d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -24,6 +24,7 @@ * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +* Reduce iterator key comparision for upper/lower bound check. * Log Writer will flush after finishing the whole record, rather than a fragment. * Lower MultiGet batching API latency by reading data blocks from disk in parallel diff --git a/db/db_iter.cc b/db/db_iter.cc index b89d730113..633724c576 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,7 +467,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - if (iterate_upper_bound_ != nullptr && + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -859,7 +861,10 @@ void DBIter::PrevInternal() { return; } - if (iterate_lower_bound_ != nullptr && + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index d514e7683d..67a97b20b8 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2759,6 +2759,68 @@ TEST_P(DBIteratorTest, AvoidReseekChildIterator) { SyncPoint::GetInstance()->DisableProcessing(); } +// MyRocks may change iterate bounds before seek. Simply test to make sure such +// usage doesn't break iterator. +TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::string value(50, 'v'); + Reopen(options); + ASSERT_OK(Put("aaa", value)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("eee", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string ub1 = "e"; + std::string ub2 = "c"; + Slice ub(ub1); + ReadOptions read_opts1; + read_opts1.iterate_upper_bound = &ub; + Iterator* iter = NewIterator(read_opts1); + // Seek and iterate accross block boundary. + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + ub = Slice(ub2); + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + std::string lb1 = "a"; + std::string lb2 = "c"; + Slice lb(lb1); + ReadOptions read_opts2; + read_opts2.iterate_lower_bound = &lb; + iter = NewIterator(read_opts2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + lb = Slice(lb2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 8e2d21b051..3354959aac 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -955,6 +970,7 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + CheckMayBeOutOfLowerBound(); return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, @@ -963,6 +979,19 @@ class LevelIterator final : public InternalIterator { largest_compaction_key); } + // Check if current file being fully within iterate_lower_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. + void CheckMayBeOutOfLowerBound() { + if (Valid() && read_options_.iterate_lower_bound != nullptr) { + may_be_out_of_lower_bound_ = + user_comparator_.Compare( + ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; + } + } + TableCache* table_cache_; const ReadOptions read_options_; const EnvOptions& env_options_; @@ -976,6 +1005,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; TableReaderCaller caller_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1011,6 +1041,7 @@ void LevelIterator::Seek(const Slice& target) { file_iter_.Seek(target); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekForPrev(const Slice& target) { @@ -1024,6 +1055,7 @@ void LevelIterator::SeekForPrev(const Slice& target) { file_iter_.SeekForPrev(target); SkipEmptyFileBackward(); } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToFirst() { @@ -1032,6 +1064,7 @@ void LevelIterator::SeekToFirst() { file_iter_.SeekToFirst(); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToLast() { @@ -1040,15 +1073,17 @@ void LevelIterator::SeekToLast() { file_iter_.SeekToLast(); } SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4366,10 +4401,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index edddecf78b..87756f2e24 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2896,6 +2896,7 @@ void BlockBasedTableIterator::SeekImpl( FindKeyForward(); } + CheckDataBlockWithinUpperBound(); CheckOutOfBound(); if (target) { @@ -2952,6 +2953,7 @@ void BlockBasedTableIterator::SeekForPrev( block_iter_.SeekForPrev(target); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); assert(!block_iter_.Valid() || icomp_.Compare(target, block_iter_.key()) >= 0); } @@ -2969,6 +2971,7 @@ void BlockBasedTableIterator::SeekToLast() { InitDataBlock(); block_iter_.SeekToLast(); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); } template @@ -2984,11 +2987,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -3087,6 +3091,7 @@ void BlockBasedTableIterator::InitDataBlock() { /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); } } @@ -3140,13 +3145,12 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, index_iter_->user_key()) <= 0); - } ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { @@ -3210,6 +3214,16 @@ void BlockBasedTableIterator::CheckOutOfBound() { } } +template +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 358bc8b8d2..750700813d 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -652,7 +652,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && @@ -702,6 +702,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -768,6 +773,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool block_iter_points_to_real_block_; // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; // True if we're standing at the first key of a block, and we haven't loaded // that block yet. A call to value() will trigger loading the block. bool is_at_first_key_from_index_ = false; @@ -802,6 +809,12 @@ class BlockBasedTableIterator : public InternalIteratorBase { void FindBlockForward(); void FindKeyBackward(); void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); }; } // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 696e66135d..426ff39654 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -97,6 +111,13 @@ class InternalIteratorBase : public Cleanable { // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. + virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1e..a5aa5c49ea 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,7 +68,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 207066b5a1..1a0d4df899 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from current child iterator. Potentially as long as one of child iterator + // report out of bound is not possible, we know current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) {