// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_prefetcher.h" #include "table/block_based/reader_common.h" namespace ROCKSDB_NAMESPACE { // Iterates over the contents of BlockBasedTable. class BlockBasedTableIterator : public InternalIteratorBase { // compaction_readahead_size: its value will only be used if for_compaction = // true // @param read_options Must outlive this iterator. public: BlockBasedTableIterator( const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, std::unique_ptr>&& index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, TableReaderCaller caller, size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) : index_iter_(std::move(index_iter)), table_(table), read_options_(read_options), icomp_(icomp), user_comparator_(icomp.user_comparator()), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), lookup_context_(caller), block_prefetcher_( compaction_readahead_size, table_->get_rep()->table_options.initial_auto_readahead_size), allow_unprepared_value_(allow_unprepared_value), block_iter_points_to_real_block_(false), check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), async_read_in_progress_(false), is_last_level_(table->IsLastLevel()) {} ~BlockBasedTableIterator() {} void Seek(const Slice& target) override; void SeekForPrev(const Slice& target) override; void SeekToFirst() override; void SeekToLast() override; void Next() final override; bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && (is_at_first_key_from_index_ || (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); if (is_at_first_key_from_index_) { return index_iter_->value().first_internal_key; } else { return block_iter_.key(); } } Slice user_key() const override { assert(Valid()); if (is_at_first_key_from_index_) { return ExtractUserKey(index_iter_->value().first_internal_key); } else { return block_iter_.user_key(); } } bool PrepareValue() override { assert(Valid()); if (!is_at_first_key_from_index_) { return true; } return const_cast(this) ->MaterializeCurrentBlock(); } Slice value() const override { // PrepareValue() must have been called. assert(!is_at_first_key_from_index_); assert(Valid()); if (seek_stat_state_ & kReportOnUseful) { bool filter_used = (seek_stat_state_ & kFilterUsed) != 0; RecordTick( table_->GetStatistics(), filter_used ? (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH : NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH) : (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER : NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER)); seek_stat_state_ = kDataBlockReadSinceLastSeek; } return block_iter_.value(); } Status status() const override { // Prefix index set status to NotFound when the prefix does not exist if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { return index_iter_->status(); } else if (block_iter_points_to_real_block_) { return block_iter_.status(); } else if (async_read_in_progress_) { return Status::TryAgain(); } else { return Status::OK(); } } inline IterBoundCheck UpperBoundCheckResult() override { if (is_out_of_bound_) { return IterBoundCheck::kOutOfBound; } else if (block_upper_bound_check_ == BlockUpperBound::kUpperBoundBeyondCurBlock) { assert(!is_out_of_bound_); return IterBoundCheck::kInbound; } else { return IterBoundCheck::kUnknown; } } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { // Our key comes either from block_iter_'s current key // or index_iter_'s current *value*. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { assert(!is_at_first_key_from_index_); assert(Valid()); // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; } void ResetDataIter() { if (block_iter_points_to_real_block_) { if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); } block_iter_.Invalidate(Status::OK()); block_iter_points_to_real_block_ = false; } block_upper_bound_check_ = BlockUpperBound::kUnknown; } void SavePrevIndexValue() { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. prev_block_offset_ = index_iter_->value().handle.offset(); } } void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { if (block_prefetcher_.prefetch_buffer() != nullptr && read_options_.adaptive_readahead) { block_prefetcher_.prefetch_buffer()->GetReadaheadState( &(readahead_file_info->data_block_readahead_info)); if (index_iter_) { index_iter_->GetReadaheadState(readahead_file_info); } } } void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { if (read_options_.adaptive_readahead) { block_prefetcher_.SetReadaheadState( &(readahead_file_info->data_block_readahead_info)); if (index_iter_) { index_iter_->SetReadaheadState(readahead_file_info); } } } std::unique_ptr> index_iter_; private: enum class IterDirection { kForward, kBackward, }; // This enum indicates whether the upper bound falls into current block // or beyond. // +-------------+ // | cur block | <-- (1) // +-------------+ // <-- (2) // --- --- // <-- (3) // +-------------+ // | next block | <-- (4) // ...... // // When the block is smaller than , kUpperBoundInCurBlock // is the value to use. The examples are (1) or (2) in the graph. It means // all keys in the next block or beyond will be out of bound. Keys within // the current block may or may not be out of bound. // When the block is larger or equal to , // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4) // in the graph. It means that all keys in the current block is within the // upper bound and keys in the next block may or may not be within the uppder // bound. // If the boundary key hasn't been checked against the upper bound, // kUnknown can be used. enum class BlockUpperBound : uint8_t { kUpperBoundInCurBlock, kUpperBoundBeyondCurBlock, kUnknown, }; // State bits for collecting stats on seeks and whether they returned useful // results. enum SeekStatState : uint8_t { kNone = 0, // Most recent seek checked prefix filter (or similar future feature) kFilterUsed = 1 << 0, // Already recorded that a data block was accessed since the last seek. kDataBlockReadSinceLastSeek = 1 << 1, // Have not yet recorded that a value() was accessed. kReportOnUseful = 1 << 2, }; const BlockBasedTable* table_; const ReadOptions& read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; PinnedIteratorsManager* pinned_iters_mgr_; DataBlockIter block_iter_; const SliceTransform* prefix_extractor_; uint64_t prev_block_offset_ = std::numeric_limits::max(); BlockCacheLookupContext lookup_context_; BlockPrefetcher block_prefetcher_; const bool allow_unprepared_value_; // True if block_iter_ is initialized and points to the same block // as index iterator. bool block_iter_points_to_real_block_; // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; // How current data block's boundary key with the next block is compared with // iterate upper bound. BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; // True if we're standing at the first key of a block, and we haven't loaded // that block yet. A call to PrepareValue() will trigger loading the block. bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; bool async_read_in_progress_; mutable SeekStatState seek_stat_state_ = SeekStatState::kNone; bool is_last_level_; // If `target` is null, seek to first. void SeekImpl(const Slice* target, bool async_prefetch); void InitDataBlock(); void AsyncInitDataBlock(bool is_first_pass); bool MaterializeCurrentBlock(); void FindKeyForward(); void FindBlockForward(); void FindKeyBackward(); void CheckOutOfBound(); // Check if data block is fully within iterate_upper_bound. // // Note MyRocks may update iterate bounds between seek. To workaround it, // we need to check and update data_block_within_upper_bound_ accordingly. void CheckDataBlockWithinUpperBound(); bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction, bool* filter_checked) { if (need_upper_bound_check_ && direction == IterDirection::kBackward) { // Upper bound check isn't sufficient for backward direction to // guarantee the same result as total order, so disable prefix // check. return true; } if (check_filter_ && !table_->PrefixRangeMayMatch(ikey, read_options_, prefix_extractor_, need_upper_bound_check_, &lookup_context_, filter_checked)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to // the first key of the next file. ResetDataIter(); return false; } return true; } void FindReadAheadSizeUpperBound(); }; } // namespace ROCKSDB_NAMESPACE