rocksdb/table/block_based/block_based_table_iterator.cc

559 lines
19 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_iterator.h"
namespace ROCKSDB_NAMESPACE {
void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); }
void BlockBasedTableIterator::Seek(const Slice& target) {
SeekImpl(&target, true);
}
void BlockBasedTableIterator::SeekImpl(const Slice* target,
bool async_prefetch) {
bool is_first_pass = true;
if (async_read_in_progress_) {
AsyncInitDataBlock(false);
is_first_pass = false;
}
is_out_of_bound_ = false;
is_at_first_key_from_index_ = false;
seek_stat_state_ = kNone;
bool filter_checked = false;
if (target &&
!CheckPrefixMayMatch(*target, IterDirection::kForward, &filter_checked)) {
ResetDataIter();
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_FILTERED
: NON_LAST_LEVEL_SEEK_FILTERED);
return;
}
if (filter_checked) {
seek_stat_state_ = kFilterUsed;
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_FILTER_MATCH
: NON_LAST_LEVEL_SEEK_FILTER_MATCH);
}
bool need_seek_index = true;
if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
// Reseek.
prev_block_offset_ = index_iter_->value().handle.offset();
if (target) {
// We can avoid an index seek if:
// 1. The new seek key is larger than the current key
// 2. The new seek key is within the upper bound of the block
// Since we don't necessarily know the internal key for either
// the current key or the upper bound, we check user keys and
// exclude the equality case. Considering internal keys can
// improve for the boundary cases, but it would complicate the
// code.
if (user_comparator_.Compare(ExtractUserKey(*target),
block_iter_.user_key()) > 0 &&
user_comparator_.Compare(ExtractUserKey(*target),
index_iter_->user_key()) < 0) {
need_seek_index = false;
}
}
}
if (need_seek_index) {
if (target) {
index_iter_->Seek(*target);
} else {
index_iter_->SeekToFirst();
}
if (!index_iter_->Valid()) {
ResetDataIter();
return;
}
}
if (read_options_.auto_readahead_size && read_options_.iterate_upper_bound &&
is_first_pass) {
FindReadAheadSizeUpperBound();
if (target) {
index_iter_->Seek(*target);
} else {
index_iter_->SeekToFirst();
}
// Check for IO error.
if (!index_iter_->Valid()) {
ResetDataIter();
return;
}
}
IndexValue v = index_iter_->value();
const bool same_block = block_iter_points_to_real_block_ &&
v.handle.offset() == prev_block_offset_;
if (!v.first_internal_key.empty() && !same_block &&
(!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
allow_unprepared_value_) {
// Index contains the first key of the block, and it's >= target.
// We can defer reading the block.
is_at_first_key_from_index_ = true;
// ResetDataIter() will invalidate block_iter_. Thus, there is no need to
// call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
// as that will be done later when the data block is actually read.
ResetDataIter();
} else {
// Need to use the data block.
if (!same_block) {
if (read_options_.async_io && async_prefetch) {
if (is_first_pass) {
AsyncInitDataBlock(is_first_pass);
}
if (async_read_in_progress_) {
// Status::TryAgain indicates asynchronous request for retrieval of
// data blocks has been submitted. So it should return at this point
// and Seek should be called again to retrieve the requested block and
// execute the remaining code.
return;
}
} else {
InitDataBlock();
}
} else {
// When the user does a reseek, the iterate_upper_bound might have
// changed. CheckDataBlockWithinUpperBound() needs to be called
// explicitly if the reseek ends up in the same data block.
// If the reseek ends up in a different block, InitDataBlock() will do
// the iterator upper bound check.
CheckDataBlockWithinUpperBound();
}
if (target) {
block_iter_.Seek(*target);
} else {
block_iter_.SeekToFirst();
}
FindKeyForward();
}
CheckOutOfBound();
if (target) {
assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
}
}
void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
is_out_of_bound_ = false;
is_at_first_key_from_index_ = false;
seek_stat_state_ = kNone;
bool filter_checked = false;
// For now totally disable prefix seek in auto prefix mode because we don't
// have logic
if (!CheckPrefixMayMatch(target, IterDirection::kBackward, &filter_checked)) {
ResetDataIter();
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_FILTERED
: NON_LAST_LEVEL_SEEK_FILTERED);
return;
}
if (filter_checked) {
seek_stat_state_ = kFilterUsed;
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_FILTER_MATCH
: NON_LAST_LEVEL_SEEK_FILTER_MATCH);
}
SavePrevIndexValue();
// Call Seek() rather than SeekForPrev() in the index block, because the
// target data block will likely to contain the position for `target`, the
// same as Seek(), rather than than before.
// For example, if we have three data blocks, each containing two keys:
// [2, 4] [6, 8] [10, 12]
// (the keys in the index block would be [4, 8, 12])
// and the user calls SeekForPrev(7), we need to go to the second block,
// just like if they call Seek(7).
// The only case where the block is difference is when they seek to a position
// in the boundary. For example, if they SeekForPrev(5), we should go to the
// first block, rather than the second. However, we don't have the information
// to distinguish the two unless we read the second block. In this case, we'll
// end up with reading two blocks.
index_iter_->Seek(target);
if (!index_iter_->Valid()) {
auto seek_status = index_iter_->status();
// Check for IO error
if (!seek_status.IsNotFound() && !seek_status.ok()) {
ResetDataIter();
return;
}
// With prefix index, Seek() returns NotFound if the prefix doesn't exist
if (seek_status.IsNotFound()) {
// Any key less than the target is fine for prefix seek
ResetDataIter();
return;
} else {
index_iter_->SeekToLast();
}
// Check for IO error
if (!index_iter_->Valid()) {
ResetDataIter();
return;
}
}
InitDataBlock();
block_iter_.SeekForPrev(target);
FindKeyBackward();
CheckDataBlockWithinUpperBound();
assert(!block_iter_.Valid() ||
icomp_.Compare(target, block_iter_.key()) >= 0);
}
void BlockBasedTableIterator::SeekToLast() {
is_out_of_bound_ = false;
is_at_first_key_from_index_ = false;
seek_stat_state_ = kNone;
SavePrevIndexValue();
index_iter_->SeekToLast();
if (!index_iter_->Valid()) {
ResetDataIter();
return;
}
InitDataBlock();
block_iter_.SeekToLast();
FindKeyBackward();
CheckDataBlockWithinUpperBound();
}
void BlockBasedTableIterator::Next() {
if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
return;
}
assert(block_iter_points_to_real_block_);
block_iter_.Next();
FindKeyForward();
CheckOutOfBound();
}
bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
Next();
bool is_valid = Valid();
if (is_valid) {
result->key = key();
result->bound_check_result = UpperBoundCheckResult();
result->value_prepared = !is_at_first_key_from_index_;
}
return is_valid;
}
void BlockBasedTableIterator::Prev() {
if (is_at_first_key_from_index_) {
is_at_first_key_from_index_ = false;
index_iter_->Prev();
if (!index_iter_->Valid()) {
return;
}
InitDataBlock();
block_iter_.SeekToLast();
} else {
assert(block_iter_points_to_real_block_);
block_iter_.Prev();
}
FindKeyBackward();
}
void BlockBasedTableIterator::InitDataBlock() {
BlockHandle data_block_handle = index_iter_->value().handle;
if (!block_iter_points_to_real_block_ ||
data_block_handle.offset() != prev_block_offset_ ||
// if previous attempt of reading the block missed cache, try again
block_iter_.status().IsIncomplete()) {
if (block_iter_points_to_real_block_) {
ResetDataIter();
}
auto* rep = table_->get_rep();
bool is_for_compaction =
lookup_context_.caller == TableReaderCaller::kCompaction;
// Prefetch additional data for range scans (iterators).
// Implicit auto readahead:
// Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
// Explicit user requested readahead:
// Enabled from the very first IO when ReadOptions.readahead_size is set.
block_prefetcher_.PrefetchIfNeeded(
rep, data_block_handle, read_options_.readahead_size, is_for_compaction,
/*no_sequential_checking=*/false, read_options_);
Status s;
table_->NewDataBlockIterator<DataBlockIter>(
read_options_, data_block_handle, &block_iter_, BlockType::kData,
/*get_context=*/nullptr, &lookup_context_,
block_prefetcher_.prefetch_buffer(),
/*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
block_iter_points_to_real_block_ = true;
CheckDataBlockWithinUpperBound();
if (!is_for_compaction &&
(seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) {
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_DATA
: NON_LAST_LEVEL_SEEK_DATA);
seek_stat_state_ = static_cast<SeekStatState>(
seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful);
}
}
}
void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) {
BlockHandle data_block_handle = index_iter_->value().handle;
bool is_for_compaction =
lookup_context_.caller == TableReaderCaller::kCompaction;
if (is_first_pass) {
if (!block_iter_points_to_real_block_ ||
data_block_handle.offset() != prev_block_offset_ ||
// if previous attempt of reading the block missed cache, try again
block_iter_.status().IsIncomplete()) {
if (block_iter_points_to_real_block_) {
ResetDataIter();
}
auto* rep = table_->get_rep();
// Prefetch additional data for range scans (iterators).
// Implicit auto readahead:
// Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
// Explicit user requested readahead:
// Enabled from the very first IO when ReadOptions.readahead_size is
// set.
// In case of async_io with Implicit readahead, block_prefetcher_ will
// always the create the prefetch buffer by setting no_sequential_checking
// = true.
block_prefetcher_.PrefetchIfNeeded(
rep, data_block_handle, read_options_.readahead_size,
is_for_compaction, /*no_sequential_checking=*/read_options_.async_io,
read_options_);
Status s;
table_->NewDataBlockIterator<DataBlockIter>(
read_options_, data_block_handle, &block_iter_, BlockType::kData,
/*get_context=*/nullptr, &lookup_context_,
block_prefetcher_.prefetch_buffer(),
/*for_compaction=*/is_for_compaction, /*async_read=*/true, s);
if (s.IsTryAgain()) {
async_read_in_progress_ = true;
return;
}
}
} else {
// Second pass will call the Poll to get the data block which has been
// requested asynchronously.
Status s;
table_->NewDataBlockIterator<DataBlockIter>(
read_options_, data_block_handle, &block_iter_, BlockType::kData,
/*get_context=*/nullptr, &lookup_context_,
block_prefetcher_.prefetch_buffer(),
/*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
}
block_iter_points_to_real_block_ = true;
CheckDataBlockWithinUpperBound();
if (!is_for_compaction &&
(seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) {
RecordTick(table_->GetStatistics(), is_last_level_
? LAST_LEVEL_SEEK_DATA
: NON_LAST_LEVEL_SEEK_DATA);
seek_stat_state_ = static_cast<SeekStatState>(
seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful);
}
async_read_in_progress_ = false;
}
bool BlockBasedTableIterator::MaterializeCurrentBlock() {
assert(is_at_first_key_from_index_);
assert(!block_iter_points_to_real_block_);
assert(index_iter_->Valid());
is_at_first_key_from_index_ = false;
InitDataBlock();
assert(block_iter_points_to_real_block_);
if (!block_iter_.status().ok()) {
return false;
}
block_iter_.SeekToFirst();
if (!block_iter_.Valid() ||
icomp_.Compare(block_iter_.key(),
index_iter_->value().first_internal_key) != 0) {
block_iter_.Invalidate(Status::Corruption(
"first key in index doesn't match first key in block"));
return false;
}
return true;
}
void BlockBasedTableIterator::FindKeyForward() {
// This method's code is kept short to make it likely to be inlined.
assert(!is_out_of_bound_);
assert(block_iter_points_to_real_block_);
if (!block_iter_.Valid()) {
// This is the only call site of FindBlockForward(), but it's extracted into
// a separate method to keep FindKeyForward() short and likely to be
// inlined. When transitioning to a different block, we call
// FindBlockForward(), which is much longer and is probably not inlined.
FindBlockForward();
} else {
// This is the fast path that avoids a function call.
}
}
void BlockBasedTableIterator::FindBlockForward() {
// TODO the while loop inherits from two-level-iterator. We don't know
// whether a block can be empty so it can be replaced by an "if".
do {
if (!block_iter_.status().ok()) {
return;
}
// Whether next data block is out of upper bound, if there is one.
const bool next_block_is_out_of_bound =
read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_ &&
block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
assert(!next_block_is_out_of_bound ||
user_comparator_.CompareWithoutTimestamp(
*read_options_.iterate_upper_bound, /*a_has_ts=*/false,
index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
ResetDataIter();
index_iter_->Next();
if (next_block_is_out_of_bound) {
// The next block is out of bound. No need to read it.
TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
// We need to make sure this is not the last data block before setting
// is_out_of_bound_, since the index key for the last data block can be
// larger than smallest key of the next file on the same level.
if (index_iter_->Valid()) {
is_out_of_bound_ = true;
}
return;
}
if (!index_iter_->Valid()) {
return;
}
IndexValue v = index_iter_->value();
if (!v.first_internal_key.empty() && allow_unprepared_value_) {
// Index contains the first key of the block. Defer reading the block.
is_at_first_key_from_index_ = true;
return;
}
InitDataBlock();
block_iter_.SeekToFirst();
} while (!block_iter_.Valid());
}
void BlockBasedTableIterator::FindKeyBackward() {
while (!block_iter_.Valid()) {
if (!block_iter_.status().ok()) {
return;
}
ResetDataIter();
index_iter_->Prev();
if (index_iter_->Valid()) {
InitDataBlock();
block_iter_.SeekToLast();
} else {
return;
}
}
// We could have check lower bound here too, but we opt not to do it for
// code simplicity.
}
void BlockBasedTableIterator::CheckOutOfBound() {
if (read_options_.iterate_upper_bound != nullptr &&
block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock &&
Valid()) {
is_out_of_bound_ =
user_comparator_.CompareWithoutTimestamp(
*read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
/*b_has_ts=*/true) <= 0;
}
}
void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
if (read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_) {
block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp(
*read_options_.iterate_upper_bound,
/*a_has_ts=*/false, index_iter_->user_key(),
/*b_has_ts=*/true) > 0)
? BlockUpperBound::kUpperBoundBeyondCurBlock
: BlockUpperBound::kUpperBoundInCurBlock;
}
}
void BlockBasedTableIterator::FindReadAheadSizeUpperBound() {
size_t total_bytes_till_upper_bound = 0;
size_t footer = table_->get_rep()->footer.GetBlockTrailerSize();
uint64_t start_offset = index_iter_->value().handle.offset();
do {
BlockHandle block_handle = index_iter_->value().handle;
total_bytes_till_upper_bound += block_handle.size();
total_bytes_till_upper_bound += footer;
// Can't figure out for current block if current block
// is out of bound. But for next block we can find that.
// If curr block's index key >= iterate_upper_bound, it
// means all the keys in next block or above are out of
// bound.
bool next_block_out_of_bound =
(user_comparator_.CompareWithoutTimestamp(
index_iter_->user_key(),
/*a_has_ts=*/true, *read_options_.iterate_upper_bound,
/*b_has_ts=*/false) >= 0
? true
: false);
if (next_block_out_of_bound) {
break;
}
// Since next block is not out of bound, iterate to that
// index block and add it's Data block size to
// readahead_size.
index_iter_->Next();
if (!index_iter_->Valid()) {
break;
}
} while (true);
block_prefetcher_.SetUpperBoundOffset(start_offset +
total_bytes_till_upper_bound);
}
} // namespace ROCKSDB_NAMESPACE