mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-03 05:54:17 +00:00
a30a696034
Summary: Now if we have iterate_upper_bound set, we continue read until get a key >= upper_bound. For a lot of cases that neighboring data blocks have a user key gap between them, our index key will be a user key in the middle to get a shorter size. For example, if we have blocks: [a b c d][f g h] Then the index key for the first block will be 'e'. then if upper bound is any key between 'd' and 'e', for example, d1, d2, ..., d99999999999, we don't have to read the second block and also know that we have done our iteration by reaching the last key that smaller the upper bound already. This diff can reduce RA in most cases. Closes https://github.com/facebook/rocksdb/pull/2239 Differential Revision: D4990693 Pulled By: lightmark fbshipit-source-id: ab30ea2e3c6edf3fddd5efed3c34fcf7739827ff
270 lines
8.4 KiB
C++
270 lines
8.4 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
// This source code is also licensed under the GPLv2 license found in the
|
|
// COPYING file in the root directory of this source tree.
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "table/two_level_iterator.h"
|
|
#include "db/pinned_iterators_manager.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/table.h"
|
|
#include "table/block.h"
|
|
#include "table/format.h"
|
|
#include "util/arena.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
namespace {
|
|
|
|
class TwoLevelIterator : public InternalIterator {
|
|
public:
|
|
explicit TwoLevelIterator(TwoLevelIteratorState* state,
|
|
InternalIterator* first_level_iter,
|
|
bool need_free_iter_and_state);
|
|
|
|
virtual ~TwoLevelIterator() {
|
|
// Assert that the TwoLevelIterator is never deleted while Pinning is
|
|
// Enabled.
|
|
assert(!pinned_iters_mgr_ ||
|
|
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
|
|
first_level_iter_.DeleteIter(!need_free_iter_and_state_);
|
|
second_level_iter_.DeleteIter(false);
|
|
if (need_free_iter_and_state_) {
|
|
delete state_;
|
|
} else {
|
|
state_->~TwoLevelIteratorState();
|
|
}
|
|
}
|
|
|
|
virtual void Seek(const Slice& target) override;
|
|
virtual void SeekForPrev(const Slice& target) override;
|
|
virtual void SeekToFirst() override;
|
|
virtual void SeekToLast() override;
|
|
virtual void Next() override;
|
|
virtual void Prev() override;
|
|
|
|
virtual bool Valid() const override { return second_level_iter_.Valid(); }
|
|
virtual Slice key() const override {
|
|
assert(Valid());
|
|
return second_level_iter_.key();
|
|
}
|
|
virtual Slice value() const override {
|
|
assert(Valid());
|
|
return second_level_iter_.value();
|
|
}
|
|
virtual Status status() const override {
|
|
// It'd be nice if status() returned a const Status& instead of a Status
|
|
if (!first_level_iter_.status().ok()) {
|
|
return first_level_iter_.status();
|
|
} else if (second_level_iter_.iter() != nullptr &&
|
|
!second_level_iter_.status().ok()) {
|
|
return second_level_iter_.status();
|
|
} else {
|
|
return status_;
|
|
}
|
|
}
|
|
virtual void SetPinnedItersMgr(
|
|
PinnedIteratorsManager* pinned_iters_mgr) override {
|
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
|
first_level_iter_.SetPinnedItersMgr(pinned_iters_mgr);
|
|
if (second_level_iter_.iter()) {
|
|
second_level_iter_.SetPinnedItersMgr(pinned_iters_mgr);
|
|
}
|
|
}
|
|
virtual bool IsKeyPinned() const override {
|
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
|
second_level_iter_.iter() && second_level_iter_.IsKeyPinned();
|
|
}
|
|
virtual bool IsValuePinned() const override {
|
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
|
second_level_iter_.iter() && second_level_iter_.IsValuePinned();
|
|
}
|
|
|
|
private:
|
|
void SaveError(const Status& s) {
|
|
if (status_.ok() && !s.ok()) status_ = s;
|
|
}
|
|
void SkipEmptyDataBlocksForward();
|
|
void SkipEmptyDataBlocksBackward();
|
|
void SetSecondLevelIterator(InternalIterator* iter);
|
|
void InitDataBlock();
|
|
|
|
TwoLevelIteratorState* state_;
|
|
IteratorWrapper first_level_iter_;
|
|
IteratorWrapper second_level_iter_; // May be nullptr
|
|
bool need_free_iter_and_state_;
|
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
|
Status status_;
|
|
// If second_level_iter is non-nullptr, then "data_block_handle_" holds the
|
|
// "index_value" passed to block_function_ to create the second_level_iter.
|
|
std::string data_block_handle_;
|
|
};
|
|
|
|
TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state,
|
|
InternalIterator* first_level_iter,
|
|
bool need_free_iter_and_state)
|
|
: state_(state),
|
|
first_level_iter_(first_level_iter),
|
|
need_free_iter_and_state_(need_free_iter_and_state),
|
|
pinned_iters_mgr_(nullptr) {}
|
|
|
|
void TwoLevelIterator::Seek(const Slice& target) {
|
|
if (state_->check_prefix_may_match &&
|
|
!state_->PrefixMayMatch(target)) {
|
|
SetSecondLevelIterator(nullptr);
|
|
return;
|
|
}
|
|
first_level_iter_.Seek(target);
|
|
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.Seek(target);
|
|
}
|
|
SkipEmptyDataBlocksForward();
|
|
}
|
|
|
|
void TwoLevelIterator::SeekForPrev(const Slice& target) {
|
|
if (state_->check_prefix_may_match && !state_->PrefixMayMatch(target)) {
|
|
SetSecondLevelIterator(nullptr);
|
|
return;
|
|
}
|
|
first_level_iter_.Seek(target);
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekForPrev(target);
|
|
}
|
|
if (!Valid()) {
|
|
if (!first_level_iter_.Valid()) {
|
|
first_level_iter_.SeekToLast();
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekForPrev(target);
|
|
}
|
|
}
|
|
SkipEmptyDataBlocksBackward();
|
|
}
|
|
}
|
|
|
|
void TwoLevelIterator::SeekToFirst() {
|
|
first_level_iter_.SeekToFirst();
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekToFirst();
|
|
}
|
|
SkipEmptyDataBlocksForward();
|
|
}
|
|
|
|
void TwoLevelIterator::SeekToLast() {
|
|
first_level_iter_.SeekToLast();
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekToLast();
|
|
}
|
|
SkipEmptyDataBlocksBackward();
|
|
}
|
|
|
|
void TwoLevelIterator::Next() {
|
|
assert(Valid());
|
|
second_level_iter_.Next();
|
|
SkipEmptyDataBlocksForward();
|
|
}
|
|
|
|
void TwoLevelIterator::Prev() {
|
|
assert(Valid());
|
|
second_level_iter_.Prev();
|
|
SkipEmptyDataBlocksBackward();
|
|
}
|
|
|
|
void TwoLevelIterator::SkipEmptyDataBlocksForward() {
|
|
while (second_level_iter_.iter() == nullptr ||
|
|
(!second_level_iter_.Valid() &&
|
|
!second_level_iter_.status().IsIncomplete())) {
|
|
// Move to next block
|
|
if (!first_level_iter_.Valid() ||
|
|
state_->KeyReachedUpperBound(first_level_iter_.key())) {
|
|
SetSecondLevelIterator(nullptr);
|
|
return;
|
|
}
|
|
first_level_iter_.Next();
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekToFirst();
|
|
}
|
|
}
|
|
}
|
|
|
|
void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
|
|
while (second_level_iter_.iter() == nullptr ||
|
|
(!second_level_iter_.Valid() &&
|
|
!second_level_iter_.status().IsIncomplete())) {
|
|
// Move to next block
|
|
if (!first_level_iter_.Valid()) {
|
|
SetSecondLevelIterator(nullptr);
|
|
return;
|
|
}
|
|
first_level_iter_.Prev();
|
|
InitDataBlock();
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
second_level_iter_.SeekToLast();
|
|
}
|
|
}
|
|
}
|
|
|
|
void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) {
|
|
if (second_level_iter_.iter() != nullptr) {
|
|
SaveError(second_level_iter_.status());
|
|
}
|
|
|
|
if (pinned_iters_mgr_ && iter) {
|
|
iter->SetPinnedItersMgr(pinned_iters_mgr_);
|
|
}
|
|
|
|
InternalIterator* old_iter = second_level_iter_.Set(iter);
|
|
if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
|
|
pinned_iters_mgr_->PinIterator(old_iter);
|
|
} else {
|
|
delete old_iter;
|
|
}
|
|
}
|
|
|
|
void TwoLevelIterator::InitDataBlock() {
|
|
if (!first_level_iter_.Valid()) {
|
|
SetSecondLevelIterator(nullptr);
|
|
} else {
|
|
Slice handle = first_level_iter_.value();
|
|
if (second_level_iter_.iter() != nullptr &&
|
|
!second_level_iter_.status().IsIncomplete() &&
|
|
handle.compare(data_block_handle_) == 0) {
|
|
// second_level_iter is already constructed with this iterator, so
|
|
// no need to change anything
|
|
} else {
|
|
InternalIterator* iter = state_->NewSecondaryIterator(handle);
|
|
data_block_handle_.assign(handle.data(), handle.size());
|
|
SetSecondLevelIterator(iter);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
|
|
InternalIterator* first_level_iter,
|
|
Arena* arena,
|
|
bool need_free_iter_and_state) {
|
|
if (arena == nullptr) {
|
|
return new TwoLevelIterator(state, first_level_iter,
|
|
need_free_iter_and_state);
|
|
} else {
|
|
auto mem = arena->AllocateAligned(sizeof(TwoLevelIterator));
|
|
return new (mem)
|
|
TwoLevelIterator(state, first_level_iter, need_free_iter_and_state);
|
|
}
|
|
}
|
|
|
|
} // namespace rocksdb
|