mirror of https://github.com/facebook/rocksdb.git
Prevent iterating over range tombstones beyond `iterate_upper_bound` (#10966)
Summary: Currently, `iterate_upper_bound` is not checked for range tombstone keys in MergingIterator. This may impact performance when there is a large number of range tombstones right after `iterate_upper_bound`. This PR fixes the issue by checking `iterate_upper_bound` in MergingIterator for range tombstone keys.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10966

Test Plan:
- added unit test
- stress test: `python3 tools/db_crashtest.py whitebox --simple --verify_iterator_with_expected_state_one_in=5 --delrangepercent=5 --prefixpercent=18 --writepercent=48 --readpercent=15 --duration=36000 --range_deletion_width=100`
- ran different stress tests over sandcastle
- Falcon team ran some test traffic and saw reduced CPU usage on processing range tombstones.

Reviewed By: ajkr

Differential Revision: D41414172

Pulled By: cbi42

fbshipit-source-id: 9b2c29eb3abb99327c6a649bdc412e70d863f981
parent 54c2542df2 · commit 534fb06dd3
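The core of the change: when MergingIterator is about to insert a range tombstone's start key into its min-heap, it first compares that key against `iterate_upper_bound` and skips the tombstone entirely if the start key is at or past the bound (the end key may cross the bound, since such a tombstone can still cover keys below it). Below is a minimal, self-contained sketch of that idea only; the names are illustrative and plain string keys stand in for RocksDB's internal keys, comparators, and HeapItem machinery.

#include <functional>
#include <iostream>
#include <optional>
#include <queue>
#include <string>
#include <vector>

// Simplified stand-in for a range deletion [start_key, end_key).
struct RangeTombstone {
  std::string start_key;  // inclusive
  std::string end_key;    // exclusive
};

using MinHeap = std::priority_queue<std::string, std::vector<std::string>,
                                    std::greater<>>;

// Only push the tombstone's start key if it begins before the (exclusive)
// upper bound; a tombstone starting at or after the bound can never cover a
// key the iterator is allowed to return.
void MaybeAddTombstoneStart(const RangeTombstone& t,
                            const std::optional<std::string>& upper_bound,
                            MinHeap& min_heap) {
  if (upper_bound && t.start_key >= *upper_bound) {
    return;  // skipped: never added to, and never popped from, the heap
  }
  min_heap.push(t.start_key);
}

int main() {
  MinHeap min_heap;
  std::optional<std::string> upper_bound = "az";
  MaybeAddTombstoneStart({"b", "bz"}, upper_bound, min_heap);   // skipped
  MaybeAddTombstoneStart({"a1", "a5"}, upper_bound, min_heap);  // kept
  std::cout << "tombstones in heap: " << min_heap.size() << "\n";  // prints 1
  return 0;
}

The actual diff below applies the same check in InsertRangeTombstoneToMinHeap using the user comparator's CompareWithoutTimestamp.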
@@ -3,6 +3,9 @@
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
 
+### Bug Fixes
+* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
+
 ## 7.9.0 (11/21/2022)
 ### Performance Improvements
 * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
@@ -1823,7 +1823,8 @@ InternalIterator* DBImpl::NewInternalIterator(
   MergeIteratorBuilder merge_iter_builder(
       &cfd->internal_comparator(), arena,
       !read_options.total_order_seek &&
-          super_version->mutable_cf_options.prefix_extractor != nullptr);
+          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      read_options.iterate_upper_bound);
   // Collect iterator for mutable memtable
   auto mem_iter = super_version->mem->NewIterator(read_options, arena);
   Status s;
@@ -2756,6 +2756,46 @@ TEST_F(DBRangeDelTest, RefreshMemtableIter) {
   ASSERT_OK(iter->Refresh());
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+  // Memtable: a, [b, bz)
+  // Do a Seek on `a` with iterate_upper_bound being az
+  // range tombstone [b, bz) should not be processed (added to and
+  // popped from the min_heap in MergingIterator).
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "bar"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+  // I could not find a cleaner way to test this without relying on
+  // implementation detail. Tried to test the value of
+  // `internal_range_del_reseek_count` but that did not work
+  // since BlockBasedTable iterator becomes !Valid() when point key
+  // is out of bound and that reseek only happens when a point key
+  // is covered by some range tombstone.
+  SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+                                        [](void*) {
+                                          // there should not be any range
+                                          // tombstone in the heap.
+                                          FAIL();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions read_opts;
+  std::string upper_bound = "az";
+  Slice upper_bound_slice = upper_bound;
+  read_opts.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "a");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
 #endif  // ROCKSDB_LITE
 
 } // namespace ROCKSDB_NAMESPACE
@@ -1487,7 +1487,7 @@ struct ReadOptions {
   const Slice* iterate_lower_bound;
 
   // "iterate_upper_bound" defines the extent up to which the forward iterator
-  // can returns entries. Once the bound is reached, Valid() will be false.
+  // can return entries. Once the bound is reached, Valid() will be false.
   // "iterate_upper_bound" is exclusive ie the bound value is
   // not a valid entry. If prefix_extractor is not null:
   // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
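For context, this is the bound that MergingIterator now consults for range tombstones; a minimal usage sketch mirroring the unit test added above (the `ScanUpTo` wrapper and the `db` handle are illustrative, not part of the diff). Note that the Slice pointed to by `iterate_upper_bound` must stay alive for as long as the iterator is used.

#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

void ScanUpTo(rocksdb::DB* db) {
  rocksdb::ReadOptions read_opts;
  std::string upper_bound = "az";  // exclusive: "az" itself is not returned
  rocksdb::Slice upper_bound_slice = upper_bound;
  read_opts.iterate_upper_bound = &upper_bound_slice;

  std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(read_opts));
  for (iter->Seek("a"); iter->Valid(); iter->Next()) {
    // Only keys strictly below "az" reach this loop; with this fix, range
    // tombstones whose start key is at or beyond "az" are no longer pushed
    // into (and popped from) the merging iterator's heap during the scan.
  }
  // iter->status() should still be checked after the loop in real code.
}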
@@ -117,14 +117,16 @@ class MergingIterator : public InternalIterator {
  public:
   MergingIterator(const InternalKeyComparator* comparator,
                   InternalIterator** children, int n, bool is_arena_mode,
-                  bool prefix_seek_mode)
+                  bool prefix_seek_mode,
+                  const Slice* iterate_upper_bound = nullptr)
       : is_arena_mode_(is_arena_mode),
         prefix_seek_mode_(prefix_seek_mode),
         direction_(kForward),
         comparator_(comparator),
         current_(nullptr),
         minHeap_(comparator_),
-        pinned_iters_mgr_(nullptr) {
+        pinned_iters_mgr_(nullptr),
+        iterate_upper_bound_(iterate_upper_bound) {
     children_.resize(n);
     for (int i = 0; i < n; i++) {
       children_[i].level = i;
@@ -202,11 +204,26 @@ class MergingIterator : public InternalIterator {
     assert(!range_tombstone_iters_.empty() &&
            range_tombstone_iters_[level]->Valid());
     if (start_key) {
-      pinned_heap_item_[level].SetTombstoneKey(
-          range_tombstone_iters_[level]->start_key());
+      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
+      // iterate_upper_bound does not have timestamp
+      if (iterate_upper_bound_ &&
+          comparator_->user_comparator()->CompareWithoutTimestamp(
+              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
+              false /* b_has_ts */) >= 0) {
+        if (replace_top) {
+          // replace_top implies this range tombstone iterator is still in
+          // minHeap_ and at the top.
+          minHeap_.pop();
+        }
+        return;
+      }
+      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
       assert(active_.count(level) == 0);
     } else {
+      // allow end key to go over upper bound (if present) since start key is
+      // before upper bound and the range tombstone could still cover a
+      // range before upper bound.
       pinned_heap_item_[level].SetTombstoneKey(
           range_tombstone_iters_[level]->end_key());
       pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
@@ -251,6 +268,7 @@ class MergingIterator : public InternalIterator {
   void PopDeleteRangeStart() {
     while (!minHeap_.empty() &&
            minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
       // insert end key of this range tombstone and updates active_
       InsertRangeTombstoneToMinHeap(
           minHeap_.top()->level, false /* start_key */, true /* replace_top */);
@@ -573,6 +591,10 @@ class MergingIterator : public InternalIterator {
   std::unique_ptr<MergerMaxIterHeap> maxHeap_;
   PinnedIteratorsManager* pinned_iters_mgr_;
 
+  // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
+  // take care of boundary checking.
+  const Slice* iterate_upper_bound_;
+
   // In forward direction, process a child that is not in the min heap.
   // If valid, add to the min heap. Otherwise, check status.
   void AddToMinHeapOrCheckStatus(HeapItem*);
@@ -634,9 +656,19 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
   for (size_t level = 0; level < starting_level; ++level) {
     if (range_tombstone_iters_[level] &&
         range_tombstone_iters_[level]->Valid()) {
-      assert(static_cast<bool>(active_.count(level)) ==
-             (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END));
-      minHeap_.push(&pinned_heap_item_[level]);
+      // use an iterator on active_ if performance becomes an issue here
+      if (active_.count(level) > 0) {
+        assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
+        // if it was active, then start key must be within upper_bound,
+        // so we can add to minHeap_ directly.
+        minHeap_.push(&pinned_heap_item_[level]);
+      } else {
+        // this takes care of checking iterate_upper_bound, but with an extra
+        // key comparison if range_tombstone_iters_[level] was already out of
+        // bound. Consider using a new HeapItem type or some flag to remember
+        // boundary checking result.
+        InsertRangeTombstoneToMinHeap(level);
+      }
     } else {
       assert(!active_.count(level));
     }
@@ -1280,11 +1312,12 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
 }
 
 MergeIteratorBuilder::MergeIteratorBuilder(
-    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode)
+    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
+    const Slice* iterate_upper_bound)
     : first_iter(nullptr), use_merging_iter(false), arena(a) {
   auto mem = arena->AllocateAligned(sizeof(MergingIterator));
-  merge_iter =
-      new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
+  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
+                                         prefix_seek_mode, iterate_upper_bound);
 }
 
 MergeIteratorBuilder::~MergeIteratorBuilder() {
@@ -45,7 +45,8 @@ class MergeIteratorBuilder {
   // comparator: the comparator used in merging comparator
   // arena: where the merging iterator needs to be allocated from.
   explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
-                                Arena* arena, bool prefix_seek_mode = false);
+                                Arena* arena, bool prefix_seek_mode = false,
+                                const Slice* iterate_upper_bound = nullptr);
   ~MergeIteratorBuilder();
 
   // Add iter to the merging iterator.