Cache fragmented range tombstones in BlockBasedTableReader (#4493)

Summary:
This allows tombstone fragmenting to be performed only once, when the table is opened; the fragmented tombstones are then cached for subsequent accesses.

On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom   :       0.983 micros/op 1017076 ops/sec;   78.3 MB/s (63103 of 100000 found)
```

Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
   Tombstones?    | avg micros/op | stddev micros/op |  avg ops/s   | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None              |        0.6186 |          0.04637 | 1,625,252.90 | 124,679.41
500 Expanded      |        0.6019 |          0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded    |        0.6435 |          0.03994 | 1,559,979.40 | 104,090.52
1k Expanded       |        0.6034 |          0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded     |        0.6261 |          0.03093 | 1,600,457.50 |  79,024.94
5k Expanded       |        0.6163 |          0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded     |        0.6402 |          0.04002 | 1,567,804.70 | 100,965.55
10k Expanded      |        0.6036 |          0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded    |        0.6128 |          0.02598 | 1,634,633.40 |  72,161.82
25k Expanded      |        0.6198 |          0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded    |        0.5478 |          0.0362  | 1,833,059.10 | 121,233.81
50k Expanded      |        0.5104 |          0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded    |        0.4528 |          0.03387 | 2,219,034.50 | 170,984.32
```

Once enough range tombstones have been written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493

Differential Revision: D10842844

Pulled By: abhimadan

fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
This commit is contained in:
Abhishek Madan 2018-10-25 19:25:00 -07:00 committed by Facebook Github Bot
parent fe0d23059d
commit 7528130e38
10 changed files with 240 additions and 100 deletions

View file

@ -729,13 +729,20 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// Avoiding recording stats for speed. // Avoiding recording stats for speed.
return false; return false;
} }
if (*max_covering_tombstone_seq > 0) {
*s = Status::NotFound();
return true;
}
PERF_TIMER_GUARD(get_from_memtable_time); PERF_TIMER_GUARD(get_from_memtable_time);
std::unique_ptr<InternalIterator> range_del_iter( std::unique_ptr<InternalIterator> range_del_iter(
NewRangeTombstoneIterator(read_opts)); NewRangeTombstoneIterator(read_opts));
SequenceNumber snapshot = GetInternalKeySeqno(key.internal_key()); SequenceNumber snapshot = GetInternalKeySeqno(key.internal_key());
FragmentedRangeTombstoneIterator fragment_iter( FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
std::move(range_del_iter), comparator_.comparator, snapshot); comparator_.comparator,
true /* one_time_use */, snapshot);
FragmentedRangeTombstoneIterator fragment_iter(&fragment_list,
comparator_.comparator);
*max_covering_tombstone_seq = std::max( *max_covering_tombstone_seq = std::max(
*max_covering_tombstone_seq, *max_covering_tombstone_seq,
MaxCoveringTombstoneSeqnum(&fragment_iter, key.internal_key(), MaxCoveringTombstoneSeqnum(&fragment_iter, key.internal_key(),

View file

@ -146,7 +146,7 @@ bool MemTableListVersion::GetFromList(
} }
if (done) { if (done) {
assert(*seq != kMaxSequenceNumber); assert(*seq != kMaxSequenceNumber || s->IsNotFound());
return true; return true;
} }
if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {

View file

@ -12,19 +12,17 @@
#include <inttypes.h> #include <inttypes.h>
#include <stdio.h> #include <stdio.h>
#include "util/autovector.h"
#include "util/kv_map.h" #include "util/kv_map.h"
#include "util/vector_iterator.h" #include "util/vector_iterator.h"
namespace rocksdb { namespace rocksdb {
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones, std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, SequenceNumber snapshot) const InternalKeyComparator& icmp, bool one_time_use,
: tombstone_cmp_(icmp.user_comparator()), SequenceNumber snapshot) {
icmp_(&icmp),
ucmp_(icmp.user_comparator()) {
if (unfragmented_tombstones == nullptr) { if (unfragmented_tombstones == nullptr) {
pos_ = tombstones_.end();
return; return;
} }
bool is_sorted = true; bool is_sorted = true;
@ -34,7 +32,7 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next(), num_tombstones++) { unfragmented_tombstones->Next(), num_tombstones++) {
if (num_tombstones > 0 && if (num_tombstones > 0 &&
icmp_->Compare(last_start_key, unfragmented_tombstones->key()) > 0) { icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
is_sorted = false; is_sorted = false;
break; break;
} }
@ -46,7 +44,8 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
} }
} }
if (is_sorted) { if (is_sorted) {
FragmentTombstones(std::move(unfragmented_tombstones), snapshot); FragmentTombstones(std::move(unfragmented_tombstones), icmp, one_time_use,
snapshot);
return; return;
} }
@ -63,15 +62,16 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
} }
// VectorIterator implicitly sorts by key during construction. // VectorIterator implicitly sorts by key during construction.
auto iter = std::unique_ptr<VectorIterator>( auto iter = std::unique_ptr<VectorIterator>(
new VectorIterator(std::move(keys), std::move(values), icmp_)); new VectorIterator(std::move(keys), std::move(values), &icmp));
FragmentTombstones(std::move(iter), snapshot); FragmentTombstones(std::move(iter), icmp, one_time_use, snapshot);
} }
void FragmentedRangeTombstoneIterator::FragmentTombstones( void FragmentedRangeTombstoneList::FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones, std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot) { SequenceNumber snapshot) {
Slice cur_start_key(nullptr, 0); Slice cur_start_key(nullptr, 0);
auto cmp = ParsedInternalKeyComparator(icmp_); auto cmp = ParsedInternalKeyComparator(&icmp);
// Stores the end keys and sequence numbers of range tombstones with a start // Stores the end keys and sequence numbers of range tombstones with a start
// key less than or equal to cur_start_key. Provides an ordering by end key // key less than or equal to cur_start_key. Provides an ordering by end key
@ -87,11 +87,11 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
bool reached_next_start_key = false; bool reached_next_start_key = false;
for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) { for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
Slice cur_end_key = it->user_key; Slice cur_end_key = it->user_key;
if (icmp_->user_comparator()->Compare(cur_start_key, cur_end_key) == 0) { if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
// Empty tombstone. // Empty tombstone.
continue; continue;
} }
if (icmp_->user_comparator()->Compare(next_start_key, cur_end_key) <= 0) { if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
// All of the end keys in [it, cur_end_keys.end()) are after // All of the end keys in [it, cur_end_keys.end()) are after
// next_start_key, so the tombstones they represent can be used in // next_start_key, so the tombstones they represent can be used in
// fragments that start with keys greater than or equal to // fragments that start with keys greater than or equal to
@ -109,17 +109,32 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
// Flush a range tombstone fragment [cur_start_key, cur_end_key), which // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
// should not overlap with the last-flushed tombstone fragment. // should not overlap with the last-flushed tombstone fragment.
assert(tombstones_.empty() || assert(tombstones_.empty() ||
icmp_->user_comparator()->Compare(tombstones_.back().end_key_, icmp.user_comparator()->Compare(tombstones_.back().end_key_,
cur_start_key) <= 0); cur_start_key) <= 0);
if (one_time_use) {
SequenceNumber max_seqnum = 0; SequenceNumber max_seqnum = 0;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) { for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
max_seqnum = std::max(max_seqnum, flush_it->sequence); max_seqnum = std::max(max_seqnum, flush_it->sequence);
} }
// Flush only the tombstone fragment with the highest sequence
// number. // Flush only the tombstone fragment with the highest sequence number.
tombstones_.push_back( tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, max_seqnum)); RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
} else {
// Sort the sequence numbers of the tombstones being fragmented in
// descending order, and then flush them in that order.
autovector<SequenceNumber> seqnums_to_flush;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
seqnums_to_flush.push_back(flush_it->sequence);
}
std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
std::greater<SequenceNumber>());
for (const auto seq : seqnums_to_flush) {
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, seq));
}
}
cur_start_key = cur_end_key; cur_start_key = cur_end_key;
} }
if (!reached_next_start_key) { if (!reached_next_start_key) {
@ -140,7 +155,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
const Slice& ikey = unfragmented_tombstones->key(); const Slice& ikey = unfragmented_tombstones->key();
Slice tombstone_start_key = ExtractUserKey(ikey); Slice tombstone_start_key = ExtractUserKey(ikey);
SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey); SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
if (tombstone_seq > snapshot) { if (one_time_use && tombstone_seq > snapshot) {
// The tombstone is not visible by this snapshot. // The tombstone is not visible by this snapshot.
continue; continue;
} }
@ -152,7 +167,7 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
tombstone_end_key.size()); tombstone_end_key.size());
tombstone_end_key = pinned_slices_.back(); tombstone_end_key = pinned_slices_.back();
} }
if (!cur_end_keys.empty() && icmp_->user_comparator()->Compare( if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
cur_start_key, tombstone_start_key) != 0) { cur_start_key, tombstone_start_key) != 0) {
// The start key has changed. Flush all tombstones that start before // The start key has changed. Flush all tombstones that start before
// this new start key. // this new start key.
@ -177,29 +192,50 @@ void FragmentedRangeTombstoneIterator::FragmentTombstones(
pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(), pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
false /* arena */); false /* arena */);
} }
}
// With this, the caller must Seek before the iterator is valid. FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
pos_ = tombstones_.end(); const FragmentedRangeTombstoneList* tombstones,
pinned_pos_ = tombstones_.end(); const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_(tombstones) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_ref_(tombstones),
tombstones_(tombstones_ref_.get()) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
} }
void FragmentedRangeTombstoneIterator::SeekToFirst() { void FragmentedRangeTombstoneIterator::SeekToFirst() {
pos_ = tombstones_.begin(); pos_ = tombstones_->begin();
} }
void FragmentedRangeTombstoneIterator::SeekToLast() { void FragmentedRangeTombstoneIterator::SeekToLast() {
pos_ = tombstones_.end(); pos_ = tombstones_->end();
Prev(); Prev();
} }
void FragmentedRangeTombstoneIterator::Seek(const Slice& target) { void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
if (tombstones_.empty()) { if (tombstones_->empty()) {
pos_ = tombstones_.end(); pos_ = tombstones_->end();
return; return;
} }
RangeTombstone search(ExtractUserKey(target), ExtractUserKey(target), RangeTombstone search(ExtractUserKey(target), ExtractUserKey(target),
GetInternalKeySeqno(target)); GetInternalKeySeqno(target));
pos_ = std::lower_bound(tombstones_.begin(), tombstones_.end(), search, pos_ = std::lower_bound(tombstones_->begin(), tombstones_->end(), search,
tombstone_cmp_); tombstone_cmp_);
} }
@ -223,20 +259,24 @@ void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
void FragmentedRangeTombstoneIterator::Next() { ++pos_; } void FragmentedRangeTombstoneIterator::Next() { ++pos_; }
void FragmentedRangeTombstoneIterator::Prev() { void FragmentedRangeTombstoneIterator::Prev() {
if (pos_ == tombstones_.begin()) { if (pos_ == tombstones_->begin()) {
pos_ = tombstones_.end(); pos_ = tombstones_->end();
return; return;
} }
--pos_; --pos_;
} }
bool FragmentedRangeTombstoneIterator::Valid() const { bool FragmentedRangeTombstoneIterator::Valid() const {
return pos_ != tombstones_.end(); return tombstones_ != nullptr && pos_ != tombstones_->end();
} }
SequenceNumber MaxCoveringTombstoneSeqnum( SequenceNumber MaxCoveringTombstoneSeqnum(
FragmentedRangeTombstoneIterator* tombstone_iter, const Slice& lookup_key, FragmentedRangeTombstoneIterator* tombstone_iter, const Slice& lookup_key,
const Comparator* ucmp) { const Comparator* ucmp) {
if (tombstone_iter == nullptr) {
return 0;
}
SequenceNumber snapshot = GetInternalKeySeqno(lookup_key); SequenceNumber snapshot = GetInternalKeySeqno(lookup_key);
Slice user_key = ExtractUserKey(lookup_key); Slice user_key = ExtractUserKey(lookup_key);

View file

@ -17,6 +17,37 @@
namespace rocksdb { namespace rocksdb {
struct FragmentedRangeTombstoneList {
public:
FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot = kMaxSequenceNumber);
std::vector<RangeTombstone>::const_iterator begin() const {
return tombstones_.begin();
}
std::vector<RangeTombstone>::const_iterator end() const {
return tombstones_.end();
}
bool empty() const { return tombstones_.size() == 0; }
private:
// Given an ordered range tombstone iterator unfragmented_tombstones,
// "fragment" the tombstones into non-overlapping pieces, and store them in
// tombstones_.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot = kMaxSequenceNumber);
std::vector<RangeTombstone> tombstones_;
std::list<std::string> pinned_slices_;
PinnedIteratorsManager pinned_iters_mgr_;
};
// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
// meta block into an iterator over non-overlapping tombstone fragments. The // meta block into an iterator over non-overlapping tombstone fragments. The
// tombstone fragmentation process should be more efficient than the range // tombstone fragmentation process should be more efficient than the range
@ -29,8 +60,11 @@ namespace rocksdb {
class FragmentedRangeTombstoneIterator : public InternalIterator { class FragmentedRangeTombstoneIterator : public InternalIterator {
public: public:
FragmentedRangeTombstoneIterator( FragmentedRangeTombstoneIterator(
std::unique_ptr<InternalIterator> unfragmented_tombstones, const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp, SequenceNumber snapshot); const InternalKeyComparator& icmp);
FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp);
void SeekToFirst() override; void SeekToFirst() override;
void SeekToLast() override; void SeekToLast() override;
void Seek(const Slice& target) override; void Seek(const Slice& target) override;
@ -66,7 +100,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
}; };
void MaybePinKey() const { void MaybePinKey() const {
if (pos_ != tombstones_.end() && pinned_pos_ != pos_) { if (pos_ != tombstones_->end() && pinned_pos_ != pos_) {
current_start_key_.Set(pos_->start_key_, pos_->seq_, kTypeRangeDeletion); current_start_key_.Set(pos_->start_key_, pos_->seq_, kTypeRangeDeletion);
pinned_pos_ = pos_; pinned_pos_ = pos_;
} }
@ -78,18 +112,11 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
parsed->type = kTypeRangeDeletion; parsed->type = kTypeRangeDeletion;
} }
// Given an ordered range tombstone iterator unfragmented_tombstones,
// "fragment" the tombstones into non-overlapping pieces, and store them in
// tombstones_.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
SequenceNumber snapshot);
const FragmentedRangeTombstoneComparator tombstone_cmp_; const FragmentedRangeTombstoneComparator tombstone_cmp_;
const InternalKeyComparator* icmp_; const InternalKeyComparator* icmp_;
const Comparator* ucmp_; const Comparator* ucmp_;
std::vector<RangeTombstone> tombstones_; std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
std::list<std::string> pinned_slices_; const FragmentedRangeTombstoneList* tombstones_;
std::vector<RangeTombstone>::const_iterator pos_; std::vector<RangeTombstone>::const_iterator pos_;
mutable std::vector<RangeTombstone>::const_iterator pinned_pos_; mutable std::vector<RangeTombstone>::const_iterator pinned_pos_;
mutable InternalKey current_start_key_; mutable InternalKey current_start_key_;

View file

@ -87,8 +87,9 @@ void VerifyMaxCoveringTombstoneSeqnum(
TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) { TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}}); auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}}); VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
{{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}}); {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
@ -97,8 +98,9 @@ TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) { TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}}); auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 15}, {"e", "g", 15}}); {{"a", "c", 10}, {"c", "e", 15}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@ -109,8 +111,9 @@ TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
auto range_del_iter = MakeRangeDelIter( auto range_del_iter = MakeRangeDelIter(
{{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 20}, {"e", "g", 15}}); {{"a", "c", 10}, {"c", "e", 20}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@ -121,8 +124,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
auto range_del_iter = auto range_del_iter =
MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}}); VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
{{"a", 10}, {"b", 10}, {"c", 0}}); {{"a", 10}, {"b", 10}, {"c", 0}});
@ -132,8 +136,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
auto range_del_iter = auto range_del_iter =
MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}}); MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"c", "e", 10}, {"e", "g", 7}}); {{"a", "c", 10}, {"c", "e", 10}, {"e", "g", 7}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@ -147,8 +152,9 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
{"a", "g", 7}, {"a", "g", 7},
{"a", "c", 3}}); {"a", "c", 3}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, VerifyFragmentedRangeDels(&iter,
{{"a", "c", 30}, {"c", "e", 20}, {"e", "g", 20}}); {{"a", "c", 30}, {"c", "e", 20}, {"e", "g", 20}});
VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(), VerifyMaxCoveringTombstoneSeqnum(&iter, bytewise_icmp.user_comparator(),
@ -162,8 +168,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10}, {"c", "e", 10},
{"e", "g", 8}, {"e", "g", 8},
@ -182,8 +189,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyWithSnapshot) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, 9); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels( VerifyFragmentedRangeDels(
&iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}}); &iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum( VerifyMaxCoveringTombstoneSeqnum(
@ -198,8 +206,9 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
{"c", "g", 8}, {"c", "g", 8},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, 9); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */, 9);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels( VerifyFragmentedRangeDels(
&iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}}); &iter, {{"c", "g", 8}, {"g", "i", 6}, {"j", "l", 4}, {"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum( VerifyMaxCoveringTombstoneSeqnum(
@ -207,6 +216,31 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
{{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
} }
TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyMultiUse) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"c", "g", 8},
{"c", "i", 6},
{"j", "n", 4},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, false /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"c", "e", 8},
{"c", "e", 6},
{"e", "g", 8},
{"e", "g", 6},
{"g", "i", 6},
{"j", "l", 4},
{"j", "l", 2},
{"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum(
&iter, bytewise_icmp.user_comparator(),
{{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) { TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
// Same tombstones as OverlapAndRepeatedStartKey. // Same tombstones as OverlapAndRepeatedStartKey.
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
@ -215,8 +249,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevStartKey) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev( VerifySeekForPrev(
&iter, &iter,
{{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
@ -230,8 +265,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevCovered) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev( VerifySeekForPrev(
&iter, &iter,
{{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
@ -245,8 +281,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevEndKey) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(&iter, {{"c", {"c", "e", 10}}, VerifySeekForPrev(&iter, {{"c", {"c", "e", 10}},
{"g", {"g", "i", 6}}, {"g", {"g", "i", 6}},
{"i", {"g", "i", 6}}, {"i", {"g", "i", 6}},
@ -261,8 +298,9 @@ TEST_F(RangeTombstoneFragmenterTest, SeekForPrevOutOfBounds) {
{"j", "n", 4}, {"j", "n", 4},
{"j", "l", 2}}); {"j", "l", 2}});
FragmentedRangeTombstoneIterator iter(std::move(range_del_iter), FragmentedRangeTombstoneList fragment_list(
bytewise_icmp, kMaxSequenceNumber); std::move(range_del_iter), bytewise_icmp, true /* one_time_use */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp);
VerifySeekForPrev(&iter, VerifySeekForPrev(&iter,
{{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}}); {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
} }

View file

@ -379,13 +379,12 @@ Status TableCache::Get(const ReadOptions& options,
!options.ignore_range_deletions) { !options.ignore_range_deletions) {
std::unique_ptr<InternalIterator> range_del_iter( std::unique_ptr<InternalIterator> range_del_iter(
t->NewRangeTombstoneIterator(options)); t->NewRangeTombstoneIterator(options));
FragmentedRangeTombstoneIterator fragment_iter(std::move(range_del_iter), *max_covering_tombstone_seq =
internal_comparator, std::max(*max_covering_tombstone_seq,
GetInternalKeySeqno(k)); MaxCoveringTombstoneSeqnum(
*max_covering_tombstone_seq = std::max( static_cast<FragmentedRangeTombstoneIterator*>(
*max_covering_tombstone_seq, range_del_iter.get()),
MaxCoveringTombstoneSeqnum(&fragment_iter, k, k, internal_comparator.user_comparator()));
internal_comparator.user_comparator()));
} }
if (s.ok()) { if (s.ok()) {
get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.

View file

@ -1209,6 +1209,11 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
FdWithKeyRange* f = fp.GetNextFile(); FdWithKeyRange* f = fp.GetNextFile();
while (f != nullptr) { while (f != nullptr) {
if (*max_covering_tombstone_seq > 0) {
// Use empty error message for speed
*status = Status::NotFound();
return;
}
if (get_context.sample()) { if (get_context.sample()) {
sample_file_read_inc(f->file_metadata); sample_file_read_inc(f->file_metadata);
} }

View file

@ -972,8 +972,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
rep->ioptions.info_log, rep->ioptions.info_log,
"Error when seeking to range delete tombstones block from file: %s", "Error when seeking to range delete tombstones block from file: %s",
s.ToString().c_str()); s.ToString().c_str());
} else { } else if (found_range_del_block && !rep->range_del_handle.IsNull()) {
if (found_range_del_block && !rep->range_del_handle.IsNull()) {
ReadOptions read_options; ReadOptions read_options;
s = MaybeLoadDataBlockToCache( s = MaybeLoadDataBlockToCache(
prefetch_buffer.get(), rep, read_options, rep->range_del_handle, prefetch_buffer.get(), rep, read_options, rep->range_del_handle,
@ -985,7 +984,10 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
"Encountered error while reading data from range del block %s", "Encountered error while reading data from range del block %s",
s.ToString().c_str()); s.ToString().c_str());
} }
} auto iter = std::unique_ptr<InternalIterator>(
new_table->NewUnfragmentedRangeTombstoneIterator(read_options));
rep->fragmented_range_dels = std::make_shared<FragmentedRangeTombstoneList>(
std::move(iter), internal_comparator, false /* one_time_use */);
} }
bool need_upper_bound_check = bool need_upper_bound_check =
@ -2263,6 +2265,15 @@ InternalIterator* BlockBasedTable::NewIterator(
} }
InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
const ReadOptions& /* read_options */) {
if (rep_->fragmented_range_dels == nullptr) {
return nullptr;
}
return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels,
rep_->internal_comparator);
}
InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator(
const ReadOptions& read_options) { const ReadOptions& read_options) {
if (rep_->range_del_handle.IsNull()) { if (rep_->range_del_handle.IsNull()) {
// The block didn't exist, nullptr indicates no range tombstones. // The block didn't exist, nullptr indicates no range tombstones.

View file

@ -16,6 +16,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "db/range_tombstone_fragmenter.h"
#include "options/cf_options.h" #include "options/cf_options.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/persistent_cache.h" #include "rocksdb/persistent_cache.h"
@ -384,6 +385,9 @@ class BlockBasedTable : public TableReader {
friend class PartitionedFilterBlockReader; friend class PartitionedFilterBlockReader;
friend class PartitionedFilterBlockTest; friend class PartitionedFilterBlockTest;
InternalIterator* NewUnfragmentedRangeTombstoneIterator(
const ReadOptions& read_options);
}; };
// Maitaning state of a two-level iteration on a partitioned index structure // Maitaning state of a two-level iteration on a partitioned index structure
@ -511,6 +515,7 @@ struct BlockBasedTable::Rep {
// cache is enabled. // cache is enabled.
CachableEntry<Block> range_del_entry; CachableEntry<Block> range_del_entry;
BlockHandle range_del_handle; BlockHandle range_del_handle;
std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
// If global_seqno is used, all Keys in this file will have the same // If global_seqno is used, all Keys in this file will have the same
// seqno with value `global_seqno`. // seqno with value `global_seqno`.

View file

@ -1278,6 +1278,13 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
std::vector<std::string> keys = {"1pika", "2chu"}; std::vector<std::string> keys = {"1pika", "2chu"};
std::vector<std::string> vals = {"p", "c"}; std::vector<std::string> vals = {"p", "c"};
std::vector<RangeTombstone> expected_tombstones = {
{"1pika", "2chu", 0},
{"2chu", "c", 1},
{"2chu", "c", 0},
{"c", "p", 0},
};
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
RangeTombstone t(keys[i], vals[i], i); RangeTombstone t(keys[i], vals[i], i);
std::pair<InternalKey, Slice> p = t.Serialize(); std::pair<InternalKey, Slice> p = t.Serialize();
@ -1310,14 +1317,15 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) {
ASSERT_FALSE(iter->Valid()); ASSERT_FALSE(iter->Valid());
iter->SeekToFirst(); iter->SeekToFirst();
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
for (int i = 0; i < 2; i++) { for (size_t i = 0; i < expected_tombstones.size(); i++) {
ASSERT_TRUE(iter->Valid()); ASSERT_TRUE(iter->Valid());
ParsedInternalKey parsed_key; ParsedInternalKey parsed_key;
ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
RangeTombstone t(parsed_key, iter->value()); RangeTombstone t(parsed_key, iter->value());
ASSERT_EQ(t.start_key_, keys[i]); const auto& expected_t = expected_tombstones[i];
ASSERT_EQ(t.end_key_, vals[i]); ASSERT_EQ(t.start_key_, expected_t.start_key_);
ASSERT_EQ(t.seq_, i); ASSERT_EQ(t.end_key_, expected_t.end_key_);
ASSERT_EQ(t.seq_, expected_t.seq_);
iter->Next(); iter->Next();
} }
ASSERT_TRUE(!iter->Valid()); ASSERT_TRUE(!iter->Valid());