rocksdb/db/range_tombstone_fragmenter.cc

302 lines
11 KiB
C++

// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/range_tombstone_fragmenter.h"
#include <algorithm>
#include <functional>
#include <set>
#include <inttypes.h>
#include <stdio.h>
#include "util/autovector.h"
#include "util/kv_map.h"
#include "util/vector_iterator.h"
namespace rocksdb {
FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot) {
if (unfragmented_tombstones == nullptr) {
return;
}
bool is_sorted = true;
int num_tombstones = 0;
InternalKey pinned_last_start_key;
Slice last_start_key;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next(), num_tombstones++) {
if (num_tombstones > 0 &&
icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
is_sorted = false;
break;
}
if (unfragmented_tombstones->IsKeyPinned()) {
last_start_key = unfragmented_tombstones->key();
} else {
pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
last_start_key = pinned_last_start_key.Encode();
}
}
if (is_sorted) {
FragmentTombstones(std::move(unfragmented_tombstones), icmp, one_time_use,
snapshot);
return;
}
// Sort the tombstones before fragmenting them.
std::vector<std::string> keys, values;
keys.reserve(num_tombstones);
values.reserve(num_tombstones);
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next()) {
keys.emplace_back(unfragmented_tombstones->key().data(),
unfragmented_tombstones->key().size());
values.emplace_back(unfragmented_tombstones->value().data(),
unfragmented_tombstones->value().size());
}
// VectorIterator implicitly sorts by key during construction.
auto iter = std::unique_ptr<VectorIterator>(
new VectorIterator(std::move(keys), std::move(values), &icmp));
FragmentTombstones(std::move(iter), icmp, one_time_use, snapshot);
}
void FragmentedRangeTombstoneList::FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool one_time_use,
SequenceNumber snapshot) {
Slice cur_start_key(nullptr, 0);
auto cmp = ParsedInternalKeyComparator(&icmp);
// Stores the end keys and sequence numbers of range tombstones with a start
// key less than or equal to cur_start_key. Provides an ordering by end key
// for use in flush_current_tombstones.
std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
// Given the next start key in unfragmented_tombstones,
// flush_current_tombstones writes every tombstone fragment that starts
// and ends with a key before next_start_key, and starts with a key greater
// than or equal to cur_start_key.
auto flush_current_tombstones = [&](const Slice& next_start_key) {
auto it = cur_end_keys.begin();
bool reached_next_start_key = false;
for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
Slice cur_end_key = it->user_key;
if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
// Empty tombstone.
continue;
}
if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
// All of the end keys in [it, cur_end_keys.end()) are after
// next_start_key, so the tombstones they represent can be used in
// fragments that start with keys greater than or equal to
// next_start_key. However, the end keys we already passed will not be
// used in any more tombstone fragments.
//
// Remove the fully fragmented tombstones and stop iteration after a
// final round of flushing to preserve the tombstones we can create more
// fragments from.
reached_next_start_key = true;
cur_end_keys.erase(cur_end_keys.begin(), it);
cur_end_key = next_start_key;
}
// Flush a range tombstone fragment [cur_start_key, cur_end_key), which
// should not overlap with the last-flushed tombstone fragment.
assert(tombstones_.empty() ||
icmp.user_comparator()->Compare(tombstones_.back().end_key_,
cur_start_key) <= 0);
if (one_time_use) {
SequenceNumber max_seqnum = 0;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
max_seqnum = std::max(max_seqnum, flush_it->sequence);
}
// Flush only the tombstone fragment with the highest sequence number.
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, max_seqnum));
} else {
// Sort the sequence numbers of the tombstones being fragmented in
// descending order, and then flush them in that order.
autovector<SequenceNumber> seqnums_to_flush;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
seqnums_to_flush.push_back(flush_it->sequence);
}
std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
std::greater<SequenceNumber>());
for (const auto seq : seqnums_to_flush) {
tombstones_.push_back(
RangeTombstone(cur_start_key, cur_end_key, seq));
}
}
cur_start_key = cur_end_key;
}
if (!reached_next_start_key) {
// There is a gap between the last flushed tombstone fragment and
// the next tombstone's start key. Remove all the end keys in
// the working set, since we have fully fragmented their corresponding
// tombstones.
cur_end_keys.clear();
}
cur_start_key = next_start_key;
};
pinned_iters_mgr_.StartPinning();
bool no_tombstones = true;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next()) {
const Slice& ikey = unfragmented_tombstones->key();
Slice tombstone_start_key = ExtractUserKey(ikey);
SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
if (one_time_use && tombstone_seq > snapshot) {
// The tombstone is not visible by this snapshot.
continue;
}
no_tombstones = false;
Slice tombstone_end_key = unfragmented_tombstones->value();
if (!unfragmented_tombstones->IsValuePinned()) {
pinned_slices_.emplace_back(tombstone_end_key.data(),
tombstone_end_key.size());
tombstone_end_key = pinned_slices_.back();
}
if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
cur_start_key, tombstone_start_key) != 0) {
// The start key has changed. Flush all tombstones that start before
// this new start key.
flush_current_tombstones(tombstone_start_key);
}
if (unfragmented_tombstones->IsKeyPinned()) {
cur_start_key = tombstone_start_key;
} else {
pinned_slices_.emplace_back(tombstone_start_key.data(),
tombstone_start_key.size());
cur_start_key = pinned_slices_.back();
}
cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
}
if (!cur_end_keys.empty()) {
ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
flush_current_tombstones(last_end_key.user_key);
}
if (!no_tombstones) {
pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
false /* arena */);
}
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_(tombstones) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp)
: tombstone_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_ref_(tombstones),
tombstones_(tombstones_ref_.get()) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
}
void FragmentedRangeTombstoneIterator::SeekToFirst() {
pos_ = tombstones_->begin();
}
void FragmentedRangeTombstoneIterator::SeekToLast() {
pos_ = tombstones_->end();
Prev();
}
void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
if (tombstones_->empty()) {
pos_ = tombstones_->end();
return;
}
RangeTombstone search(ExtractUserKey(target), ExtractUserKey(target),
GetInternalKeySeqno(target));
pos_ = std::lower_bound(tombstones_->begin(), tombstones_->end(), search,
tombstone_cmp_);
}
void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
Seek(target);
if (!Valid()) {
SeekToLast();
}
ParsedInternalKey parsed_target;
if (!ParseInternalKey(target, &parsed_target)) {
assert(false);
}
ParsedInternalKey parsed_start_key;
ParseKey(&parsed_start_key);
while (Valid() && icmp_->Compare(parsed_target, parsed_start_key) < 0) {
Prev();
ParseKey(&parsed_start_key);
}
}
void FragmentedRangeTombstoneIterator::Next() { ++pos_; }
void FragmentedRangeTombstoneIterator::Prev() {
if (pos_ == tombstones_->begin()) {
pos_ = tombstones_->end();
return;
}
--pos_;
}
bool FragmentedRangeTombstoneIterator::Valid() const {
return tombstones_ != nullptr && pos_ != tombstones_->end();
}
SequenceNumber MaxCoveringTombstoneSeqnum(
FragmentedRangeTombstoneIterator* tombstone_iter, const Slice& lookup_key,
const Comparator* ucmp) {
if (tombstone_iter == nullptr) {
return 0;
}
SequenceNumber snapshot = GetInternalKeySeqno(lookup_key);
Slice user_key = ExtractUserKey(lookup_key);
tombstone_iter->Seek(lookup_key);
SequenceNumber highest_covering_seqnum = 0;
if (!tombstone_iter->Valid()) {
// Seeked past the last tombstone
tombstone_iter->Prev();
}
while (tombstone_iter->Valid() &&
ucmp->Compare(user_key, tombstone_iter->value()) < 0) {
if (tombstone_iter->seq() <= snapshot &&
ucmp->Compare(tombstone_iter->user_key(), user_key) <= 0) {
highest_covering_seqnum =
std::max(highest_covering_seqnum, tombstone_iter->seq());
}
tombstone_iter->Prev();
}
return highest_covering_seqnum;
}
} // namespace rocksdb