rocksdb/db/range_tombstone_fragmenter.cc
Yu Zhang 071a146fa0 Add support for range deletion when user timestamps are not persisted (#12254)
Summary:
For the user defined timestamps in memtable only feature, some special handling for range deletion blocks are needed since both the key (start_key) and the value (end_key) of a range tombstone can contain user-defined timestamps. Handling for the key is taken care of in the same way as the other data blocks in the block based table. This PR adds the special handling needed for the value (end_key) part. This includes:

1) On the write path, when L0 SST files are first created from flush, user-defined timestamps are removed from an end key of a range tombstone. There are places where it's logically removed (replaced with a min timestamp) because there is still logic with the running comparator that expects a user key that contains timestamp. And in the block based builder, it is eventually physically removed before persisted in a block.

2) On the read path, when range deletion block is being read, we artificially pad a min timestamp to the end key of a range tombstone in `BlockBasedTableReader`.

3) For file boundary `FileMetaData.largest`, we artificially pad a max timestamp to it if it contains a range deletion sentinel. Anytime when range deletion end_key is used to update file boundaries, it's using max timestamp instead of the range tombstone's actual timestamp to mark it as an exclusive end. d69628e6ce/db/dbformat.h (L923-L935)
This max timestamp is removed when in memory `FileMetaData.largest` is persisted into Manifest, we pad it back when it's read from Manifest while handling related `VersionEdit` in `VersionEditHandler`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12254

Test Plan: Added unit test and enabled this feature combination's stress test.

Reviewed By: cbi42

Differential Revision: D52965527

Pulled By: jowlyzhang

fbshipit-source-id: e8315f8a2c5268e2ae0f7aec8012c266b86df985
2024-01-29 11:37:34 -08:00

516 lines
18 KiB
C++

// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/range_tombstone_fragmenter.h"
#include <algorithm>
#include <cinttypes>
#include <cstdio>
#include <functional>
#include <set>
#include "util/autovector.h"
#include "util/kv_map.h"
#include "util/vector_iterator.h"
namespace ROCKSDB_NAMESPACE {
FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool for_compaction,
const std::vector<SequenceNumber>& snapshots,
const bool tombstone_end_include_ts) {
if (unfragmented_tombstones == nullptr) {
return;
}
bool is_sorted = true;
InternalKey pinned_last_start_key;
Slice last_start_key;
num_unfragmented_tombstones_ = 0;
total_tombstone_payload_bytes_ = 0;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
unfragmented_tombstones->value().size();
if (num_unfragmented_tombstones_ > 0 &&
icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
is_sorted = false;
break;
}
if (unfragmented_tombstones->IsKeyPinned()) {
last_start_key = unfragmented_tombstones->key();
} else {
pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
last_start_key = pinned_last_start_key.Encode();
}
}
auto ucmp = icmp.user_comparator();
assert(ucmp);
const size_t ts_sz = ucmp->timestamp_size();
bool pad_min_ts_for_end = ts_sz > 0 && !tombstone_end_include_ts;
if (is_sorted && !pad_min_ts_for_end) {
FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
snapshots);
return;
}
// Sort the tombstones before fragmenting them.
std::vector<std::string> keys, values;
keys.reserve(num_unfragmented_tombstones_);
values.reserve(num_unfragmented_tombstones_);
// Reset the counter to zero for the next iteration over keys.
total_tombstone_payload_bytes_ = 0;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next()) {
total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
unfragmented_tombstones->value().size();
keys.emplace_back(unfragmented_tombstones->key().data(),
unfragmented_tombstones->key().size());
Slice value = unfragmented_tombstones->value();
if (pad_min_ts_for_end) {
AppendKeyWithMinTimestamp(&values.emplace_back(), value, ts_sz);
} else {
values.emplace_back(value.data(), value.size());
}
}
if (pad_min_ts_for_end) {
total_tombstone_payload_bytes_ += num_unfragmented_tombstones_ * ts_sz;
}
// VectorIterator implicitly sorts by key during construction.
auto iter = std::make_unique<VectorIterator>(std::move(keys),
std::move(values), &icmp);
FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
}
void FragmentedRangeTombstoneList::FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool for_compaction,
const std::vector<SequenceNumber>& snapshots) {
Slice cur_start_key(nullptr, 0);
auto cmp = ParsedInternalKeyComparator(&icmp);
// Stores the end keys and sequence numbers of range tombstones with a start
// key less than or equal to cur_start_key. Provides an ordering by end key
// for use in flush_current_tombstones.
std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
size_t ts_sz = icmp.user_comparator()->timestamp_size();
// Given the next start key in unfragmented_tombstones,
// flush_current_tombstones writes every tombstone fragment that starts
// and ends with a key before next_start_key, and starts with a key greater
// than or equal to cur_start_key.
auto flush_current_tombstones = [&](const Slice& next_start_key) {
auto it = cur_end_keys.begin();
bool reached_next_start_key = false;
for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
Slice cur_end_key = it->user_key;
if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key,
cur_end_key) == 0) {
// Empty tombstone.
continue;
}
if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key,
cur_end_key) <= 0) {
// All the end keys in [it, cur_end_keys.end()) are after
// next_start_key, so the tombstones they represent can be used in
// fragments that start with keys greater than or equal to
// next_start_key. However, the end keys we already passed will not be
// used in any more tombstone fragments.
//
// Remove the fully fragmented tombstones and stop iteration after a
// final round of flushing to preserve the tombstones we can create more
// fragments from.
reached_next_start_key = true;
cur_end_keys.erase(cur_end_keys.begin(), it);
cur_end_key = next_start_key;
}
// Flush a range tombstone fragment [cur_start_key, cur_end_key), which
// should not overlap with the last-flushed tombstone fragment.
assert(tombstones_.empty() ||
icmp.user_comparator()->CompareWithoutTimestamp(
tombstones_.back().end_key, cur_start_key) <= 0);
// Sort the sequence numbers of the tombstones being fragmented in
// descending order, and then flush them in that order.
autovector<SequenceNumber> seqnums_to_flush;
autovector<Slice> timestamps_to_flush;
for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
seqnums_to_flush.push_back(flush_it->sequence);
if (ts_sz) {
timestamps_to_flush.push_back(
ExtractTimestampFromUserKey(flush_it->user_key, ts_sz));
}
}
// TODO: bind the two sorting together to be more efficient
std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
std::greater<SequenceNumber>());
if (ts_sz) {
std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(),
[icmp](const Slice& ts1, const Slice& ts2) {
return icmp.user_comparator()->CompareTimestamp(ts1, ts2) >
0;
});
}
size_t start_idx = tombstone_seqs_.size();
size_t end_idx = start_idx + seqnums_to_flush.size();
// If user-defined timestamp is enabled, we should not drop tombstones
// from any snapshot stripe. Garbage collection of range tombstones
// happens in CompactionOutputs::AddRangeDels().
if (for_compaction && ts_sz == 0) {
// Drop all tombstone seqnums that are not preserved by a snapshot.
SequenceNumber next_snapshot = kMaxSequenceNumber;
for (auto seq : seqnums_to_flush) {
if (seq <= next_snapshot) {
// This seqnum is visible by a lower snapshot.
tombstone_seqs_.push_back(seq);
auto upper_bound_it =
std::lower_bound(snapshots.begin(), snapshots.end(), seq);
if (upper_bound_it == snapshots.begin()) {
// This seqnum is the topmost one visible by the earliest
// snapshot. None of the seqnums below it will be visible, so we
// can skip them.
break;
}
next_snapshot = *std::prev(upper_bound_it);
}
}
end_idx = tombstone_seqs_.size();
} else {
// The fragmentation is being done for reads, so preserve all seqnums.
tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
seqnums_to_flush.end());
if (ts_sz) {
tombstone_timestamps_.insert(tombstone_timestamps_.end(),
timestamps_to_flush.begin(),
timestamps_to_flush.end());
}
}
assert(start_idx < end_idx);
if (ts_sz) {
std::string start_key_with_max_ts;
AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key,
ts_sz);
pinned_slices_.emplace_back(std::move(start_key_with_max_ts));
Slice start_key = pinned_slices_.back();
std::string end_key_with_max_ts;
AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz);
pinned_slices_.emplace_back(std::move(end_key_with_max_ts));
Slice end_key = pinned_slices_.back();
// RangeTombstoneStack expects start_key and end_key to have max
// timestamp.
tombstones_.emplace_back(start_key, end_key, start_idx, end_idx);
} else {
tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx,
end_idx);
}
cur_start_key = cur_end_key;
}
if (!reached_next_start_key) {
// There is a gap between the last flushed tombstone fragment and
// the next tombstone's start key. Remove all the end keys in
// the working set, since we have fully fragmented their corresponding
// tombstones.
cur_end_keys.clear();
}
cur_start_key = next_start_key;
};
pinned_iters_mgr_.StartPinning();
bool no_tombstones = true;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next()) {
const Slice& ikey = unfragmented_tombstones->key();
Slice tombstone_start_key = ExtractUserKey(ikey);
SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
if (!unfragmented_tombstones->IsKeyPinned()) {
pinned_slices_.emplace_back(tombstone_start_key.data(),
tombstone_start_key.size());
tombstone_start_key = pinned_slices_.back();
}
no_tombstones = false;
Slice tombstone_end_key = unfragmented_tombstones->value();
if (!unfragmented_tombstones->IsValuePinned()) {
pinned_slices_.emplace_back(tombstone_end_key.data(),
tombstone_end_key.size());
tombstone_end_key = pinned_slices_.back();
}
if (!cur_end_keys.empty() &&
icmp.user_comparator()->CompareWithoutTimestamp(
cur_start_key, tombstone_start_key) != 0) {
// The start key has changed. Flush all tombstones that start before
// this new start key.
flush_current_tombstones(tombstone_start_key);
}
cur_start_key = tombstone_start_key;
cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
}
if (!cur_end_keys.empty()) {
ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
flush_current_tombstones(last_end_key.user_key);
}
if (!no_tombstones) {
pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
false /* arena */);
}
}
bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
SequenceNumber upper) {
std::call_once(seq_set_init_once_flag_, [this]() {
for (auto s : tombstone_seqs_) {
seq_set_.insert(s);
}
});
auto seq_it = seq_set_.lower_bound(lower);
return seq_it != seq_set_.end() && *seq_it <= upper;
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp,
SequenceNumber _upper_bound, const Slice* ts_upper_bound,
SequenceNumber _lower_bound)
: tombstone_start_cmp_(icmp.user_comparator()),
tombstone_end_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_(tombstones),
upper_bound_(_upper_bound),
lower_bound_(_lower_bound),
ts_upper_bound_(ts_upper_bound) {
assert(tombstones_ != nullptr);
Invalidate();
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
const Slice* ts_upper_bound, SequenceNumber _lower_bound)
: tombstone_start_cmp_(icmp.user_comparator()),
tombstone_end_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_ref_(tombstones),
tombstones_(tombstones_ref_.get()),
upper_bound_(_upper_bound),
lower_bound_(_lower_bound),
ts_upper_bound_(ts_upper_bound) {
assert(tombstones_ != nullptr);
Invalidate();
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones_cache,
const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
const Slice* ts_upper_bound, SequenceNumber _lower_bound)
: tombstone_start_cmp_(icmp.user_comparator()),
tombstone_end_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_cache_ref_(tombstones_cache),
tombstones_(tombstones_cache_ref_->tombstones.get()),
upper_bound_(_upper_bound),
lower_bound_(_lower_bound) {
assert(tombstones_ != nullptr);
if (!ts_upper_bound || ts_upper_bound->empty()) {
ts_upper_bound_ = nullptr;
} else {
ts_upper_bound_ = ts_upper_bound;
}
Invalidate();
}
void FragmentedRangeTombstoneIterator::SeekToFirst() {
pos_ = tombstones_->begin();
seq_pos_ = tombstones_->seq_begin();
}
void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
if (tombstones_->empty()) {
Invalidate();
return;
}
pos_ = tombstones_->begin();
SetMaxVisibleSeqAndTimestamp();
ScanForwardToVisibleTombstone();
}
void FragmentedRangeTombstoneIterator::SeekToLast() {
pos_ = std::prev(tombstones_->end());
seq_pos_ = std::prev(tombstones_->seq_end());
}
void FragmentedRangeTombstoneIterator::SeekToTopLast() {
if (tombstones_->empty()) {
Invalidate();
return;
}
pos_ = std::prev(tombstones_->end());
SetMaxVisibleSeqAndTimestamp();
ScanBackwardToVisibleTombstone();
}
// @param `target` is a user key, with timestamp if user-defined timestamp is
// enabled.
void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
if (tombstones_->empty()) {
Invalidate();
return;
}
SeekToCoveringTombstone(target);
ScanForwardToVisibleTombstone();
}
void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
if (tombstones_->empty()) {
Invalidate();
return;
}
SeekForPrevToCoveringTombstone(target);
ScanBackwardToVisibleTombstone();
}
void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
const Slice& target) {
pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
tombstone_end_cmp_);
if (pos_ == tombstones_->end()) {
// All tombstones end before target.
seq_pos_ = tombstones_->seq_end();
return;
}
SetMaxVisibleSeqAndTimestamp();
}
void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
const Slice& target) {
if (tombstones_->empty()) {
Invalidate();
return;
}
pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
tombstone_start_cmp_);
if (pos_ == tombstones_->begin()) {
// All tombstones start after target.
Invalidate();
return;
}
--pos_;
SetMaxVisibleSeqAndTimestamp();
}
void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
while (pos_ != tombstones_->end() &&
(seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
*seq_pos_ < lower_bound_)) {
++pos_;
if (pos_ == tombstones_->end()) {
Invalidate();
return;
}
SetMaxVisibleSeqAndTimestamp();
}
}
void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
while (pos_ != tombstones_->end() &&
(seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
*seq_pos_ < lower_bound_)) {
if (pos_ == tombstones_->begin()) {
Invalidate();
return;
}
--pos_;
SetMaxVisibleSeqAndTimestamp();
}
}
void FragmentedRangeTombstoneIterator::Next() {
++seq_pos_;
if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
++pos_;
}
}
void FragmentedRangeTombstoneIterator::TopNext() {
++pos_;
if (pos_ == tombstones_->end()) {
return;
}
SetMaxVisibleSeqAndTimestamp();
ScanForwardToVisibleTombstone();
}
void FragmentedRangeTombstoneIterator::Prev() {
if (seq_pos_ == tombstones_->seq_begin()) {
Invalidate();
return;
}
--seq_pos_;
if (pos_ == tombstones_->end() ||
seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
--pos_;
}
}
void FragmentedRangeTombstoneIterator::TopPrev() {
if (pos_ == tombstones_->begin()) {
Invalidate();
return;
}
--pos_;
SetMaxVisibleSeqAndTimestamp();
ScanBackwardToVisibleTombstone();
}
bool FragmentedRangeTombstoneIterator::Valid() const {
return tombstones_ != nullptr && pos_ != tombstones_->end();
}
SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
const Slice& target_user_key) {
SeekToCoveringTombstone(target_user_key);
return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(),
target_user_key) <= 0
? seq()
: 0;
}
std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
FragmentedRangeTombstoneIterator::SplitBySnapshot(
const std::vector<SequenceNumber>& snapshots) {
std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
splits;
SequenceNumber lower = 0;
SequenceNumber upper;
for (size_t i = 0; i <= snapshots.size(); i++) {
if (i >= snapshots.size()) {
upper = kMaxSequenceNumber;
} else {
upper = snapshots[i];
}
if (tombstones_->ContainsRange(lower, upper)) {
splits.emplace(upper,
std::make_unique<FragmentedRangeTombstoneIterator>(
tombstones_, *icmp_, upper, ts_upper_bound_, lower));
}
lower = upper + 1;
}
return splits;
}
} // namespace ROCKSDB_NAMESPACE