rocksdb/db/seqno_to_time_mapping.cc
Yu Zhang 1104eaa35e Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.

The life cycle of such an entry on the write/flush/compaction paths is:

1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`

2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
 when we have easy access to the seqno to time mapping.

3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.

On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry; the unix_write_time/preferred seqno part packed in the value is completely ignored.

Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.

2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.

3) Stress test coverage for the feature

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419

Test Plan: Added unit tests

Reviewed By: pdillinger

Differential Revision: D54920296

Pulled By: jowlyzhang

fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 15:44:55 -07:00

535 lines
16 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/seqno_to_time_mapping.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <functional>
#include <queue>
#include <vector>
#include "db/version_edit.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
// Binary-searches pairs_ with SeqnoTimePair::TimeLess and returns the
// upper bound for `time`: the position just past every entry that does not
// order after a probe carrying that time (pairs_.cend() if no entry does).
// The mapping must already be in the enforced (sorted) state.
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterTime(
    uint64_t time) const {
  assert(enforced_);
  // Only the time field of the probe matters to TimeLess; seqno is a dummy.
  const SeqnoTimePair probe{0, time};
  const auto range_begin = pairs_.cbegin();
  const auto range_end = pairs_.cend();
  return std::upper_bound(range_begin, range_end, probe,
                          SeqnoTimePair::TimeLess);
}
// Binary-searches pairs_ with SeqnoTimePair::SeqnoLess and returns the
// lower bound for `seqno`: the first position whose entry does not order
// before a probe carrying that seqno (pairs_.cend() if every entry does).
// The mapping must already be in the enforced (sorted) state.
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterEqSeqno(
    SequenceNumber seqno) const {
  assert(enforced_);
  // Only the seqno field of the probe matters to SeqnoLess; time is a dummy.
  const SeqnoTimePair probe{seqno, 0};
  const auto range_begin = pairs_.cbegin();
  const auto range_end = pairs_.cend();
  return std::lower_bound(range_begin, range_end, probe,
                          SeqnoTimePair::SeqnoLess);
}
// Binary-searches pairs_ with SeqnoTimePair::SeqnoLess and returns the
// upper bound for `seqno`: the first position whose entry orders strictly
// after a probe carrying that seqno (pairs_.cend() if none does).
// The mapping must already be in the enforced (sorted) state.
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterSeqno(
    SequenceNumber seqno) const {
  assert(enforced_);
  // Only the seqno field of the probe matters to SeqnoLess; time is a dummy.
  const SeqnoTimePair probe{seqno, 0};
  const auto range_begin = pairs_.cbegin();
  const auto range_end = pairs_.cend();
  return std::upper_bound(range_begin, range_end, probe,
                          SeqnoTimePair::SeqnoLess);
}
// Returns the time recorded in the last entry that precedes the position
// FindGreaterEqSeqno(seqno) reports, i.e. the closest entry with a seqno
// strictly below `seqno`. When no such entry exists, returns
// kUnknownTimeBeforeAll. The mapping must be in the enforced state.
uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno(
    SequenceNumber seqno) const {
  assert(enforced_);
  // First entry at or after the query (or end); the answer sits right
  // before it.
  const auto first_not_before = FindGreaterEqSeqno(seqno);
  if (first_not_before != pairs_.cbegin()) {
    return (first_not_before - 1)->time;
  }
  // Nothing earlier than the query is known.
  return kUnknownTimeBeforeAll;
}
// Returns the seqno recorded in the last entry that precedes the position
// FindGreaterTime(time) reports, i.e. the closest entry with a time at or
// below `time`. When no such entry exists, returns kUnknownSeqnoBeforeAll.
// The mapping must be in the enforced state.
SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(
    uint64_t time) const {
  assert(enforced_);
  // First entry strictly after the query time (or end); the answer sits
  // right before it.
  const auto first_after = FindGreaterTime(time);
  if (first_after != pairs_.cbegin()) {
    return (first_after - 1)->seqno;
  }
  // Nothing at or before the query time is known.
  return kUnknownSeqnoBeforeAll;
}
// Drops the oldest entries that fall outside the max_time_span_ window.
//
// The window ends at `now` when it is nonzero, otherwise at the time of the
// newest entry. Exactly one entry at or before the window start is retained
// so that queries near the boundary can still be answered. Mappings with at
// most one entry are left alone. Requires the mapping to be sorted.
void SeqnoToTimeMapping::EnforceMaxTimeSpan(uint64_t now) {
  assert(enforced_);  // at least sorted
  if (pairs_.size() <= 1) {
    return;
  }

  // End of the window: the caller's clock if given, else the latest entry.
  const uint64_t window_end = (now > 0) ? now : pairs_.back().time;
  if (window_end < max_time_span_) {
    // Subtracting would underflow, and nothing can be old enough to prune.
    return;
  }
  const uint64_t cutoff_time = window_end - max_time_span_;

  // Pop the front while the entry after it is also at or before the cutoff;
  // this leaves a single entry <= cutoff_time in place.
  while (pairs_.size() >= 2) {
    const bool front_expired = pairs_[0].time <= cutoff_time;
    const bool next_expired = pairs_[1].time <= cutoff_time;
    if (!(front_expired && next_expired)) {
      break;
    }
    pairs_.pop_front();
  }
}
// Prunes pairs_ so that it fits within the configured capacity_.
//
//  * capacity_ == 0 clears the mapping.
//  * capacity_ == 1 is handled as if it were 2.
//  * When `strict` is false, nothing is pruned until the size exceeds the cap
//    by more than cap/8; once pruning does run it always shrinks the mapping
//    all the way down to the (strict) cap.
//
// Entries are chosen for removal greedily: repeatedly drop the interior entry
// whose removal would leave the smallest time gap between its surviving
// neighbors. The first and last entries are never removal candidates.
//
// Removed entries are tombstoned in place by setting their time to
// kUnknownTimeBeforeAll and compacted away at the end, so the algorithm
// relies on no live entry carrying that time (checked by asserts only).
// Requires the mapping to be sorted (enforced_).
void SeqnoToTimeMapping::EnforceCapacity(bool strict) {
assert(enforced_); // at least sorted
uint64_t strict_cap = capacity_;
if (strict_cap == 0) {
pairs_.clear();
return;
}
// Treat cap of 1 as 2 to work with the below algorithm (etc.)
if (strict_cap == 1) {
strict_cap = 2;
}
// When !strict, allow being over nominal capacity by a modest fraction.
uint64_t effective_cap = strict_cap + (strict ? 0 : strict_cap / 8);
if (effective_cap < strict_cap) {
// Correct overflow
effective_cap = UINT64_MAX;
}
if (pairs_.size() <= effective_cap) {
return;
}
// The below algorithm expects at least one removal candidate between first
// and last.
assert(pairs_.size() >= 3);
// Note: the target is strict_cap, not effective_cap.
size_t to_remove_count = pairs_.size() - strict_cap;
// An interior entry together with the time gap that would remain between
// its neighbors if it were removed (as of when the candidate was queued).
struct RemovalCandidate {
uint64_t new_time_gap;
std::deque<SeqnoTimePair>::iterator it;
RemovalCandidate(uint64_t _new_time_gap,
std::deque<SeqnoTimePair>::iterator _it)
: new_time_gap(_new_time_gap), it(_it) {}
// Ordering used with std::greater so the queue's top is the candidate
// with the smallest gap.
bool operator>(const RemovalCandidate& other) const {
if (new_time_gap == other.new_time_gap) {
// If same gap, treat the newer entry as less attractive
// for removal (like larger gap)
return it->seqno > other.it->seqno;
}
return new_time_gap > other.new_time_gap;
}
};
// A priority queue of best removal candidates (smallest time gap remaining
// after removal)
using RC = RemovalCandidate;
using PQ = std::priority_queue<RC, std::vector<RC>, std::greater<RC>>;
PQ pq;
// Add all the candidates (not including first and last)
{
auto it = pairs_.begin();
assert(it->time != kUnknownTimeBeforeAll);
// Time of the entry two positions behind `it` (left neighbor of prev_it).
uint64_t prev_prev_time = it->time;
++it;
assert(it->time != kUnknownTimeBeforeAll);
// The entry currently being evaluated as a candidate.
auto prev_it = it;
++it;
while (it != pairs_.end()) {
assert(it->time != kUnknownTimeBeforeAll);
// Gap between prev_it's neighbors, i.e. what remains if prev_it is removed.
uint64_t gap = it->time - prev_prev_time;
pq.emplace(gap, prev_it);
prev_prev_time = prev_it->time;
prev_it = it;
++it;
}
}
// Greedily remove the best candidate, iteratively
while (to_remove_count > 0) {
assert(!pq.empty());
// Remove the candidate with smallest gap
auto rc = pq.top();
pq.pop();
// NOTE: priority_queue does not support updating an existing element,
// but we can work around that because the gap tracked in pq is only
// going to be better than actuality, and we can detect and adjust
// when a better-than-actual gap is found.
// Determine actual time gap if this entry is removed (zero entries are
// marked for deletion)
// Scan forward past tombstoned entries to the nearest surviving successor.
auto it = rc.it + 1;
uint64_t after_time = it->time;
while (after_time == kUnknownTimeBeforeAll) {
assert(it != pairs_.end());
++it;
after_time = it->time;
}
// Scan backward past tombstoned entries to the nearest surviving
// predecessor.
it = rc.it - 1;
uint64_t before_time = it->time;
while (before_time == kUnknownTimeBeforeAll) {
assert(it != pairs_.begin());
--it;
before_time = it->time;
}
// Check whether the gap is still valid (or needs to be recomputed)
if (rc.new_time_gap == after_time - before_time) {
// Mark the entry as removed
rc.it->time = kUnknownTimeBeforeAll;
--to_remove_count;
} else {
// Insert a replacement up-to-date removal candidate
pq.emplace(after_time - before_time, rc.it);
}
}
// Collapse away entries marked for deletion
// (from_it reads every slot; to_it is the next slot to write a survivor to.)
auto from_it = pairs_.begin();
auto to_it = from_it;
for (; from_it != pairs_.end(); ++from_it) {
if (from_it->time != kUnknownTimeBeforeAll) {
if (from_it != to_it) {
*to_it = *from_it;
}
++to_it;
}
}
// Erase slots freed up
pairs_.erase(to_it, pairs_.end());
assert(pairs_.size() == strict_cap);
}
// Attempts to fold `other` (which must not have a smaller seqno) into this
// entry. Returns true when `other` has been absorbed and need not be stored
// separately; returns false when both entries are needed, in which case
// this entry is left unmodified.
bool SeqnoToTimeMapping::SeqnoTimePair::Merge(const SeqnoTimePair& other) {
  assert(seqno <= other.seqno);

  if (seqno == other.seqno) {
    // Same seqno: keep the older time. This favors
    // GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno, e.g. when
    // nothing has been written to the DB in some time.
    if (other.time < time) {
      time = other.time;
    }
    return true;
  }

  if (time == other.time) {
    // Same time: keep the newer seqno. This favors
    // GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno, e.g. when
    // a burst of writes ages out we want the cutoff to be the newest seqno
    // of that burst.
    if (seqno < other.seqno) {
      seqno = other.seqno;
    }
    return true;
  }

  if (time > other.time) {
    assert(seqno < other.seqno);
    // Inconsistent ordering (clock drift? very rough time?). Given the
    // direction in which entries are supposed to err, the entry with the
    // earlier time is considered more reliable, and adopting it ensures we
    // don't accidentally throw out an entry within our time span.
    *this = other;
    return true;
  }

  // Strictly increasing in both seqno and time: nothing to merge.
  return false;
}
// Sorts pairs_ and folds adjacent entries together wherever
// SeqnoTimePair::Merge() accepts them, compacting the survivors to the front
// and erasing the leftover slots. Afterwards the mapping is flagged as
// enforced ("at least sorted"). Must only be called while not enforced.
void SeqnoToTimeMapping::SortAndMerge() {
  assert(!enforced_);
  if (!pairs_.empty()) {
    std::sort(pairs_.begin(), pairs_.end());

    // `kept_it` always points at the most recent surviving entry; every
    // later entry is either absorbed into it or becomes the next survivor.
    auto kept_it = pairs_.begin();
    for (auto scan_it = kept_it + 1; scan_it != pairs_.end(); ++scan_it) {
      const bool absorbed = kept_it->Merge(*scan_it);
      if (!absorbed) {
        ++kept_it;
        *kept_it = *scan_it;
      }
    }

    // Everything past the last survivor is a stale slot.
    pairs_.erase(kept_it + 1, pairs_.end());
  }
  // Mark as "at least sorted"
  enforced_ = true;
}
// Updates the maximum time span. If the mapping is currently enforced, the
// new limit is applied right away via EnforceMaxTimeSpan(); otherwise it
// is only recorded here. Returns *this to allow chaining.
SeqnoToTimeMapping& SeqnoToTimeMapping::SetMaxTimeSpan(uint64_t max_time_span) {
  max_time_span_ = max_time_span;
  if (!enforced_) {
    return *this;
  }
  EnforceMaxTimeSpan();
  return *this;
}
// Updates the entry capacity. If the mapping is currently enforced, the new
// limit is applied right away with strict enforcement; otherwise it is only
// recorded here. Returns *this to allow chaining.
SeqnoToTimeMapping& SeqnoToTimeMapping::SetCapacity(uint64_t capacity) {
  capacity_ = capacity;
  if (!enforced_) {
    return *this;
  }
  EnforceCapacity(/*strict=*/true);
  return *this;
}
// Brings the mapping into the fully enforced state: sorted and merged,
// limited to the max time span, and strictly within capacity.
//
// If the mapping was already enforced, the time span is only re-applied when
// a nonzero `now` is supplied. Returns *this to allow chaining.
SeqnoToTimeMapping& SeqnoToTimeMapping::Enforce(uint64_t now) {
  const bool needed_sorting = !enforced_;
  if (needed_sorting) {
    SortAndMerge();
    assert(enforced_);
  }
  if (needed_sorting || now > 0) {
    EnforceMaxTimeSpan(now);
  }
  EnforceCapacity(/*strict=*/true);
  return *this;
}
// Appends a (seqno, time) entry without sorting, merging or pruning, and
// flags the mapping as no longer enforced. Entries with seqno 0 are ignored
// and leave the mapping (including its enforced flag) untouched.
void SeqnoToTimeMapping::AddUnenforced(SequenceNumber seqno, uint64_t time) {
  if (seqno != 0) {
    enforced_ = false;
    pairs_.emplace_back(seqno, time);
  }
}
// The encoded format is:
// [num_of_entries][[seqno][time],[seqno][time],...]
// ^ ^
// var_int delta_encoded (var_int)
// Except empty string is used for empty mapping. This means the encoding
// doesn't fully form a prefix code, but that is OK for applications like
// TableProperties.
void SeqnoToTimeMapping::EncodeTo(std::string& dest) const {
assert(enforced_);
// Can use empty string for empty mapping
if (pairs_.empty()) {
return;
}
// Encode number of entries
PutVarint64(&dest, pairs_.size());
SeqnoTimePair base;
for (auto& cur : pairs_) {
assert(base < cur);
// Delta encode each entry
SeqnoTimePair val = cur.ComputeDelta(base);
base = cur;
val.Encode(dest);
}
}
namespace {
// Decodes a serialized mapping from `input`, appending each decoded entry to
// `pairs`. `input` is advanced as bytes are consumed.
//
// An empty input is a valid empty mapping. Returns Corruption if the entry
// count or any entry cannot be read, or if bytes remain after the declared
// number of entries. On failure, entries decoded so far have already been
// appended to `pairs`; rolling back is the caller's responsibility.
Status DecodeImpl(Slice& input,
                  std::deque<SeqnoToTimeMapping::SeqnoTimePair>& pairs) {
  using Pair = SeqnoToTimeMapping::SeqnoTimePair;

  if (input.empty()) {
    return Status::OK();
  }

  uint64_t count;
  if (!GetVarint64(&input, &count)) {
    return Status::Corruption("Invalid sequence number time size");
  }

  // Entries are stored as deltas from their predecessor; the first one is a
  // delta from a default-constructed pair.
  Pair prev;
  for (uint64_t remaining = count; remaining > 0; --remaining) {
    Pair entry;
    Status s = entry.Decode(input);
    if (!s.ok()) {
      return s;
    }
    entry.ApplyDelta(prev);
    pairs.push_back(entry);
    prev = entry;
  }

  if (input.empty()) {
    return Status::OK();
  }
  return Status::Corruption(
      "Extra bytes at end of sequence number time mapping");
}
}  // namespace
// Decodes `pairs_str` and appends the resulting entries to this mapping.
//
// On a decoding error the mapping is restored to its previous size and the
// error is returned. On success the mapping is flagged as not enforced if it
// already held entries or if either a time-span or capacity limit is
// configured (i.e. below UINT64_MAX); otherwise the enforced flag is left
// as it was.
Status SeqnoToTimeMapping::DecodeFrom(const std::string& pairs_str) {
  const size_t size_before = pairs_.size();
  Slice remaining(pairs_str);
  Status s = DecodeImpl(remaining, pairs_);

  if (s.ok()) {
    const bool had_entries = size_before > 0;
    const bool has_limits =
        max_time_span_ < UINT64_MAX || capacity_ < UINT64_MAX;
    if (had_entries || has_limits) {
      enforced_ = false;
    }
  } else {
    // Roll back in case of corrupted data
    pairs_.resize(size_before);
  }
  return s;
}
// Appends this pair's seqno and time to `dest` via PutVarint64Varint64.
// Decode() is the counterpart and reads them back as two consecutive
// varint64 values, seqno first.
void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
PutVarint64Varint64(&dest, seqno, time);
}
// Reads two consecutive varint64 values from `input` into seqno and time
// (in that order), advancing `input` as they are consumed. Returns
// Corruption naming whichever field could not be read; if the seqno fails,
// the time is not attempted.
Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
  const bool got_seqno = GetVarint64(&input, &seqno);
  if (!got_seqno) {
    return Status::Corruption("Invalid sequence number");
  }
  const bool got_time = GetVarint64(&input, &time);
  if (!got_time) {
    return Status::Corruption("Invalid time");
  }
  return Status::OK();
}
// Appends to this mapping the entries of `src` covering the seqno range
// [from_seqno, to_seqno], plus (when available) the one entry just before
// the range so that GetProximalTimeBeforeSeqno(from_seqno) can be answered.
//
// Inverted ranges such as [1000, 0], which might show up e.g. for an SST
// file with no entries, are tolerated and contribute at most that one
// preceding entry. `src` must be enforced. This mapping is flagged as not
// enforced afterwards if it was non-empty beforehand or has a time-span or
// capacity limit configured.
void SeqnoToTimeMapping::CopyFromSeqnoRange(const SeqnoToTimeMapping& src,
                                            SequenceNumber from_seqno,
                                            SequenceNumber to_seqno) {
  const bool was_empty = Empty();

  auto copy_begin = src.FindGreaterEqSeqno(from_seqno);
  // For a nonsensical (inverted) range the in-range portion is empty.
  auto copy_end = copy_begin;
  if (!(to_seqno < from_seqno)) {
    copy_end = src.FindGreaterSeqno(to_seqno);
  }

  // Step back one entry, if there is one, to cover from_seqno itself.
  if (copy_begin != src.pairs_.begin()) {
    --copy_begin;
  }
  assert(copy_begin <= copy_end);

  pairs_.insert(pairs_.end(), copy_begin, copy_end);

  const bool has_limits =
      max_time_span_ < UINT64_MAX || capacity_ < UINT64_MAX;
  if (!was_empty || has_limits) {
    enforced_ = false;
  }
}
// Adds a (seqno, time) entry at the end of the mapping and then re-applies
// the time-span and (non-strict) capacity limits.
//
// Returns true only if a new entry was actually stored. Returns false when
// capacity is zero, when seqno is 0 (skipped), when the entry was merged
// into the current last entry, or when an out-of-order append is rejected
// on an enforced mapping (which also trips an assert in debug builds).
// On an unenforced mapping an out-of-order entry is accepted and fixed up by
// sorting and merging before returning.
bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
  if (capacity_ == 0) {
    return false;
  }

  if (seqno != 0 && pairs_.empty()) {
    // First entry: trivially sorted, and the enforcement below is skipped.
    enforced_ = true;
    pairs_.emplace_back(seqno, time);
    return true;
  }

  bool stored = false;
  // seqno 0 is skipped, as it may have special meaning, like zeroed out data
  // TODO: consider changing?
  if (seqno != 0) {
    SeqnoTimePair& tail = pairs_.back();
    if (tail.seqno <= seqno) {
      // The new entry sorts with the last one, so try merging first.
      const bool merged = tail.Merge({seqno, time});
      if (!merged) {
        const bool out_of_order =
            enforced_ && (seqno <= tail.seqno || time <= tail.time);
        if (out_of_order) {
          // Out of order append should not happen, except in case of clock
          // reset
          assert(false);
        } else {
          pairs_.emplace_back(seqno, time);
          stored = true;
        }
      }
    } else if (enforced_) {
      // Out of order append attempted
      assert(false);
    } else {
      // Treat like AddUnenforced and fix up below
      pairs_.emplace_back(seqno, time);
      stored = true;
    }
  }

  // Similar to Enforce() but not quite
  if (!enforced_) {
    SortAndMerge();
    assert(enforced_);
  }
  EnforceMaxTimeSpan();
  EnforceCapacity(/*strict=*/false);
  return stored;
}
// Fills an empty mapping with one entry for every sequence number in
// [from_seqno, to_seqno], assigning times by linear interpolation (rounded
// down) from from_time at from_seqno to to_time at to_seqno.
//
// Preconditions, asserted in debug builds: the mapping is empty,
// from_seqno > 0, to_seqno > from_seqno, from_time > kUnknownTimeBeforeAll
// and to_time >= from_time.
//
// Returns true on success. In builds without assertions, a degenerate or
// inverted seqno range (which would divide by zero) or an inverted time
// range (which would underflow) is rejected: the function returns false and
// leaves the mapping unchanged.
bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
                                     SequenceNumber to_seqno,
                                     uint64_t from_time, uint64_t to_time) {
  assert(Empty());
  assert(from_seqno > 0);
  assert(to_seqno > from_seqno);
  assert(from_time > kUnknownTimeBeforeAll);
  assert(to_time >= from_time);

  // Guard the arithmetic below when asserts are compiled out.
  if (to_seqno <= from_seqno || to_time < from_time) {
    return false;
  }

  const uint64_t seqno_span = to_seqno - from_seqno;
  const uint64_t time_span = to_time - from_time;
  // floor(time_span * offset / seqno_span) is computed as
  //   (time_span / seqno_span) * offset +
  //   (time_span % seqno_span) * offset / seqno_span
  // which yields the same value without forming the full product
  // time_span * offset, so it cannot overflow for large time spans.
  const uint64_t whole_step = time_span / seqno_span;
  const uint64_t partial_step = time_span % seqno_span;

  // TODO: smartly limit this to max_capacity_ representative samples
  for (uint64_t offset = 0; offset <= seqno_span; ++offset) {
    const uint64_t t =
        from_time + whole_step * offset + partial_step * offset / seqno_span;
    pairs_.emplace_back(from_seqno + offset, t);
  }

  return /*success*/ true;
}
// Renders the mapping as text in the form "seqno->time," repeated for each
// entry in storage order (every entry, including the last, is followed by a
// comma). An empty mapping yields an empty string.
std::string SeqnoToTimeMapping::ToHumanString() const {
  std::string text;
  for (auto it = pairs_.begin(); it != pairs_.end(); ++it) {
    AppendNumberTo(&text, it->seqno);
    text += "->";
    AppendNumberTo(&text, it->time);
    text += ",";
  }
  return text;
}
// Overwrites `*buf` with the bytes of `value` followed by `unix_write_time`
// appended through PutFixed64, and returns a Slice referring to the whole of
// `*buf`. The returned Slice points into `*buf`, so it is only usable while
// `*buf` is alive and unmodified.
Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time,
                            std::string* buf) {
  std::string& packed = *buf;
  packed.assign(value.data(), value.size());
  PutFixed64(&packed, unix_write_time);
  return Slice(packed);
}
// Overwrites `*buf` with the bytes of `value` followed by `seqno` appended
// through PutFixed64, and returns a Slice referring to the whole of `*buf`.
// The returned Slice points into `*buf`, so it is only usable while `*buf`
// is alive and unmodified.
Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno,
                        std::string* buf) {
  std::string& packed = *buf;
  packed.assign(value.data(), value.size());
  PutFixed64(&packed, seqno);
  return Slice(packed);
}
std::tuple<Slice, uint64_t> ParsePackedValueWithWriteTime(const Slice& value) {
assert(value.size() >= sizeof(uint64_t));
Slice write_time_slice(value.data() + value.size() - sizeof(uint64_t),
sizeof(uint64_t));
uint64_t write_time;
[[maybe_unused]] auto res = GetFixed64(&write_time_slice, &write_time);
assert(res);
return std::make_tuple(Slice(value.data(), value.size() - sizeof(uint64_t)),
write_time);
}
std::tuple<Slice, SequenceNumber> ParsePackedValueWithSeqno(
const Slice& value) {
assert(value.size() >= sizeof(SequenceNumber));
Slice seqno_slice(value.data() + value.size() - sizeof(uint64_t),
sizeof(uint64_t));
SequenceNumber seqno;
[[maybe_unused]] auto res = GetFixed64(&seqno_slice, &seqno);
assert(res);
return std::make_tuple(
Slice(value.data(), value.size() - sizeof(SequenceNumber)), seqno);
}
// Returns the user-value portion of a packed value, i.e. `value` without its
// trailing sizeof(uint64_t) bytes. The trailer itself is not decoded. The
// returned Slice refers to the same memory as `value`.
//
// `value` must be at least sizeof(uint64_t) bytes long (asserted only).
Slice ParsePackedValueForValue(const Slice& value) {
  assert(value.size() >= sizeof(uint64_t));
  const size_t payload_size = value.size() - sizeof(uint64_t);
  return Slice(value.data(), payload_size);
}
} // namespace ROCKSDB_NAMESPACE