mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-01 07:15:51 +00:00
c73cf7a878
Summary: This PR adds user property collector factory `CompactForTieringCollectorFactory` to support observe SST file and mark it as need compaction for fast tracking data to the proper tier. A triggering ratio `compaction_trigger_ratio_` can be configured to achieve the following: 1) Setting the ratio to be equal to or smaller than 0 disables this collector 2) Setting the ratio to be within (0, 1] will write the number of observed eligible entries into a user property and marks a file as need-compaction when aforementioned condition is met. 3) Setting the ratio to be higher than 1 can be used to just writes the user table property, and not mark any file as need compaction. For a column family that does not enable tiering feature, even if an effective configuration is provided, this collector is still disabled. For a file that is already on the last level, this collector is also disabled. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12760 Test Plan: Added unit tests Reviewed By: pdillinger Differential Revision: D58734976 Pulled By: jowlyzhang fbshipit-source-id: 6daab2c4f62b5c6689c3c03e3b3907bbbe6b7a81
574 lines
17 KiB
C++
574 lines
17 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
//
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "db/seqno_to_time_mapping.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <deque>
|
|
#include <functional>
|
|
#include <queue>
|
|
#include <vector>
|
|
|
|
#include "db/version_edit.h"
|
|
#include "util/string_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterTime(
|
|
uint64_t time) const {
|
|
assert(enforced_);
|
|
return std::upper_bound(pairs_.cbegin(), pairs_.cend(),
|
|
SeqnoTimePair{0, time}, SeqnoTimePair::TimeLess);
|
|
}
|
|
|
|
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterEqSeqno(
|
|
SequenceNumber seqno) const {
|
|
assert(enforced_);
|
|
return std::lower_bound(pairs_.cbegin(), pairs_.cend(),
|
|
SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess);
|
|
}
|
|
|
|
SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterSeqno(
|
|
SequenceNumber seqno) const {
|
|
assert(enforced_);
|
|
return std::upper_bound(pairs_.cbegin(), pairs_.cend(),
|
|
SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess);
|
|
}
|
|
|
|
uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno(
|
|
SequenceNumber seqno) const {
|
|
assert(enforced_);
|
|
// Find the last entry with a seqno strictly less than the given seqno.
|
|
// First, find the first entry >= the given seqno (or end)
|
|
auto it = FindGreaterEqSeqno(seqno);
|
|
if (it == pairs_.cbegin()) {
|
|
return kUnknownTimeBeforeAll;
|
|
}
|
|
// Then return data from previous.
|
|
it--;
|
|
return it->time;
|
|
}
|
|
|
|
SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(
|
|
uint64_t time) const {
|
|
assert(enforced_);
|
|
|
|
// Find the last entry with a time <= the given time.
|
|
// First, find the first entry > the given time (or end).
|
|
auto it = FindGreaterTime(time);
|
|
if (it == pairs_.cbegin()) {
|
|
return kUnknownSeqnoBeforeAll;
|
|
}
|
|
// Then return data from previous.
|
|
--it;
|
|
return it->seqno;
|
|
}
|
|
|
|
void SeqnoToTimeMapping::GetCurrentTieringCutoffSeqnos(
|
|
uint64_t current_time, uint64_t preserve_internal_time_seconds,
|
|
uint64_t preclude_last_level_data_seconds,
|
|
SequenceNumber* preserve_time_min_seqno,
|
|
SequenceNumber* preclude_last_level_min_seqno) const {
|
|
uint64_t preserve_time_duration = std::max(preserve_internal_time_seconds,
|
|
preclude_last_level_data_seconds);
|
|
if (preserve_time_duration <= 0) {
|
|
return;
|
|
}
|
|
uint64_t preserve_time = current_time > preserve_time_duration
|
|
? current_time - preserve_time_duration
|
|
: 0;
|
|
// GetProximalSeqnoBeforeTime tells us the last seqno known to have been
|
|
// written at or before the given time. + 1 to get the minimum we should
|
|
// preserve without excluding anything that might have been written on or
|
|
// after the given time.
|
|
if (preserve_time_min_seqno) {
|
|
*preserve_time_min_seqno = GetProximalSeqnoBeforeTime(preserve_time) + 1;
|
|
}
|
|
if (preclude_last_level_data_seconds > 0 && preclude_last_level_min_seqno) {
|
|
uint64_t preclude_last_level_time =
|
|
current_time > preclude_last_level_data_seconds
|
|
? current_time - preclude_last_level_data_seconds
|
|
: 0;
|
|
*preclude_last_level_min_seqno =
|
|
GetProximalSeqnoBeforeTime(preclude_last_level_time) + 1;
|
|
}
|
|
}
|
|
|
|
void SeqnoToTimeMapping::EnforceMaxTimeSpan(uint64_t now) {
|
|
assert(enforced_); // at least sorted
|
|
uint64_t cutoff_time;
|
|
if (pairs_.size() <= 1) {
|
|
return;
|
|
}
|
|
if (now > 0) {
|
|
if (now < max_time_span_) {
|
|
// Nothing eligible to prune / avoid underflow
|
|
return;
|
|
}
|
|
cutoff_time = now - max_time_span_;
|
|
} else {
|
|
const auto& last = pairs_.back();
|
|
if (last.time < max_time_span_) {
|
|
// Nothing eligible to prune / avoid underflow
|
|
return;
|
|
}
|
|
cutoff_time = last.time - max_time_span_;
|
|
}
|
|
// Keep one entry <= cutoff_time
|
|
while (pairs_.size() >= 2 && pairs_[0].time <= cutoff_time &&
|
|
pairs_[1].time <= cutoff_time) {
|
|
pairs_.pop_front();
|
|
}
|
|
}
|
|
|
|
void SeqnoToTimeMapping::EnforceCapacity(bool strict) {
|
|
assert(enforced_); // at least sorted
|
|
uint64_t strict_cap = capacity_;
|
|
if (strict_cap == 0) {
|
|
pairs_.clear();
|
|
return;
|
|
}
|
|
// Treat cap of 1 as 2 to work with the below algorithm (etc.)
|
|
if (strict_cap == 1) {
|
|
strict_cap = 2;
|
|
}
|
|
// When !strict, allow being over nominal capacity by a modest fraction.
|
|
uint64_t effective_cap = strict_cap + (strict ? 0 : strict_cap / 8);
|
|
if (effective_cap < strict_cap) {
|
|
// Correct overflow
|
|
effective_cap = UINT64_MAX;
|
|
}
|
|
if (pairs_.size() <= effective_cap) {
|
|
return;
|
|
}
|
|
// The below algorithm expects at least one removal candidate between first
|
|
// and last.
|
|
assert(pairs_.size() >= 3);
|
|
size_t to_remove_count = pairs_.size() - strict_cap;
|
|
|
|
struct RemovalCandidate {
|
|
uint64_t new_time_gap;
|
|
std::deque<SeqnoTimePair>::iterator it;
|
|
RemovalCandidate(uint64_t _new_time_gap,
|
|
std::deque<SeqnoTimePair>::iterator _it)
|
|
: new_time_gap(_new_time_gap), it(_it) {}
|
|
bool operator>(const RemovalCandidate& other) const {
|
|
if (new_time_gap == other.new_time_gap) {
|
|
// If same gap, treat the newer entry as less attractive
|
|
// for removal (like larger gap)
|
|
return it->seqno > other.it->seqno;
|
|
}
|
|
return new_time_gap > other.new_time_gap;
|
|
}
|
|
};
|
|
|
|
// A priority queue of best removal candidates (smallest time gap remaining
|
|
// after removal)
|
|
using RC = RemovalCandidate;
|
|
using PQ = std::priority_queue<RC, std::vector<RC>, std::greater<RC>>;
|
|
PQ pq;
|
|
|
|
// Add all the candidates (not including first and last)
|
|
{
|
|
auto it = pairs_.begin();
|
|
assert(it->time != kUnknownTimeBeforeAll);
|
|
uint64_t prev_prev_time = it->time;
|
|
++it;
|
|
assert(it->time != kUnknownTimeBeforeAll);
|
|
auto prev_it = it;
|
|
++it;
|
|
while (it != pairs_.end()) {
|
|
assert(it->time != kUnknownTimeBeforeAll);
|
|
uint64_t gap = it->time - prev_prev_time;
|
|
pq.emplace(gap, prev_it);
|
|
prev_prev_time = prev_it->time;
|
|
prev_it = it;
|
|
++it;
|
|
}
|
|
}
|
|
|
|
// Greedily remove the best candidate, iteratively
|
|
while (to_remove_count > 0) {
|
|
assert(!pq.empty());
|
|
// Remove the candidate with smallest gap
|
|
auto rc = pq.top();
|
|
pq.pop();
|
|
|
|
// NOTE: priority_queue does not support updating an existing element,
|
|
// but we can work around that because the gap tracked in pq is only
|
|
// going to be better than actuality, and we can detect and adjust
|
|
// when a better-than-actual gap is found.
|
|
|
|
// Determine actual time gap if this entry is removed (zero entries are
|
|
// marked for deletion)
|
|
auto it = rc.it + 1;
|
|
uint64_t after_time = it->time;
|
|
while (after_time == kUnknownTimeBeforeAll) {
|
|
assert(it != pairs_.end());
|
|
++it;
|
|
after_time = it->time;
|
|
}
|
|
it = rc.it - 1;
|
|
uint64_t before_time = it->time;
|
|
while (before_time == kUnknownTimeBeforeAll) {
|
|
assert(it != pairs_.begin());
|
|
--it;
|
|
before_time = it->time;
|
|
}
|
|
// Check whether the gap is still valid (or needs to be recomputed)
|
|
if (rc.new_time_gap == after_time - before_time) {
|
|
// Mark the entry as removed
|
|
rc.it->time = kUnknownTimeBeforeAll;
|
|
--to_remove_count;
|
|
} else {
|
|
// Insert a replacement up-to-date removal candidate
|
|
pq.emplace(after_time - before_time, rc.it);
|
|
}
|
|
}
|
|
|
|
// Collapse away entries marked for deletion
|
|
auto from_it = pairs_.begin();
|
|
auto to_it = from_it;
|
|
|
|
for (; from_it != pairs_.end(); ++from_it) {
|
|
if (from_it->time != kUnknownTimeBeforeAll) {
|
|
if (from_it != to_it) {
|
|
*to_it = *from_it;
|
|
}
|
|
++to_it;
|
|
}
|
|
}
|
|
|
|
// Erase slots freed up
|
|
pairs_.erase(to_it, pairs_.end());
|
|
assert(pairs_.size() == strict_cap);
|
|
}
|
|
|
|
bool SeqnoToTimeMapping::SeqnoTimePair::Merge(const SeqnoTimePair& other) {
|
|
assert(seqno <= other.seqno);
|
|
if (seqno == other.seqno) {
|
|
// Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno
|
|
// by keeping the older time. For example, consider nothing has been
|
|
// written to the DB in some time.
|
|
time = std::min(time, other.time);
|
|
return true;
|
|
} else if (time == other.time) {
|
|
// Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno
|
|
// by keeping the newer seqno. For example, when a burst of writes ages
|
|
// out, we want the cutoff to be the newest seqno from that burst.
|
|
seqno = std::max(seqno, other.seqno);
|
|
return true;
|
|
} else if (time > other.time) {
|
|
assert(seqno < other.seqno);
|
|
// Need to resolve an inconsistency (clock drift? very rough time?).
|
|
// Given the direction that entries are supposed to err, trust the earlier
|
|
// time entry as more reliable, and this choice ensures we don't
|
|
// accidentally throw out an entry within our time span.
|
|
*this = other;
|
|
return true;
|
|
} else {
|
|
// Not merged
|
|
return false;
|
|
}
|
|
}
|
|
|
|
void SeqnoToTimeMapping::SortAndMerge() {
|
|
assert(!enforced_);
|
|
if (!pairs_.empty()) {
|
|
std::sort(pairs_.begin(), pairs_.end());
|
|
|
|
auto from_it = pairs_.begin();
|
|
auto to_it = from_it;
|
|
for (++from_it; from_it != pairs_.end(); ++from_it) {
|
|
if (to_it->Merge(*from_it)) {
|
|
// Merged with last entry
|
|
} else {
|
|
// Copy into next entry
|
|
*++to_it = *from_it;
|
|
}
|
|
}
|
|
// Erase slots freed up from merging
|
|
pairs_.erase(to_it + 1, pairs_.end());
|
|
}
|
|
// Mark as "at least sorted"
|
|
enforced_ = true;
|
|
}
|
|
|
|
SeqnoToTimeMapping& SeqnoToTimeMapping::SetMaxTimeSpan(uint64_t max_time_span) {
|
|
max_time_span_ = max_time_span;
|
|
if (enforced_) {
|
|
EnforceMaxTimeSpan();
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
SeqnoToTimeMapping& SeqnoToTimeMapping::SetCapacity(uint64_t capacity) {
|
|
capacity_ = capacity;
|
|
if (enforced_) {
|
|
EnforceCapacity(/*strict=*/true);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
SeqnoToTimeMapping& SeqnoToTimeMapping::Enforce(uint64_t now) {
|
|
if (!enforced_) {
|
|
SortAndMerge();
|
|
assert(enforced_);
|
|
EnforceMaxTimeSpan(now);
|
|
} else if (now > 0) {
|
|
EnforceMaxTimeSpan(now);
|
|
}
|
|
EnforceCapacity(/*strict=*/true);
|
|
return *this;
|
|
}
|
|
|
|
void SeqnoToTimeMapping::AddUnenforced(SequenceNumber seqno, uint64_t time) {
|
|
if (seqno == 0) {
|
|
return;
|
|
}
|
|
enforced_ = false;
|
|
pairs_.emplace_back(seqno, time);
|
|
}
|
|
|
|
// The encoded format is:
|
|
// [num_of_entries][[seqno][time],[seqno][time],...]
|
|
// ^ ^
|
|
// var_int delta_encoded (var_int)
|
|
// Except empty string is used for empty mapping. This means the encoding
|
|
// doesn't fully form a prefix code, but that is OK for applications like
|
|
// TableProperties.
|
|
void SeqnoToTimeMapping::EncodeTo(std::string& dest) const {
|
|
assert(enforced_);
|
|
// Can use empty string for empty mapping
|
|
if (pairs_.empty()) {
|
|
return;
|
|
}
|
|
// Encode number of entries
|
|
PutVarint64(&dest, pairs_.size());
|
|
SeqnoTimePair base;
|
|
for (auto& cur : pairs_) {
|
|
assert(base < cur);
|
|
// Delta encode each entry
|
|
SeqnoTimePair val = cur.ComputeDelta(base);
|
|
base = cur;
|
|
val.Encode(dest);
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
Status DecodeImpl(Slice& input,
|
|
std::deque<SeqnoToTimeMapping::SeqnoTimePair>& pairs) {
|
|
if (input.empty()) {
|
|
return Status::OK();
|
|
}
|
|
uint64_t count;
|
|
if (!GetVarint64(&input, &count)) {
|
|
return Status::Corruption("Invalid sequence number time size");
|
|
}
|
|
|
|
SeqnoToTimeMapping::SeqnoTimePair base;
|
|
for (uint64_t i = 0; i < count; i++) {
|
|
SeqnoToTimeMapping::SeqnoTimePair val;
|
|
Status s = val.Decode(input);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
val.ApplyDelta(base);
|
|
pairs.emplace_back(val);
|
|
base = val;
|
|
}
|
|
|
|
if (!input.empty()) {
|
|
return Status::Corruption(
|
|
"Extra bytes at end of sequence number time mapping");
|
|
}
|
|
return Status::OK();
|
|
}
|
|
} // namespace
|
|
|
|
Status SeqnoToTimeMapping::DecodeFrom(const std::string& pairs_str) {
|
|
size_t orig_size = pairs_.size();
|
|
|
|
Slice input(pairs_str);
|
|
Status s = DecodeImpl(input, pairs_);
|
|
if (!s.ok()) {
|
|
// Roll back in case of corrupted data
|
|
pairs_.resize(orig_size);
|
|
} else if (orig_size > 0 || max_time_span_ < UINT64_MAX ||
|
|
capacity_ < UINT64_MAX) {
|
|
enforced_ = false;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
|
|
PutVarint64Varint64(&dest, seqno, time);
|
|
}
|
|
|
|
Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
|
|
if (!GetVarint64(&input, &seqno)) {
|
|
return Status::Corruption("Invalid sequence number");
|
|
}
|
|
if (!GetVarint64(&input, &time)) {
|
|
return Status::Corruption("Invalid time");
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
void SeqnoToTimeMapping::CopyFromSeqnoRange(const SeqnoToTimeMapping& src,
|
|
SequenceNumber from_seqno,
|
|
SequenceNumber to_seqno) {
|
|
bool orig_empty = Empty();
|
|
auto src_it = src.FindGreaterEqSeqno(from_seqno);
|
|
// Allow nonsensical ranges like [1000, 0] which might show up e.g. for
|
|
// an SST file with no entries.
|
|
auto src_it_end =
|
|
to_seqno < from_seqno ? src_it : src.FindGreaterSeqno(to_seqno);
|
|
// To best answer GetProximalTimeBeforeSeqno(from_seqno) we need an entry
|
|
// with a seqno before that (if available)
|
|
if (src_it != src.pairs_.begin()) {
|
|
--src_it;
|
|
}
|
|
assert(src_it <= src_it_end);
|
|
std::copy(src_it, src_it_end, std::back_inserter(pairs_));
|
|
|
|
if (!orig_empty || max_time_span_ < UINT64_MAX || capacity_ < UINT64_MAX) {
|
|
enforced_ = false;
|
|
}
|
|
}
|
|
|
|
bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
|
|
if (capacity_ == 0) {
|
|
return false;
|
|
}
|
|
bool added = false;
|
|
if (seqno == 0) {
|
|
// skip seq number 0, which may have special meaning, like zeroed out data
|
|
// TODO: consider changing?
|
|
} else if (pairs_.empty()) {
|
|
enforced_ = true;
|
|
pairs_.emplace_back(seqno, time);
|
|
// skip normal enforced check below
|
|
return true;
|
|
} else {
|
|
auto& last = pairs_.back();
|
|
// We can attempt to merge with the last entry if the new entry sorts with
|
|
// it.
|
|
if (last.seqno <= seqno) {
|
|
bool merged = last.Merge({seqno, time});
|
|
if (!merged) {
|
|
if (enforced_ && (seqno <= last.seqno || time <= last.time)) {
|
|
// Out of order append should not happen, except in case of clock
|
|
// reset
|
|
assert(false);
|
|
} else {
|
|
pairs_.emplace_back(seqno, time);
|
|
added = true;
|
|
}
|
|
}
|
|
} else if (!enforced_) {
|
|
// Treat like AddUnenforced and fix up below
|
|
pairs_.emplace_back(seqno, time);
|
|
added = true;
|
|
} else {
|
|
// Out of order append attempted
|
|
assert(false);
|
|
}
|
|
}
|
|
// Similar to Enforce() but not quite
|
|
if (!enforced_) {
|
|
SortAndMerge();
|
|
assert(enforced_);
|
|
}
|
|
EnforceMaxTimeSpan();
|
|
EnforceCapacity(/*strict=*/false);
|
|
return added;
|
|
}
|
|
|
|
bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
|
|
SequenceNumber to_seqno,
|
|
uint64_t from_time, uint64_t to_time) {
|
|
assert(Empty());
|
|
assert(from_seqno > 0);
|
|
assert(to_seqno > from_seqno);
|
|
assert(from_time > kUnknownTimeBeforeAll);
|
|
assert(to_time >= from_time);
|
|
|
|
// TODO: smartly limit this to max_capacity_ representative samples
|
|
for (auto i = from_seqno; i <= to_seqno; i++) {
|
|
uint64_t t = from_time + (to_time - from_time) * (i - from_seqno) /
|
|
(to_seqno - from_seqno);
|
|
pairs_.emplace_back(i, t);
|
|
}
|
|
|
|
return /*success*/ true;
|
|
}
|
|
|
|
std::string SeqnoToTimeMapping::ToHumanString() const {
|
|
std::string ret;
|
|
for (const auto& seq_time : pairs_) {
|
|
AppendNumberTo(&ret, seq_time.seqno);
|
|
ret.append("->");
|
|
AppendNumberTo(&ret, seq_time.time);
|
|
ret.append(",");
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time,
|
|
std::string* buf) {
|
|
buf->assign(value.data(), value.size());
|
|
PutFixed64(buf, unix_write_time);
|
|
return Slice(*buf);
|
|
}
|
|
|
|
Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno,
|
|
std::string* buf) {
|
|
buf->assign(value.data(), value.size());
|
|
PutFixed64(buf, seqno);
|
|
return Slice(*buf);
|
|
}
|
|
|
|
uint64_t ParsePackedValueForWriteTime(const Slice& value) {
|
|
assert(value.size() >= sizeof(uint64_t));
|
|
Slice write_time_slice(value.data() + value.size() - sizeof(uint64_t),
|
|
sizeof(uint64_t));
|
|
uint64_t write_time;
|
|
[[maybe_unused]] auto res = GetFixed64(&write_time_slice, &write_time);
|
|
assert(res);
|
|
return write_time;
|
|
}
|
|
|
|
std::tuple<Slice, uint64_t> ParsePackedValueWithWriteTime(const Slice& value) {
|
|
return std::make_tuple(Slice(value.data(), value.size() - sizeof(uint64_t)),
|
|
ParsePackedValueForWriteTime(value));
|
|
}
|
|
|
|
SequenceNumber ParsePackedValueForSeqno(const Slice& value) {
|
|
assert(value.size() >= sizeof(SequenceNumber));
|
|
Slice seqno_slice(value.data() + value.size() - sizeof(uint64_t),
|
|
sizeof(uint64_t));
|
|
SequenceNumber seqno;
|
|
[[maybe_unused]] auto res = GetFixed64(&seqno_slice, &seqno);
|
|
assert(res);
|
|
return seqno;
|
|
}
|
|
|
|
std::tuple<Slice, SequenceNumber> ParsePackedValueWithSeqno(
|
|
const Slice& value) {
|
|
return std::make_tuple(
|
|
Slice(value.data(), value.size() - sizeof(SequenceNumber)),
|
|
ParsePackedValueForSeqno(value));
|
|
}
|
|
|
|
Slice ParsePackedValueForValue(const Slice& value) {
|
|
assert(value.size() >= sizeof(uint64_t));
|
|
return Slice(value.data(), value.size() - sizeof(uint64_t));
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|