rocksdb/db/compaction/compaction_outputs.h
Jay Zhuang a3acf2ef87 Add seqno to time mapping (#10338)
Summary:
Which will be used for tiered storage to preclude hot data from
compacting to the cold tier (the last level).
Internally, adding seqno to time mapping. A periodic_task is scheduled
to record the current_seqno -> current_time in certain cadence. When
memtable flush, the mapping informaiton is stored in sstable property.
During compaction, the mapping information are merged and get the
approximate time of sequence number, which is used to determine if a key
is recently inserted or not and preclude it from the last level if it's
recently inserted (within the `preclude_last_level_data_seconds`).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10338

Test Plan: CI

Reviewed By: siying

Differential Revision: D37810187

Pulled By: jay-zhuang

fbshipit-source-id: 6953be7a18a99de8b1cb3b162d712f79c2b4899f
2022-07-14 21:49:34 -07:00

330 lines
11 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/blob/blob_garbage_meter.h"
#include "db/compaction/compaction.h"
#include "db/compaction/compaction_iterator.h"
#include "db/internal_stats.h"
#include "db/output_validator.h"
namespace ROCKSDB_NAMESPACE {
class CompactionOutputs;
using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
using CompactionFileCloseFunc =
std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
// Files produced by subcompaction, most of the functions are used by
// compaction_job Open/Close compaction file functions.
class CompactionOutputs {
public:
// compaction output file
struct Output {
Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
bool _enable_order_check, bool _enable_hash, bool _finished,
uint64_t precalculated_hash)
: meta(std::move(_meta)),
validator(_icmp, _enable_order_check, _enable_hash,
precalculated_hash),
finished(_finished) {}
FileMetaData meta;
OutputValidator validator;
bool finished;
std::shared_ptr<const TableProperties> table_properties;
};
CompactionOutputs() = delete;
explicit CompactionOutputs(const Compaction* compaction,
const bool is_penultimate_level)
: compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
partitioner_ = compaction->output_level() == 0
? nullptr
: compaction->CreateSstPartitioner();
}
// Add generated output to the list
void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
bool enable_order_check, bool enable_hash,
bool finished = false, uint64_t precalculated_hash = 0) {
outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
enable_hash, finished, precalculated_hash);
}
// Set new table builder for the current output
void NewBuilder(const TableBuilderOptions& tboptions);
// Assign a new WritableFileWriter to the current output
void AssignFileWriter(WritableFileWriter* writer) {
file_writer_.reset(writer);
}
// TODO: Remove it when remote compaction support tiered compaction
void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
// TODO: Move the BlobDB builder into CompactionOutputs
const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
if (is_penultimate_level_) {
assert(blob_file_additions_.empty());
}
return blob_file_additions_;
}
std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
assert(!is_penultimate_level_);
return &blob_file_additions_;
}
bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
BlobGarbageMeter* CreateBlobGarbageMeter() {
assert(!is_penultimate_level_);
blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
return blob_garbage_meter_.get();
}
BlobGarbageMeter* GetBlobGarbageMeter() const {
if (is_penultimate_level_) {
// blobdb doesn't support per_key_placement yet
assert(blob_garbage_meter_ == nullptr);
return nullptr;
}
return blob_garbage_meter_.get();
}
void UpdateBlobStats() {
assert(!is_penultimate_level_);
stats_.num_output_files_blob = blob_file_additions_.size();
for (const auto& blob : blob_file_additions_) {
stats_.bytes_written_blob += blob.GetTotalBlobBytes();
}
}
// Finish the current output file
Status Finish(const Status& intput_status,
const SeqnoToTimeMapping& seqno_time_mapping);
// Update output table properties from table builder
void UpdateTableProperties() {
current_output().table_properties =
std::make_shared<TableProperties>(GetTableProperties());
}
IOStatus WriterSyncClose(const Status& intput_status, SystemClock* clock,
Statistics* statistics, bool use_fsync);
TableProperties GetTableProperties() {
return builder_->GetTableProperties();
}
Slice SmallestUserKey() const {
if (!outputs_.empty() && outputs_[0].finished) {
return outputs_[0].meta.smallest.user_key();
} else {
return Slice{nullptr, 0};
}
}
Slice LargestUserKey() const {
if (!outputs_.empty() && outputs_.back().finished) {
return outputs_.back().meta.largest.user_key();
} else {
return Slice{nullptr, 0};
}
}
// In case the last output file is empty, which doesn't need to keep.
void RemoveLastEmptyOutput() {
if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
// An error occurred, so ignore the last output.
outputs_.pop_back();
}
}
// Remove the last output, for example the last output doesn't have data (no
// entry and no range-dels), but file_size might not be 0, as it has SST
// metadata.
void RemoveLastOutput() {
assert(!outputs_.empty());
outputs_.pop_back();
}
bool HasBuilder() const { return builder_ != nullptr; }
FileMetaData* GetMetaData() { return &current_output().meta; }
bool HasOutput() const { return !outputs_.empty(); }
uint64_t NumEntries() const { return builder_->NumEntries(); }
void ResetBuilder() {
builder_.reset();
current_output_file_size_ = 0;
}
// Add range-dels from the aggregator to the current output file
Status AddRangeDels(const Slice* comp_start, const Slice* comp_end,
CompactionIterationStats& range_del_out_stats,
bool bottommost_level, const InternalKeyComparator& icmp,
SequenceNumber earliest_snapshot,
const Slice& next_table_min_key);
// Is the current file is already pending for close
bool IsPendingClose() const { return pending_close_; }
// Current file should close before adding a new key
void SetPendingClose() { pending_close_ = true; }
// if the outputs have range delete, range delete is also data
bool HasRangeDel() const {
return range_del_agg_ && !range_del_agg_->IsEmpty();
}
private:
friend class SubcompactionState;
void Cleanup() {
if (builder_ != nullptr) {
// May happen if we get a shutdown call in the middle of compaction
builder_->Abandon();
builder_.reset();
}
}
uint64_t GetCurrentOutputFileSize() const {
return current_output_file_size_;
}
// Add curent key from compaction_iterator to the output file. If needed
// close and open new compaction output with the functions provided.
Status AddToOutput(const CompactionIterator& c_iter,
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func);
// Close the current output. `open_file_func` is needed for creating new file
// for range-dels only output file.
Status CloseOutput(const Status& curr_status,
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func) {
Status status = curr_status;
// handle subcompaction containing only range deletions
if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
status = open_file_func(*this);
}
if (HasBuilder()) {
const Slice empty_key{};
Status s = close_file_func(*this, status, empty_key);
if (!s.ok() && status.ok()) {
status = s;
}
}
return status;
}
// This subcompaction's output could be empty if compaction was aborted before
// this subcompaction had a chance to generate any output files. When
// subcompactions are executed sequentially this is more likely and will be
// particularly likely for the later subcompactions to be empty. Once they are
// run in parallel however it should be much rarer.
// It's caller's responsibility to make sure it's not empty.
Output& current_output() {
assert(!outputs_.empty());
return outputs_.back();
}
// Assign the range_del_agg to the target output level. There's only one
// range-del-aggregator per compaction outputs, for
// output_to_penultimate_level compaction it is only assigned to the
// penultimate level.
void AssignRangeDelAggregator(
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
assert(range_del_agg_ == nullptr);
range_del_agg_ = std::move(range_del_agg);
}
const Compaction* compaction_;
// The current file is pending close, which needs to run `close_file_func()`
// first to add a new key.
bool pending_close_ = false;
// current output builder and writer
std::unique_ptr<TableBuilder> builder_;
std::unique_ptr<WritableFileWriter> file_writer_;
uint64_t current_output_file_size_ = 0;
// all the compaction outputs so far
std::vector<Output> outputs_;
// BlobDB info
std::vector<BlobFileAddition> blob_file_additions_;
std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
// Basic compaction output stats for this level's outputs
InternalStats::CompactionOutputsStats stats_;
// indicate if this CompactionOutputs obj for penultimate_level, should always
// be false if per_key_placement feature is not enabled.
const bool is_penultimate_level_;
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
// partitioner information
std::string last_key_for_partitioner_;
std::unique_ptr<SstPartitioner> partitioner_;
};
// helper struct to concatenate the last level and penultimate level outputs
// which could be replaced by std::ranges::join_view() in c++20
struct OutputIterator {
public:
explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
const std::vector<CompactionOutputs::Output>& b)
: a_(a), b_(b) {
within_a = !a_.empty();
idx_ = 0;
}
OutputIterator begin() { return *this; }
OutputIterator end() { return *this; }
size_t size() { return a_.size() + b_.size(); }
const CompactionOutputs::Output& operator*() const {
return within_a ? a_[idx_] : b_[idx_];
}
OutputIterator& operator++() {
idx_++;
if (within_a && idx_ >= a_.size()) {
within_a = false;
idx_ = 0;
}
assert(within_a || idx_ <= b_.size());
return *this;
}
bool operator!=(const OutputIterator& /*rhs*/) const {
return within_a || idx_ < b_.size();
}
private:
const std::vector<CompactionOutputs::Output>& a_;
const std::vector<CompactionOutputs::Output>& b_;
bool within_a;
size_t idx_;
};
} // namespace ROCKSDB_NAMESPACE