rocksdb/db/compaction/compaction_outputs.h

386 lines
13 KiB
C
Raw Normal View History

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/blob/blob_garbage_meter.h"
#include "db/compaction/compaction.h"
#include "db/compaction/compaction_iterator.h"
#include "db/internal_stats.h"
#include "db/output_validator.h"
namespace ROCKSDB_NAMESPACE {
class CompactionOutputs;
using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
using CompactionFileCloseFunc =
std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
// Files produced by subcompaction, most of the functions are used by
// compaction_job Open/Close compaction file functions.
class CompactionOutputs {
public:
// compaction output file
struct Output {
Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
bool _enable_order_check, bool _enable_hash, bool _finished,
uint64_t precalculated_hash)
: meta(std::move(_meta)),
validator(_icmp, _enable_order_check, _enable_hash,
precalculated_hash),
finished(_finished) {}
FileMetaData meta;
OutputValidator validator;
bool finished;
std::shared_ptr<const TableProperties> table_properties;
};
CompactionOutputs() = delete;
explicit CompactionOutputs(const Compaction* compaction,
const bool is_penultimate_level);
// Add generated output to the list
void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
bool enable_order_check, bool enable_hash,
bool finished = false, uint64_t precalculated_hash = 0) {
outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
enable_hash, finished, precalculated_hash);
}
// Set new table builder for the current output
void NewBuilder(const TableBuilderOptions& tboptions);
// Assign a new WritableFileWriter to the current output
void AssignFileWriter(WritableFileWriter* writer) {
file_writer_.reset(writer);
}
// TODO: Remove it when remote compaction support tiered compaction
void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
// TODO: Move the BlobDB builder into CompactionOutputs
const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
if (is_penultimate_level_) {
assert(blob_file_additions_.empty());
}
return blob_file_additions_;
}
std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
assert(!is_penultimate_level_);
return &blob_file_additions_;
}
bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
BlobGarbageMeter* CreateBlobGarbageMeter() {
assert(!is_penultimate_level_);
blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
return blob_garbage_meter_.get();
}
BlobGarbageMeter* GetBlobGarbageMeter() const {
if (is_penultimate_level_) {
// blobdb doesn't support per_key_placement yet
assert(blob_garbage_meter_ == nullptr);
return nullptr;
}
return blob_garbage_meter_.get();
}
void UpdateBlobStats() {
assert(!is_penultimate_level_);
stats_.num_output_files_blob = blob_file_additions_.size();
for (const auto& blob : blob_file_additions_) {
stats_.bytes_written_blob += blob.GetTotalBlobBytes();
}
}
// Finish the current output file
Status Finish(const Status& intput_status,
const SeqnoToTimeMapping& seqno_time_mapping);
// Update output table properties from table builder
void UpdateTableProperties() {
current_output().table_properties =
std::make_shared<TableProperties>(GetTableProperties());
}
IOStatus WriterSyncClose(const Status& intput_status, SystemClock* clock,
Statistics* statistics, bool use_fsync);
TableProperties GetTableProperties() {
return builder_->GetTableProperties();
}
Slice SmallestUserKey() const {
if (!outputs_.empty() && outputs_[0].finished) {
return outputs_[0].meta.smallest.user_key();
} else {
return Slice{nullptr, 0};
}
}
Slice LargestUserKey() const {
if (!outputs_.empty() && outputs_.back().finished) {
return outputs_.back().meta.largest.user_key();
} else {
return Slice{nullptr, 0};
}
}
// In case the last output file is empty, which doesn't need to keep.
void RemoveLastEmptyOutput() {
if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
// An error occurred, so ignore the last output.
outputs_.pop_back();
}
}
// Remove the last output, for example the last output doesn't have data (no
// entry and no range-dels), but file_size might not be 0, as it has SST
// metadata.
void RemoveLastOutput() {
assert(!outputs_.empty());
outputs_.pop_back();
}
bool HasBuilder() const { return builder_ != nullptr; }
FileMetaData* GetMetaData() { return &current_output().meta; }
bool HasOutput() const { return !outputs_.empty(); }
uint64_t NumEntries() const { return builder_->NumEntries(); }
void ResetBuilder() {
builder_.reset();
current_output_file_size_ = 0;
}
// Add range-dels from the aggregator to the current output file
User-defined timestamp support for `DeleteRange()` (#10661) Summary: Add user-defined timestamp support for range deletion. The new API is `DeleteRange(opt, cf, begin_key, end_key, ts)`. Most of the change is to update the comparator to compare without timestamp. Other than that, major changes are - internal range tombstone data structures (`FragmentedRangeTombstoneList`, `RangeTombstone`, etc.) to store timestamps. - Garbage collection of range tombstones and range tombstone covered keys during compaction. - Get()/MultiGet() to return the timestamp of a range tombstone when needed. - Get/Iterator with range tombstones bounded by readoptions.timestamp. - timestamp crash test now issues DeleteRange by default. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10661 Test Plan: - Added unit test: `make check` - Stress test: `python3 tools/db_crashtest.py --enable_ts whitebox --readpercent=57 --prefixpercent=4 --writepercent=25 -delpercent=5 --iterpercent=5 --delrangepercent=4` - Ran `db_bench` to measure regression when timestamp is not enabled. The tests are for write (with some range deletion) and iterate with DB fitting in memory: `./db_bench--benchmarks=fillrandom,seekrandom --writes_per_range_tombstone=200 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=500000 --reads=500000 --seek_nexts=10 --disable_auto_compactions -disable_wal=true --max_num_range_tombstones=1000`. Did not see consistent regression in no timestamp case. | micros/op | fillrandom | seekrandom | | --- | --- | --- | |main| 2.58 |10.96| |PR 10661| 2.68 |10.63| Reviewed By: riversand963 Differential Revision: D39441192 Pulled By: cbi42 fbshipit-source-id: f05aca3c41605caf110daf0ff405919f300ddec2
2022-09-30 23:13:03 +00:00
// @param comp_start_user_key and comp_end_user_key include timestamp if
// user-defined timestamp is enabled.
// @param full_history_ts_low used for range tombstone garbage collection.
Status AddRangeDels(const Slice* comp_start_user_key,
const Slice* comp_end_user_key,
CompactionIterationStats& range_del_out_stats,
bool bottommost_level, const InternalKeyComparator& icmp,
SequenceNumber earliest_snapshot,
User-defined timestamp support for `DeleteRange()` (#10661) Summary: Add user-defined timestamp support for range deletion. The new API is `DeleteRange(opt, cf, begin_key, end_key, ts)`. Most of the change is to update the comparator to compare without timestamp. Other than that, major changes are - internal range tombstone data structures (`FragmentedRangeTombstoneList`, `RangeTombstone`, etc.) to store timestamps. - Garbage collection of range tombstones and range tombstone covered keys during compaction. - Get()/MultiGet() to return the timestamp of a range tombstone when needed. - Get/Iterator with range tombstones bounded by readoptions.timestamp. - timestamp crash test now issues DeleteRange by default. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10661 Test Plan: - Added unit test: `make check` - Stress test: `python3 tools/db_crashtest.py --enable_ts whitebox --readpercent=57 --prefixpercent=4 --writepercent=25 -delpercent=5 --iterpercent=5 --delrangepercent=4` - Ran `db_bench` to measure regression when timestamp is not enabled. The tests are for write (with some range deletion) and iterate with DB fitting in memory: `./db_bench--benchmarks=fillrandom,seekrandom --writes_per_range_tombstone=200 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=500000 --reads=500000 --seek_nexts=10 --disable_auto_compactions -disable_wal=true --max_num_range_tombstones=1000`. Did not see consistent regression in no timestamp case. | micros/op | fillrandom | seekrandom | | --- | --- | --- | |main| 2.58 |10.96| |PR 10661| 2.68 |10.63| Reviewed By: riversand963 Differential Revision: D39441192 Pulled By: cbi42 fbshipit-source-id: f05aca3c41605caf110daf0ff405919f300ddec2
2022-09-30 23:13:03 +00:00
const Slice& next_table_min_key,
const std::string& full_history_ts_low);
// if the outputs have range delete, range delete is also data
bool HasRangeDel() const {
return range_del_agg_ && !range_del_agg_->IsEmpty();
}
private:
friend class SubcompactionState;
void FillFilesToCutForTtl();
void SetOutputSlitKey(const std::optional<Slice> start,
const std::optional<Slice> end) {
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
// Invalid output_split_key indicates that we do not need to split
if (output_split_key != nullptr) {
// We may only split the output when the cursor is in the range. Split
if ((!end.has_value() ||
icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
(!start.has_value() || icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()),
start.value()) > 0)) {
local_output_split_key_ = output_split_key;
}
}
}
// Returns true iff we should stop building the current output
// before processing the current key in compaction iterator.
bool ShouldStopBefore(const CompactionIterator& c_iter);
void Cleanup() {
if (builder_ != nullptr) {
// May happen if we get a shutdown call in the middle of compaction
builder_->Abandon();
builder_.reset();
}
}
Align compaction output file boundaries to the next level ones (#10655) Summary: Try to align the compaction output file boundaries to the next level ones (grandparent level), to reduce the level compaction write-amplification. In level compaction, there are "wasted" data at the beginning and end of the output level files. Align the file boundary can avoid such "wasted" compaction. With this PR, it tries to align the non-bottommost level file boundaries to its next level ones. It may cut file when the file size is large enough (at least 50% of target_file_size) and not too large (2x target_file_size). db_bench shows about 12.56% compaction reduction: ``` TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432 # baseline: Flush(GB): cumulative 25.882, interval 7.216 Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds # with this change: Flush(GB): cumulative 25.882, interval 7.753 Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds ``` The compaction simulator shows a similar result (14% with 100G random data). As a side effect, with this PR, the SST file size can exceed the target_file_size, but is capped at 2x target_file_size. And there will be smaller files. Here are file size statistics when loading 100GB with the target file size 32MB: ``` baseline this_PR count 1.656000e+03 1.705000e+03 mean 3.116062e+07 3.028076e+07 std 7.145242e+06 8.046139e+06 ``` The feature is enabled by default, to revert to the old behavior disable it with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false` Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for use case like user adding 2 or more non-overlapping data range at the same time, it can reduce the overlapping of 2 datasets in the lower levels. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655 Reviewed By: cbi42 Differential Revision: D39552321 Pulled By: jay-zhuang fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2022-09-30 02:43:55 +00:00
// update tracked grandparents information like grandparent index, if it's
// in the gap between 2 grandparent files, accumulated grandparent files size
// etc.
// It returns how many boundaries it crosses by including current key.
size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
// helper function to get the overlapped grandparent files size, it's only
// used for calculating the first key's overlap.
uint64_t GetCurrentKeyGrandparentOverlappedBytes(
const Slice& internal_key) const;
// Add current key from compaction_iterator to the output file. If needed
// close and open new compaction output with the functions provided.
Status AddToOutput(const CompactionIterator& c_iter,
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func);
// Close the current output. `open_file_func` is needed for creating new file
// for range-dels only output file.
Status CloseOutput(const Status& curr_status,
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func) {
Status status = curr_status;
// handle subcompaction containing only range deletions
if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
status = open_file_func(*this);
}
if (HasBuilder()) {
const Slice empty_key{};
Status s = close_file_func(*this, status, empty_key);
if (!s.ok() && status.ok()) {
status = s;
}
}
return status;
}
// This subcompaction's output could be empty if compaction was aborted before
// this subcompaction had a chance to generate any output files. When
// subcompactions are executed sequentially this is more likely and will be
// particularly likely for the later subcompactions to be empty. Once they are
// run in parallel however it should be much rarer.
// It's caller's responsibility to make sure it's not empty.
Output& current_output() {
assert(!outputs_.empty());
return outputs_.back();
}
// Assign the range_del_agg to the target output level. There's only one
// range-del-aggregator per compaction outputs, for
// output_to_penultimate_level compaction it is only assigned to the
// penultimate level.
void AssignRangeDelAggregator(
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
assert(range_del_agg_ == nullptr);
range_del_agg_ = std::move(range_del_agg);
}
const Compaction* compaction_;
// current output builder and writer
std::unique_ptr<TableBuilder> builder_;
std::unique_ptr<WritableFileWriter> file_writer_;
uint64_t current_output_file_size_ = 0;
// all the compaction outputs so far
std::vector<Output> outputs_;
// BlobDB info
std::vector<BlobFileAddition> blob_file_additions_;
std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
// Basic compaction output stats for this level's outputs
InternalStats::CompactionOutputsStats stats_;
// indicate if this CompactionOutputs obj for penultimate_level, should always
// be false if per_key_placement feature is not enabled.
const bool is_penultimate_level_;
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
// partitioner information
std::string last_key_for_partitioner_;
std::unique_ptr<SstPartitioner> partitioner_;
// A flag determines if this subcompaction has been split by the cursor
bool is_split_ = false;
// We also maintain the output split key for each subcompaction to avoid
// repetitive comparison in ShouldStopBefore()
const InternalKey* local_output_split_key_ = nullptr;
// Some identified files with old oldest ancester time and the range should be
// isolated out so that the output file(s) in that range can be merged down
// for TTL and clear the timestamps for the range.
std::vector<FileMetaData*> files_to_cut_for_ttl_;
int cur_files_to_cut_for_ttl_ = -1;
int next_files_to_cut_for_ttl_ = 0;
// An index that used to speed up ShouldStopBefore().
size_t grandparent_index_ = 0;
Align compaction output file boundaries to the next level ones (#10655) Summary: Try to align the compaction output file boundaries to the next level ones (grandparent level), to reduce the level compaction write-amplification. In level compaction, there are "wasted" data at the beginning and end of the output level files. Align the file boundary can avoid such "wasted" compaction. With this PR, it tries to align the non-bottommost level file boundaries to its next level ones. It may cut file when the file size is large enough (at least 50% of target_file_size) and not too large (2x target_file_size). db_bench shows about 12.56% compaction reduction: ``` TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432 # baseline: Flush(GB): cumulative 25.882, interval 7.216 Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds # with this change: Flush(GB): cumulative 25.882, interval 7.753 Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds ``` The compaction simulator shows a similar result (14% with 100G random data). As a side effect, with this PR, the SST file size can exceed the target_file_size, but is capped at 2x target_file_size. And there will be smaller files. Here are file size statistics when loading 100GB with the target file size 32MB: ``` baseline this_PR count 1.656000e+03 1.705000e+03 mean 3.116062e+07 3.028076e+07 std 7.145242e+06 8.046139e+06 ``` The feature is enabled by default, to revert to the old behavior disable it with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false` Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for use case like user adding 2 or more non-overlapping data range at the same time, it can reduce the overlapping of 2 datasets in the lower levels. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655 Reviewed By: cbi42 Differential Revision: D39552321 Pulled By: jay-zhuang fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2022-09-30 02:43:55 +00:00
// if the output key is being grandparent files gap, so:
// key > grandparents[grandparent_index_ - 1].largest &&
// key < grandparents[grandparent_index_].smallest
bool being_grandparent_gap_ = true;
// The number of bytes overlapping between the current output and
// grandparent files used in ShouldStopBefore().
Align compaction output file boundaries to the next level ones (#10655) Summary: Try to align the compaction output file boundaries to the next level ones (grandparent level), to reduce the level compaction write-amplification. In level compaction, there are "wasted" data at the beginning and end of the output level files. Align the file boundary can avoid such "wasted" compaction. With this PR, it tries to align the non-bottommost level file boundaries to its next level ones. It may cut file when the file size is large enough (at least 50% of target_file_size) and not too large (2x target_file_size). db_bench shows about 12.56% compaction reduction: ``` TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432 # baseline: Flush(GB): cumulative 25.882, interval 7.216 Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds # with this change: Flush(GB): cumulative 25.882, interval 7.753 Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds ``` The compaction simulator shows a similar result (14% with 100G random data). As a side effect, with this PR, the SST file size can exceed the target_file_size, but is capped at 2x target_file_size. And there will be smaller files. Here are file size statistics when loading 100GB with the target file size 32MB: ``` baseline this_PR count 1.656000e+03 1.705000e+03 mean 3.116062e+07 3.028076e+07 std 7.145242e+06 8.046139e+06 ``` The feature is enabled by default, to revert to the old behavior disable it with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false` Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for use case like user adding 2 or more non-overlapping data range at the same time, it can reduce the overlapping of 2 datasets in the lower levels. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655 Reviewed By: cbi42 Differential Revision: D39552321 Pulled By: jay-zhuang fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2022-09-30 02:43:55 +00:00
uint64_t grandparent_overlapped_bytes_ = 0;
// A flag determines whether the key has been seen in ShouldStopBefore()
bool seen_key_ = false;
Align compaction output file boundaries to the next level ones (#10655) Summary: Try to align the compaction output file boundaries to the next level ones (grandparent level), to reduce the level compaction write-amplification. In level compaction, there are "wasted" data at the beginning and end of the output level files. Align the file boundary can avoid such "wasted" compaction. With this PR, it tries to align the non-bottommost level file boundaries to its next level ones. It may cut file when the file size is large enough (at least 50% of target_file_size) and not too large (2x target_file_size). db_bench shows about 12.56% compaction reduction: ``` TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432 # baseline: Flush(GB): cumulative 25.882, interval 7.216 Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds # with this change: Flush(GB): cumulative 25.882, interval 7.753 Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds ``` The compaction simulator shows a similar result (14% with 100G random data). As a side effect, with this PR, the SST file size can exceed the target_file_size, but is capped at 2x target_file_size. And there will be smaller files. Here are file size statistics when loading 100GB with the target file size 32MB: ``` baseline this_PR count 1.656000e+03 1.705000e+03 mean 3.116062e+07 3.028076e+07 std 7.145242e+06 8.046139e+06 ``` The feature is enabled by default, to revert to the old behavior disable it with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false` Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for use case like user adding 2 or more non-overlapping data range at the same time, it can reduce the overlapping of 2 datasets in the lower levels. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655 Reviewed By: cbi42 Differential Revision: D39552321 Pulled By: jay-zhuang fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2022-09-30 02:43:55 +00:00
// for the current output file, how many file boundaries has it crossed,
// basically number of files overlapped * 2
size_t grandparent_boundary_switched_num_ = 0;
};
// helper struct to concatenate the last level and penultimate level outputs
// which could be replaced by std::ranges::join_view() in c++20
struct OutputIterator {
public:
explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
const std::vector<CompactionOutputs::Output>& b)
: a_(a), b_(b) {
within_a = !a_.empty();
idx_ = 0;
}
OutputIterator begin() { return *this; }
OutputIterator end() { return *this; }
size_t size() { return a_.size() + b_.size(); }
const CompactionOutputs::Output& operator*() const {
return within_a ? a_[idx_] : b_[idx_];
}
OutputIterator& operator++() {
idx_++;
if (within_a && idx_ >= a_.size()) {
within_a = false;
idx_ = 0;
}
assert(within_a || idx_ <= b_.size());
return *this;
}
bool operator!=(const OutputIterator& /*rhs*/) const {
return within_a || idx_ < b_.size();
}
private:
const std::vector<CompactionOutputs::Output>& a_;
const std::vector<CompactionOutputs::Output>& b_;
bool within_a;
size_t idx_;
};
} // namespace ROCKSDB_NAMESPACE