rocksdb/db/compaction/compaction_picker_level.cc
sdong a9565ccb26 Try to trivial move more than one files (#10190)
Summary:
In leveled compaction, try to trivial move more than one files if possible, up to 4 files or max_compaction_bytes. This is to allow higher write throughput for some use cases where data is loaded in sequential order, where appying compaction results is the bottleneck.

When pick up a file to compact and it doesn't have overlapping files in the next level, try to expand to the next file if there is still no overlapping.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10190

Test Plan:
Add some unit tests.
For performance, Try to run
./db_bench_multi_move --benchmarks=fillseq --compression_type=lz4 --write_buffer_size=5000000 --num=100000000 --value_size=1000 -level_compaction_dynamic_level_bytes
Together with https://github.com/facebook/rocksdb/pull/10188 , stalling will be eliminated in this benchmark.

Reviewed By: jay-zhuang

Differential Revision: D37230647

fbshipit-source-id: 42b260f545c46abc5d90335ac2bbfcd09602b549
2022-07-05 10:10:37 -07:00

684 lines
26 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_level.h"
#include <string>
#include <utility>
#include <vector>
#include "db/version_edit.h"
#include "logging/log_buffer.h"
#include "test_util/sync_point.h"
namespace ROCKSDB_NAMESPACE {
bool LevelCompactionPicker::NeedsCompaction(
const VersionStorageInfo* vstorage) const {
if (!vstorage->ExpiredTtlFiles().empty()) {
return true;
}
if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
return true;
}
if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
return true;
}
if (!vstorage->FilesMarkedForCompaction().empty()) {
return true;
}
if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
return true;
}
for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
if (vstorage->CompactionScore(i) >= 1) {
return true;
}
}
return false;
}
namespace {
// A class to build a leveled compaction step-by-step.
class LevelCompactionBuilder {
public:
LevelCompactionBuilder(const std::string& cf_name,
VersionStorageInfo* vstorage,
SequenceNumber earliest_mem_seqno,
CompactionPicker* compaction_picker,
LogBuffer* log_buffer,
const MutableCFOptions& mutable_cf_options,
const ImmutableOptions& ioptions,
const MutableDBOptions& mutable_db_options)
: cf_name_(cf_name),
vstorage_(vstorage),
earliest_mem_seqno_(earliest_mem_seqno),
compaction_picker_(compaction_picker),
log_buffer_(log_buffer),
mutable_cf_options_(mutable_cf_options),
ioptions_(ioptions),
mutable_db_options_(mutable_db_options) {}
// Pick and return a compaction.
Compaction* PickCompaction();
// Pick the initial files to compact to the next level. (or together
// in Intra-L0 compactions)
void SetupInitialFiles();
// If the initial files are from L0 level, pick other L0
// files if needed.
bool SetupOtherL0FilesIfNeeded();
// Based on initial files, setup other files need to be compacted
// in this compaction, accordingly.
bool SetupOtherInputsIfNeeded();
Compaction* GetCompaction();
// For the specfied level, pick a file that we want to compact.
// Returns false if there is no file to compact.
// If it returns true, inputs->files.size() will be exactly one.
// If level is 0 and there is already a compaction on that level, this
// function will return false.
bool PickFileToCompact();
// Return true if a L0 trivial move is picked up.
bool TryPickL0TrivialMove();
// For L0->L0, picks the longest span of files that aren't currently
// undergoing compaction for which work-per-deleted-file decreases. The span
// always starts from the newest L0 file.
//
// Intra-L0 compaction is independent of all other files, so it can be
// performed even when L0->base_level compactions are blocked.
//
// Returns true if `inputs` is populated with a span of files to be compacted;
// otherwise, returns false.
bool PickIntraL0Compaction();
// Return true if TrivialMove is extended. `start_index` is the index of
// the intiial file picked, which should already be in `start_level_inputs_`.
bool TryExtendNonL0TrivialMove(int start_index);
// Picks a file from level_files to compact.
// level_files is a vector of (level, file metadata) in ascending order of
// level. If compact_to_next_level is true, compact the file to the next
// level, otherwise, compact to the same level as the input file.
void PickFileToCompact(
const autovector<std::pair<int, FileMetaData*>>& level_files,
bool compact_to_next_level);
const std::string& cf_name_;
VersionStorageInfo* vstorage_;
SequenceNumber earliest_mem_seqno_;
CompactionPicker* compaction_picker_;
LogBuffer* log_buffer_;
int start_level_ = -1;
int output_level_ = -1;
int parent_index_ = -1;
int base_index_ = -1;
double start_level_score_ = 0;
bool is_manual_ = false;
bool is_l0_trivial_move_ = false;
CompactionInputFiles start_level_inputs_;
std::vector<CompactionInputFiles> compaction_inputs_;
CompactionInputFiles output_level_inputs_;
std::vector<FileMetaData*> grandparents_;
CompactionReason compaction_reason_ = CompactionReason::kUnknown;
const MutableCFOptions& mutable_cf_options_;
const ImmutableOptions& ioptions_;
const MutableDBOptions& mutable_db_options_;
// Pick a path ID to place a newly generated file, with its level
static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options,
int level);
static const int kMinFilesForIntraL0Compaction = 4;
};
void LevelCompactionBuilder::PickFileToCompact(
const autovector<std::pair<int, FileMetaData*>>& level_files,
bool compact_to_next_level) {
for (auto& level_file : level_files) {
// If it's being compacted it has nothing to do here.
// If this assert() fails that means that some function marked some
// files as being_compacted, but didn't call ComputeCompactionScore()
assert(!level_file.second->being_compacted);
start_level_ = level_file.first;
if ((compact_to_next_level &&
start_level_ == vstorage_->num_non_empty_levels() - 1) ||
(start_level_ == 0 &&
!compaction_picker_->level0_compactions_in_progress()->empty())) {
continue;
}
if (compact_to_next_level) {
output_level_ =
(start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
} else {
output_level_ = start_level_;
}
start_level_inputs_.files = {level_file.second};
start_level_inputs_.level = start_level_;
if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
&start_level_inputs_)) {
return;
}
}
start_level_inputs_.files.clear();
}
void LevelCompactionBuilder::SetupInitialFiles() {
// Find the compactions by size on all levels.
bool skipped_l0_to_base = false;
for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
start_level_score_ = vstorage_->CompactionScore(i);
start_level_ = vstorage_->CompactionScoreLevel(i);
assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
if (start_level_score_ >= 1) {
if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
// If L0->base_level compaction is pending, don't schedule further
// compaction from base level. Otherwise L0->base_level compaction
// may starve.
continue;
}
output_level_ =
(start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
if (PickFileToCompact()) {
// found the compaction!
if (start_level_ == 0) {
// L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
compaction_reason_ = CompactionReason::kLevelL0FilesNum;
} else {
// L1+ score = `Level files size` / `MaxBytesForLevel`
compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
}
break;
} else {
// didn't find the compaction, clear the inputs
start_level_inputs_.clear();
if (start_level_ == 0) {
skipped_l0_to_base = true;
// L0->base_level may be blocked due to ongoing L0->base_level
// compactions. It may also be blocked by an ongoing compaction from
// base_level downwards.
//
// In these cases, to reduce L0 file count and thus reduce likelihood
// of write stalls, we can attempt compacting a span of files within
// L0.
if (PickIntraL0Compaction()) {
output_level_ = 0;
compaction_reason_ = CompactionReason::kLevelL0FilesNum;
break;
}
}
}
} else {
// Compaction scores are sorted in descending order, no further scores
// will be >= 1.
break;
}
}
if (!start_level_inputs_.empty()) {
return;
}
// if we didn't find a compaction, check if there are any files marked for
// compaction
parent_index_ = base_index_ = -1;
compaction_picker_->PickFilesMarkedForCompaction(
cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
return;
}
// Bottommost Files Compaction on deleting tombstones
PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kBottommostFiles;
return;
}
// TTL Compaction
PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kTtl;
return;
}
// Periodic Compaction
PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kPeriodicCompaction;
return;
}
// Forced blob garbage collection
PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kForcedBlobGC;
return;
}
}
bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
return compaction_picker_->GetOverlappingL0Files(
vstorage_, &start_level_inputs_, output_level_, &parent_index_);
}
return true;
}
bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
// Setup input files from output level. For output to L0, we only compact
// spans of files that do not interact with any pending compactions, so don't
// need to consider other levels.
if (output_level_ != 0) {
output_level_inputs_.level = output_level_;
if (!is_l0_trivial_move_ &&
!compaction_picker_->SetupOtherInputs(
cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
&output_level_inputs_, &parent_index_, base_index_)) {
return false;
}
compaction_inputs_.push_back(start_level_inputs_);
if (!output_level_inputs_.empty()) {
compaction_inputs_.push_back(output_level_inputs_);
}
if (!is_l0_trivial_move_) {
// In some edge cases we could pick a compaction that will be compacting
// a key range that overlap with another running compaction, and both
// of them have the same output level. This could happen if
// (1) we are running a non-exclusive manual compaction
// (2) AddFile ingest a new file into the LSM tree
// We need to disallow this from happening.
if (compaction_picker_->FilesRangeOverlapWithCompaction(
compaction_inputs_, output_level_)) {
// This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it.
return false;
}
compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
output_level_inputs_, &grandparents_);
}
} else {
compaction_inputs_.push_back(start_level_inputs_);
}
return true;
}
Compaction* LevelCompactionBuilder::PickCompaction() {
// Pick up the first file to start compaction. It may have been extended
// to a clean cut.
SetupInitialFiles();
if (start_level_inputs_.empty()) {
return nullptr;
}
assert(start_level_ >= 0 && output_level_ >= 0);
// If it is a L0 -> base level compaction, we need to set up other L0
// files if needed.
if (!SetupOtherL0FilesIfNeeded()) {
return nullptr;
}
// Pick files in the output level and expand more files in the start level
// if needed.
if (!SetupOtherInputsIfNeeded()) {
return nullptr;
}
// Form a compaction object containing the files we picked.
Compaction* c = GetCompaction();
TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
return c;
}
Compaction* LevelCompactionBuilder::GetCompaction() {
auto c = new Compaction(
vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
std::move(compaction_inputs_), output_level_,
MaxFileSizeForLevel(mutable_cf_options_, output_level_,
ioptions_.compaction_style, vstorage_->base_level(),
ioptions_.level_compaction_dynamic_level_bytes),
mutable_cf_options_.max_compaction_bytes,
GetPathId(ioptions_, mutable_cf_options_, output_level_),
GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
vstorage_->base_level()),
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
Temperature::kUnknown,
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
compaction_reason_);
// If it's level 0 compaction, make sure we don't execute any other level 0
// compactions in parallel
compaction_picker_->RegisterCompaction(c);
// Creating a compaction influences the compaction score because the score
// takes running compactions into account (by skipping files that are already
// being compacted). Since we just changed compaction score, we recalculate it
// here
vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
return c;
}
/*
* Find the optimal path to place a file
* Given a level, finds the path where levels up to it will fit in levels
* up to and including this path
*/
uint32_t LevelCompactionBuilder::GetPathId(
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options, int level) {
uint32_t p = 0;
assert(!ioptions.cf_paths.empty());
// size remaining in the most recent path
uint64_t current_path_size = ioptions.cf_paths[0].target_size;
uint64_t level_size;
int cur_level = 0;
// max_bytes_for_level_base denotes L1 size.
// We estimate L0 size to be the same as L1.
level_size = mutable_cf_options.max_bytes_for_level_base;
// Last path is the fallback
while (p < ioptions.cf_paths.size() - 1) {
if (level_size <= current_path_size) {
if (cur_level == level) {
// Does desired level fit in this path?
return p;
} else {
current_path_size -= level_size;
if (cur_level > 0) {
if (ioptions.level_compaction_dynamic_level_bytes) {
// Currently, level_compaction_dynamic_level_bytes is ignored when
// multiple db paths are specified. https://github.com/facebook/
// rocksdb/blob/main/db/column_family.cc.
// Still, adding this check to avoid accidentally using
// max_bytes_for_level_multiplier_additional
level_size = static_cast<uint64_t>(
level_size * mutable_cf_options.max_bytes_for_level_multiplier);
} else {
level_size = static_cast<uint64_t>(
level_size * mutable_cf_options.max_bytes_for_level_multiplier *
mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
}
}
cur_level++;
continue;
}
}
p++;
current_path_size = ioptions.cf_paths[p].target_size;
}
return p;
}
bool LevelCompactionBuilder::TryPickL0TrivialMove() {
if (vstorage_->base_level() <= 0) {
return false;
}
if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
!vstorage_->LevelFiles(output_level_).empty() &&
ioptions_.db_paths.size() <= 1) {
// Try to pick trivial move from L0 to L1. We start from the oldest
// file. We keep expanding to newer files if it would form a
// trivial move.
// For now we don't support it with
// mutable_cf_options_.compression_per_level to prevent the logic
// of determining whether L0 can be trivial moved to the next level.
// We skip the case where output level is empty, since in this case, at
// least the oldest file would qualify for trivial move, and this would
// be a surprising behavior with few benefits.
// We search from the oldest file from the newest. In theory, there are
// files in the middle can form trivial move too, but it is probably
// uncommon and we ignore these cases for simplicity.
const std::vector<FileMetaData*>& level_files =
vstorage_->LevelFiles(start_level_);
InternalKey my_smallest, my_largest;
for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
CompactionInputFiles output_level_inputs;
output_level_inputs.level = output_level_;
FileMetaData* file = *it;
if (it == level_files.rbegin()) {
my_smallest = file->smallest;
my_largest = file->largest;
} else {
if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
0) {
my_smallest = file->smallest;
} else if (compaction_picker_->icmp()->Compare(file->smallest,
my_largest) > 0) {
my_largest = file->largest;
} else {
break;
}
}
vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
assert(!file->being_compacted);
start_level_inputs_.files.push_back(file);
} else {
break;
}
}
}
if (!start_level_inputs_.empty()) {
// Sort files by key range. Not sure it's 100% necessary but it's cleaner
// to always keep files sorted by key the key ranges don't overlap.
std::sort(start_level_inputs_.files.begin(),
start_level_inputs_.files.end(),
[icmp = compaction_picker_->icmp()](FileMetaData* f1,
FileMetaData* f2) -> bool {
return (icmp->Compare(f1->smallest, f2->smallest) < 0);
});
is_l0_trivial_move_ = true;
return true;
}
return false;
}
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
if (start_level_inputs_.size() == 1 &&
(ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
(mutable_cf_options_.compression_per_level.empty())) {
// Only file of `index`, and it is likely a trivial move. Try to
// expand if it is still a trivial move, but not beyond
// max_compaction_bytes or 4 files, so that we don't create too
// much compaction pressure for the next level.
// Ignore if there are more than one DB path, as it would be hard
// to predict whether it is a trivial move.
const std::vector<FileMetaData*>& level_files =
vstorage_->LevelFiles(start_level_);
const size_t kMaxMultiTrivialMove = 4;
FileMetaData* initial_file = start_level_inputs_.files[0];
size_t total_size = initial_file->fd.GetFileSize();
CompactionInputFiles output_level_inputs;
output_level_inputs.level = output_level_;
for (int i = start_index + 1;
i < static_cast<int>(level_files.size()) &&
start_level_inputs_.size() < kMaxMultiTrivialMove;
i++) {
FileMetaData* next_file = level_files[i];
if (next_file->being_compacted) {
break;
}
vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
&(next_file->largest),
&output_level_inputs.files);
if (!output_level_inputs.empty()) {
break;
}
if (i < static_cast<int>(level_files.size()) - 1 &&
compaction_picker_->icmp()->user_comparator()->Compare(
next_file->largest.user_key(),
level_files[i + 1]->smallest.user_key()) == 0) {
// Not a clean up after adding the next file. Skip.
break;
}
total_size += next_file->fd.GetFileSize();
if (total_size > mutable_cf_options_.max_compaction_bytes) {
break;
}
start_level_inputs_.files.push_back(next_file);
}
return start_level_inputs_.size() > 1;
}
return false;
}
bool LevelCompactionBuilder::PickFileToCompact() {
// level 0 files are overlapping. So we cannot pick more
// than one concurrent compactions at this level. This
// could be made better by looking at key-ranges that are
// being compacted at level 0.
if (start_level_ == 0 &&
!compaction_picker_->level0_compactions_in_progress()->empty()) {
TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
return false;
}
start_level_inputs_.clear();
start_level_inputs_.level = start_level_;
assert(start_level_ >= 0);
if (TryPickL0TrivialMove()) {
return true;
}
const std::vector<FileMetaData*>& level_files =
vstorage_->LevelFiles(start_level_);
// Pick the file with the highest score in this level that is not already
// being compacted.
const std::vector<int>& file_scores =
vstorage_->FilesByCompactionPri(start_level_);
unsigned int cmp_idx;
for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
cmp_idx < file_scores.size(); cmp_idx++) {
int index = file_scores[cmp_idx];
auto* f = level_files[index];
// do not pick a file to compact if it is being compacted
// from n-1 level.
if (f->being_compacted) {
if (ioptions_.compaction_pri == kRoundRobin) {
// TODO(zichen): this file may be involved in one compaction from
// an upper level, cannot advance the cursor for round-robin policy.
// Currently, we do not pick any file to compact in this case. We
// should fix this later to ensure a compaction is picked but the
// cursor shall not be advanced.
return false;
}
continue;
}
start_level_inputs_.files.push_back(f);
if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
&start_level_inputs_) ||
compaction_picker_->FilesRangeOverlapWithCompaction(
{start_level_inputs_}, output_level_)) {
// A locked (pending compaction) input-level file was pulled in due to
// user-key overlap.
start_level_inputs_.clear();
// To ensure every file is selcted in a round-robin manner, we cannot
// skip the current file. So we return false and wait for the next time
// we can pick this file to compact
if (ioptions_.compaction_pri == kRoundRobin) {
return false;
}
continue;
}
// Now that input level is fully expanded, we check whether any output
// files are locked due to pending compaction.
//
// Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
// level files are locked, not just the extra ones pulled in for user-key
// overlap.
InternalKey smallest, largest;
compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
CompactionInputFiles output_level_inputs;
output_level_inputs.level = output_level_;
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
if (TryExtendNonL0TrivialMove(index)) {
break;
}
} else {
if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
&output_level_inputs)) {
start_level_inputs_.clear();
if (ioptions_.compaction_pri == kRoundRobin) {
return false;
}
continue;
}
}
base_index_ = index;
break;
}
// store where to start the iteration in the next call to PickCompaction
if (ioptions_.compaction_pri != kRoundRobin) {
vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
}
return start_level_inputs_.size() > 0;
}
bool LevelCompactionBuilder::PickIntraL0Compaction() {
start_level_inputs_.clear();
const std::vector<FileMetaData*>& level_files =
vstorage_->LevelFiles(0 /* level */);
if (level_files.size() <
static_cast<size_t>(
mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
level_files[0]->being_compacted) {
// If L0 isn't accumulating much files beyond the regular trigger, don't
// resort to L0->L0 compaction yet.
return false;
}
return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
std::numeric_limits<uint64_t>::max(),
mutable_cf_options_.max_compaction_bytes,
&start_level_inputs_, earliest_mem_seqno_);
}
} // namespace
Compaction* LevelCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
log_buffer, mutable_cf_options, ioptions_,
mutable_db_options);
return builder.PickCompaction();
}
} // namespace ROCKSDB_NAMESPACE