mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 16:30:56 +00:00
8be824e316
Summary: In leveled compaction, we pick intra-L0 compaction instead of L0->Lbase whenever L0 size is small. When L0 files contain many deletions, it makes more sense to compact then down instead of accumulating tombstones in L0. This PR uses compensated_file_size when computing L0 size for determining intra-L0 compaction. Also scale down the limit on total L0 size further to be more cautious about accumulating data in L0. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12878 Test Plan: updated unit test. Reviewed By: hx235 Differential Revision: D59932421 Pulled By: cbi42 fbshipit-source-id: 9de973ac51eb7df81b38b8c68110072b1aa06321
978 lines
37 KiB
C++
978 lines
37 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "db/compaction/compaction_picker_level.h"
|
|
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "db/version_edit.h"
|
|
#include "logging/log_buffer.h"
|
|
#include "test_util/sync_point.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
bool LevelCompactionPicker::NeedsCompaction(
|
|
const VersionStorageInfo* vstorage) const {
|
|
if (!vstorage->ExpiredTtlFiles().empty()) {
|
|
return true;
|
|
}
|
|
if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
|
|
return true;
|
|
}
|
|
if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
|
|
return true;
|
|
}
|
|
if (!vstorage->FilesMarkedForCompaction().empty()) {
|
|
return true;
|
|
}
|
|
if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
|
|
return true;
|
|
}
|
|
for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
|
|
if (vstorage->CompactionScore(i) >= 1) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
namespace {
|
|
|
|
enum class CompactToNextLevel {
|
|
kNo, // compact to the same level as the input file
|
|
kYes, // compact to the next level except the last level to the same level
|
|
kSkipLastLevel, // compact to the next level but skip the last level
|
|
};
|
|
|
|
// A class to build a leveled compaction step-by-step.
|
|
class LevelCompactionBuilder {
|
|
public:
|
|
LevelCompactionBuilder(const std::string& cf_name,
|
|
VersionStorageInfo* vstorage,
|
|
CompactionPicker* compaction_picker,
|
|
LogBuffer* log_buffer,
|
|
const MutableCFOptions& mutable_cf_options,
|
|
const ImmutableOptions& ioptions,
|
|
const MutableDBOptions& mutable_db_options)
|
|
: cf_name_(cf_name),
|
|
vstorage_(vstorage),
|
|
compaction_picker_(compaction_picker),
|
|
log_buffer_(log_buffer),
|
|
mutable_cf_options_(mutable_cf_options),
|
|
ioptions_(ioptions),
|
|
mutable_db_options_(mutable_db_options) {}
|
|
|
|
// Pick and return a compaction.
|
|
Compaction* PickCompaction();
|
|
|
|
// Pick the initial files to compact to the next level. (or together
|
|
// in Intra-L0 compactions)
|
|
void SetupInitialFiles();
|
|
|
|
// If the initial files are from L0 level, pick other L0
|
|
// files if needed.
|
|
bool SetupOtherL0FilesIfNeeded();
|
|
|
|
// Compaction with round-robin compaction priority allows more files to be
|
|
// picked to form a large compaction
|
|
void SetupOtherFilesWithRoundRobinExpansion();
|
|
// Based on initial files, setup other files need to be compacted
|
|
// in this compaction, accordingly.
|
|
bool SetupOtherInputsIfNeeded();
|
|
|
|
Compaction* GetCompaction();
|
|
|
|
// From `start_level_`, pick files to compact to `output_level_`.
|
|
// Returns false if there is no file to compact.
|
|
// If it returns true, inputs->files.size() will be exactly one for
|
|
// all compaction priorities except round-robin. For round-robin,
|
|
// multiple consecutive files may be put into inputs->files.
|
|
// If level is 0 and there is already a compaction on that level, this
|
|
// function will return false.
|
|
bool PickFileToCompact();
|
|
|
|
// Return true if a L0 trivial move is picked up.
|
|
bool TryPickL0TrivialMove();
|
|
|
|
// For L0->L0, picks the longest span of files that aren't currently
|
|
// undergoing compaction for which work-per-deleted-file decreases. The span
|
|
// always starts from the newest L0 file.
|
|
//
|
|
// Intra-L0 compaction is independent of all other files, so it can be
|
|
// performed even when L0->base_level compactions are blocked.
|
|
//
|
|
// Returns true if `inputs` is populated with a span of files to be compacted;
|
|
// otherwise, returns false.
|
|
bool PickIntraL0Compaction();
|
|
|
|
// When total L0 size is small compared to Lbase, try to pick intra-L0
|
|
// compaction starting from the newest L0 file. This helps to prevent
|
|
// L0->Lbase compaction with large write-amp.
|
|
//
|
|
// Returns true iff an intra-L0 compaction is picked.
|
|
// `start_level_inputs_` and `output_level_` will be updated accordingly if
|
|
// a compaction is picked.
|
|
bool PickSizeBasedIntraL0Compaction();
|
|
|
|
// Return true if TrivialMove is extended. `start_index` is the index of
|
|
// the initial file picked, which should already be in `start_level_inputs_`.
|
|
bool TryExtendNonL0TrivialMove(int start_index,
|
|
bool only_expand_right = false);
|
|
|
|
// Picks a file from level_files to compact.
|
|
// level_files is a vector of (level, file metadata) in ascending order of
|
|
// level. If compact_to_next_level is true, compact the file to the next
|
|
// level, otherwise, compact to the same level as the input file.
|
|
// If skip_last_level is true, skip the last level.
|
|
void PickFileToCompact(
|
|
const autovector<std::pair<int, FileMetaData*>>& level_files,
|
|
CompactToNextLevel compact_to_next_level);
|
|
|
|
const std::string& cf_name_;
|
|
VersionStorageInfo* vstorage_;
|
|
CompactionPicker* compaction_picker_;
|
|
LogBuffer* log_buffer_;
|
|
int start_level_ = -1;
|
|
int output_level_ = -1;
|
|
int parent_index_ = -1;
|
|
int base_index_ = -1;
|
|
double start_level_score_ = 0;
|
|
bool is_manual_ = false;
|
|
bool is_l0_trivial_move_ = false;
|
|
CompactionInputFiles start_level_inputs_;
|
|
std::vector<CompactionInputFiles> compaction_inputs_;
|
|
CompactionInputFiles output_level_inputs_;
|
|
std::vector<FileMetaData*> grandparents_;
|
|
CompactionReason compaction_reason_ = CompactionReason::kUnknown;
|
|
|
|
const MutableCFOptions& mutable_cf_options_;
|
|
const ImmutableOptions& ioptions_;
|
|
const MutableDBOptions& mutable_db_options_;
|
|
// Pick a path ID to place a newly generated file, with its level
|
|
static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
|
|
const MutableCFOptions& mutable_cf_options,
|
|
int level);
|
|
|
|
static const int kMinFilesForIntraL0Compaction = 4;
|
|
};
|
|
|
|
void LevelCompactionBuilder::PickFileToCompact(
|
|
const autovector<std::pair<int, FileMetaData*>>& level_files,
|
|
CompactToNextLevel compact_to_next_level) {
|
|
for (auto& level_file : level_files) {
|
|
// If it's being compacted it has nothing to do here.
|
|
// If this assert() fails that means that some function marked some
|
|
// files as being_compacted, but didn't call ComputeCompactionScore()
|
|
assert(!level_file.second->being_compacted);
|
|
start_level_ = level_file.first;
|
|
if ((compact_to_next_level == CompactToNextLevel::kSkipLastLevel &&
|
|
start_level_ == vstorage_->num_non_empty_levels() - 1) ||
|
|
(start_level_ == 0 &&
|
|
!compaction_picker_->level0_compactions_in_progress()->empty())) {
|
|
continue;
|
|
}
|
|
|
|
// Compact to the next level only if the file is not in the last level and
|
|
// compact_to_next_level is kYes or kSkipLastLevel.
|
|
if (compact_to_next_level != CompactToNextLevel::kNo &&
|
|
(start_level_ < vstorage_->num_non_empty_levels() - 1)) {
|
|
output_level_ =
|
|
(start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
|
|
} else {
|
|
output_level_ = start_level_;
|
|
}
|
|
start_level_inputs_.files = {level_file.second};
|
|
start_level_inputs_.level = start_level_;
|
|
if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
|
|
&start_level_inputs_)) {
|
|
return;
|
|
}
|
|
}
|
|
start_level_inputs_.files.clear();
|
|
}
|
|
|
|
void LevelCompactionBuilder::SetupInitialFiles() {
|
|
// Find the compactions by size on all levels.
|
|
bool skipped_l0_to_base = false;
|
|
for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
|
|
start_level_score_ = vstorage_->CompactionScore(i);
|
|
start_level_ = vstorage_->CompactionScoreLevel(i);
|
|
assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
|
|
if (start_level_score_ >= 1) {
|
|
if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
|
|
// If L0->base_level compaction is pending, don't schedule further
|
|
// compaction from base level. Otherwise L0->base_level compaction
|
|
// may starve.
|
|
continue;
|
|
}
|
|
output_level_ =
|
|
(start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
|
|
bool picked_file_to_compact = PickFileToCompact();
|
|
TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact",
|
|
&picked_file_to_compact);
|
|
if (picked_file_to_compact) {
|
|
// found the compaction!
|
|
if (start_level_ == 0) {
|
|
// L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
|
|
compaction_reason_ = CompactionReason::kLevelL0FilesNum;
|
|
} else {
|
|
// L1+ score = `Level files size` / `MaxBytesForLevel`
|
|
compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
|
|
}
|
|
break;
|
|
} else {
|
|
// didn't find the compaction, clear the inputs
|
|
start_level_inputs_.clear();
|
|
if (start_level_ == 0) {
|
|
skipped_l0_to_base = true;
|
|
// L0->base_level may be blocked due to ongoing L0->base_level
|
|
// compactions. It may also be blocked by an ongoing compaction from
|
|
// base_level downwards.
|
|
//
|
|
// In these cases, to reduce L0 file count and thus reduce likelihood
|
|
// of write stalls, we can attempt compacting a span of files within
|
|
// L0.
|
|
if (PickIntraL0Compaction()) {
|
|
output_level_ = 0;
|
|
compaction_reason_ = CompactionReason::kLevelL0FilesNum;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Compaction scores are sorted in descending order, no further scores
|
|
// will be >= 1.
|
|
break;
|
|
}
|
|
}
|
|
if (!start_level_inputs_.empty()) {
|
|
return;
|
|
}
|
|
|
|
// if we didn't find a compaction, check if there are any files marked for
|
|
// compaction
|
|
parent_index_ = base_index_ = -1;
|
|
|
|
compaction_picker_->PickFilesMarkedForCompaction(
|
|
cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
|
|
if (!start_level_inputs_.empty()) {
|
|
compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
|
|
return;
|
|
}
|
|
|
|
// Bottommost Files Compaction on deleting tombstones
|
|
PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(),
|
|
CompactToNextLevel::kNo);
|
|
if (!start_level_inputs_.empty()) {
|
|
compaction_reason_ = CompactionReason::kBottommostFiles;
|
|
return;
|
|
}
|
|
|
|
// TTL Compaction
|
|
if (ioptions_.compaction_pri == kRoundRobin &&
|
|
!vstorage_->ExpiredTtlFiles().empty()) {
|
|
auto expired_files = vstorage_->ExpiredTtlFiles();
|
|
// the expired files list should already be sorted by level
|
|
start_level_ = expired_files.front().first;
|
|
#ifndef NDEBUG
|
|
for (const auto& file : expired_files) {
|
|
assert(start_level_ <= file.first);
|
|
}
|
|
#endif
|
|
if (start_level_ > 0) {
|
|
output_level_ = start_level_ + 1;
|
|
if (PickFileToCompact()) {
|
|
compaction_reason_ = CompactionReason::kRoundRobinTtl;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
PickFileToCompact(vstorage_->ExpiredTtlFiles(),
|
|
CompactToNextLevel::kSkipLastLevel);
|
|
if (!start_level_inputs_.empty()) {
|
|
compaction_reason_ = CompactionReason::kTtl;
|
|
return;
|
|
}
|
|
|
|
// Periodic Compaction
|
|
PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(),
|
|
ioptions_.level_compaction_dynamic_level_bytes
|
|
? CompactToNextLevel::kYes
|
|
: CompactToNextLevel::kNo);
|
|
if (!start_level_inputs_.empty()) {
|
|
compaction_reason_ = CompactionReason::kPeriodicCompaction;
|
|
return;
|
|
}
|
|
|
|
// Forced blob garbage collection
|
|
PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(),
|
|
CompactToNextLevel::kNo);
|
|
if (!start_level_inputs_.empty()) {
|
|
compaction_reason_ = CompactionReason::kForcedBlobGC;
|
|
return;
|
|
}
|
|
}
|
|
|
|
bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
|
|
if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
|
|
return compaction_picker_->GetOverlappingL0Files(
|
|
vstorage_, &start_level_inputs_, output_level_, &parent_index_);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
|
|
// We only expand when the start level is not L0 under round robin
|
|
assert(start_level_ >= 1);
|
|
|
|
// For round-robin compaction priority, we have 3 constraints when picking
|
|
// multiple files.
|
|
// Constraint 1: We can only pick consecutive files
|
|
// -> Constraint 1a: When a file is being compacted (or some input files
|
|
// are being compacted after expanding, we cannot
|
|
// choose it and have to stop choosing more files
|
|
// -> Constraint 1b: When we reach the last file (with largest keys), we
|
|
// cannot choose more files (the next file will be the
|
|
// first one)
|
|
// Constraint 2: We should ensure the total compaction bytes (including the
|
|
// overlapped files from the next level) is no more than
|
|
// mutable_cf_options_.max_compaction_bytes
|
|
// Constraint 3: We try our best to pick as many files as possible so that
|
|
// the post-compaction level size is less than
|
|
// MaxBytesForLevel(start_level_)
|
|
// Constraint 4: We do not expand if it is possible to apply a trivial move
|
|
// Constraint 5 (TODO): Try to pick minimal files to split into the target
|
|
// number of subcompactions
|
|
TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
|
|
|
|
// Only expand the inputs when we have selected a file in start_level_inputs_
|
|
if (start_level_inputs_.size() == 0) {
|
|
return;
|
|
}
|
|
|
|
uint64_t start_lvl_bytes_no_compacting = 0;
|
|
uint64_t curr_bytes_to_compact = 0;
|
|
uint64_t start_lvl_max_bytes_to_compact = 0;
|
|
const std::vector<FileMetaData*>& level_files =
|
|
vstorage_->LevelFiles(start_level_);
|
|
// Constraint 3 (pre-calculate the ideal max bytes to compact)
|
|
for (auto f : level_files) {
|
|
if (!f->being_compacted) {
|
|
start_lvl_bytes_no_compacting += f->fd.GetFileSize();
|
|
}
|
|
}
|
|
if (start_lvl_bytes_no_compacting >
|
|
vstorage_->MaxBytesForLevel(start_level_)) {
|
|
start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
|
|
vstorage_->MaxBytesForLevel(start_level_);
|
|
}
|
|
|
|
size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
|
|
InternalKey smallest, largest;
|
|
// Constraint 4 (No need to check again later)
|
|
compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
|
|
CompactionInputFiles output_level_inputs;
|
|
output_level_inputs.level = output_level_;
|
|
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
|
|
&output_level_inputs.files);
|
|
if (output_level_inputs.empty()) {
|
|
if (TryExtendNonL0TrivialMove((int)start_index,
|
|
true /* only_expand_right */)) {
|
|
return;
|
|
}
|
|
}
|
|
// Constraint 3
|
|
if (start_level_inputs_[0]->fd.GetFileSize() >=
|
|
start_lvl_max_bytes_to_compact) {
|
|
return;
|
|
}
|
|
CompactionInputFiles tmp_start_level_inputs;
|
|
tmp_start_level_inputs = start_level_inputs_;
|
|
// TODO (zichen): Future parallel round-robin may also need to update this
|
|
// Constraint 1b (only expand till the end)
|
|
for (size_t i = start_index + 1; i < level_files.size(); i++) {
|
|
auto* f = level_files[i];
|
|
if (f->being_compacted) {
|
|
// Constraint 1a
|
|
return;
|
|
}
|
|
|
|
tmp_start_level_inputs.files.push_back(f);
|
|
if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
|
|
&tmp_start_level_inputs) ||
|
|
compaction_picker_->FilesRangeOverlapWithCompaction(
|
|
{tmp_start_level_inputs}, output_level_,
|
|
Compaction::EvaluatePenultimateLevel(
|
|
vstorage_, ioptions_, start_level_, output_level_))) {
|
|
// Constraint 1a
|
|
tmp_start_level_inputs.clear();
|
|
return;
|
|
}
|
|
|
|
curr_bytes_to_compact = 0;
|
|
for (auto start_lvl_f : tmp_start_level_inputs.files) {
|
|
curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
|
|
}
|
|
|
|
// Check whether any output level files are locked
|
|
compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
|
|
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
|
|
&output_level_inputs.files);
|
|
if (!output_level_inputs.empty() &&
|
|
!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
|
|
&output_level_inputs)) {
|
|
// Constraint 1a
|
|
tmp_start_level_inputs.clear();
|
|
return;
|
|
}
|
|
|
|
uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
|
|
for (auto output_lvl_f : output_level_inputs.files) {
|
|
curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
|
|
}
|
|
if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
|
|
// Constraint 2
|
|
tmp_start_level_inputs.clear();
|
|
return;
|
|
}
|
|
|
|
start_level_inputs_.files = tmp_start_level_inputs.files;
|
|
// Constraint 3
|
|
if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
|
|
// Setup input files from output level. For output to L0, we only compact
|
|
// spans of files that do not interact with any pending compactions, so don't
|
|
// need to consider other levels.
|
|
if (output_level_ != 0) {
|
|
output_level_inputs_.level = output_level_;
|
|
bool round_robin_expanding =
|
|
ioptions_.compaction_pri == kRoundRobin &&
|
|
compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
|
|
if (round_robin_expanding) {
|
|
SetupOtherFilesWithRoundRobinExpansion();
|
|
}
|
|
if (!is_l0_trivial_move_ &&
|
|
!compaction_picker_->SetupOtherInputs(
|
|
cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
|
|
&output_level_inputs_, &parent_index_, base_index_,
|
|
round_robin_expanding)) {
|
|
return false;
|
|
}
|
|
|
|
compaction_inputs_.push_back(start_level_inputs_);
|
|
if (!output_level_inputs_.empty()) {
|
|
compaction_inputs_.push_back(output_level_inputs_);
|
|
}
|
|
|
|
// In some edge cases we could pick a compaction that will be compacting
|
|
// a key range that overlap with another running compaction, and both
|
|
// of them have the same output level. This could happen if
|
|
// (1) we are running a non-exclusive manual compaction
|
|
// (2) AddFile ingest a new file into the LSM tree
|
|
// We need to disallow this from happening.
|
|
if (compaction_picker_->FilesRangeOverlapWithCompaction(
|
|
compaction_inputs_, output_level_,
|
|
Compaction::EvaluatePenultimateLevel(
|
|
vstorage_, ioptions_, start_level_, output_level_))) {
|
|
// This compaction output could potentially conflict with the output
|
|
// of a currently running compaction, we cannot run it.
|
|
return false;
|
|
}
|
|
if (!is_l0_trivial_move_) {
|
|
compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
|
|
output_level_inputs_, &grandparents_);
|
|
}
|
|
} else {
|
|
compaction_inputs_.push_back(start_level_inputs_);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Compaction* LevelCompactionBuilder::PickCompaction() {
|
|
// Pick up the first file to start compaction. It may have been extended
|
|
// to a clean cut.
|
|
SetupInitialFiles();
|
|
if (start_level_inputs_.empty()) {
|
|
return nullptr;
|
|
}
|
|
assert(start_level_ >= 0 && output_level_ >= 0);
|
|
|
|
// If it is a L0 -> base level compaction, we need to set up other L0
|
|
// files if needed.
|
|
if (!SetupOtherL0FilesIfNeeded()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Pick files in the output level and expand more files in the start level
|
|
// if needed.
|
|
if (!SetupOtherInputsIfNeeded()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Form a compaction object containing the files we picked.
|
|
Compaction* c = GetCompaction();
|
|
|
|
TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
|
|
|
|
return c;
|
|
}
|
|
|
|
Compaction* LevelCompactionBuilder::GetCompaction() {
|
|
// TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
|
|
// empty output level. So L0 files is picked in PickFileToCompact() by
|
|
// compaction score. We may still be able to do trivial move when this file
|
|
// does not overlap with other L0s. This happens when
|
|
// compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
|
|
// pull in more L0s.
|
|
assert(!compaction_inputs_.empty());
|
|
bool l0_files_might_overlap =
|
|
start_level_ == 0 && !is_l0_trivial_move_ &&
|
|
(compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
|
|
auto c = new Compaction(
|
|
vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
|
|
std::move(compaction_inputs_), output_level_,
|
|
MaxFileSizeForLevel(mutable_cf_options_, output_level_,
|
|
ioptions_.compaction_style, vstorage_->base_level(),
|
|
ioptions_.level_compaction_dynamic_level_bytes),
|
|
mutable_cf_options_.max_compaction_bytes,
|
|
GetPathId(ioptions_, mutable_cf_options_, output_level_),
|
|
GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
|
|
vstorage_->base_level()),
|
|
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
|
|
mutable_cf_options_.default_write_temperature,
|
|
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
|
|
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
|
|
l0_files_might_overlap, compaction_reason_);
|
|
|
|
// If it's level 0 compaction, make sure we don't execute any other level 0
|
|
// compactions in parallel
|
|
compaction_picker_->RegisterCompaction(c);
|
|
|
|
// Creating a compaction influences the compaction score because the score
|
|
// takes running compactions into account (by skipping files that are already
|
|
// being compacted). Since we just changed compaction score, we recalculate it
|
|
// here
|
|
vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
|
|
return c;
|
|
}
|
|
|
|
/*
|
|
* Find the optimal path to place a file
|
|
* Given a level, finds the path where levels up to it will fit in levels
|
|
* up to and including this path
|
|
*/
|
|
uint32_t LevelCompactionBuilder::GetPathId(
|
|
const ImmutableCFOptions& ioptions,
|
|
const MutableCFOptions& mutable_cf_options, int level) {
|
|
uint32_t p = 0;
|
|
assert(!ioptions.cf_paths.empty());
|
|
|
|
// size remaining in the most recent path
|
|
uint64_t current_path_size = ioptions.cf_paths[0].target_size;
|
|
|
|
uint64_t level_size;
|
|
int cur_level = 0;
|
|
|
|
// max_bytes_for_level_base denotes L1 size.
|
|
// We estimate L0 size to be the same as L1.
|
|
level_size = mutable_cf_options.max_bytes_for_level_base;
|
|
|
|
// Last path is the fallback
|
|
while (p < ioptions.cf_paths.size() - 1) {
|
|
if (level_size <= current_path_size) {
|
|
if (cur_level == level) {
|
|
// Does desired level fit in this path?
|
|
return p;
|
|
} else {
|
|
current_path_size -= level_size;
|
|
if (cur_level > 0) {
|
|
if (ioptions.level_compaction_dynamic_level_bytes) {
|
|
// Currently, level_compaction_dynamic_level_bytes is ignored when
|
|
// multiple db paths are specified. https://github.com/facebook/
|
|
// rocksdb/blob/main/db/column_family.cc.
|
|
// Still, adding this check to avoid accidentally using
|
|
// max_bytes_for_level_multiplier_additional
|
|
level_size = static_cast<uint64_t>(
|
|
level_size * mutable_cf_options.max_bytes_for_level_multiplier);
|
|
} else {
|
|
level_size = static_cast<uint64_t>(
|
|
level_size * mutable_cf_options.max_bytes_for_level_multiplier *
|
|
mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
|
|
}
|
|
}
|
|
cur_level++;
|
|
continue;
|
|
}
|
|
}
|
|
p++;
|
|
current_path_size = ioptions.cf_paths[p].target_size;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
bool LevelCompactionBuilder::TryPickL0TrivialMove() {
|
|
if (vstorage_->base_level() <= 0) {
|
|
return false;
|
|
}
|
|
if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
|
|
!vstorage_->LevelFiles(output_level_).empty() &&
|
|
ioptions_.db_paths.size() <= 1) {
|
|
// Try to pick trivial move from L0 to L1. We start from the oldest
|
|
// file. We keep expanding to newer files if it would form a
|
|
// trivial move.
|
|
// For now we don't support it with
|
|
// mutable_cf_options_.compression_per_level to prevent the logic
|
|
// of determining whether L0 can be trivial moved to the next level.
|
|
// We skip the case where output level is empty, since in this case, at
|
|
// least the oldest file would qualify for trivial move, and this would
|
|
// be a surprising behavior with few benefits.
|
|
|
|
// We search from the oldest file from the newest. In theory, there are
|
|
// files in the middle can form trivial move too, but it is probably
|
|
// uncommon and we ignore these cases for simplicity.
|
|
const std::vector<FileMetaData*>& level_files =
|
|
vstorage_->LevelFiles(start_level_);
|
|
|
|
InternalKey my_smallest, my_largest;
|
|
for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
|
|
CompactionInputFiles output_level_inputs;
|
|
output_level_inputs.level = output_level_;
|
|
FileMetaData* file = *it;
|
|
if (it == level_files.rbegin()) {
|
|
my_smallest = file->smallest;
|
|
my_largest = file->largest;
|
|
} else {
|
|
if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
|
|
0) {
|
|
my_smallest = file->smallest;
|
|
} else if (compaction_picker_->icmp()->Compare(file->smallest,
|
|
my_largest) > 0) {
|
|
my_largest = file->largest;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
|
|
&output_level_inputs.files);
|
|
if (output_level_inputs.empty()) {
|
|
assert(!file->being_compacted);
|
|
start_level_inputs_.files.push_back(file);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!start_level_inputs_.empty()) {
|
|
// Sort files by key range. Not sure it's 100% necessary but it's cleaner
|
|
// to always keep files sorted by key the key ranges don't overlap.
|
|
std::sort(start_level_inputs_.files.begin(),
|
|
start_level_inputs_.files.end(),
|
|
[icmp = compaction_picker_->icmp()](FileMetaData* f1,
|
|
FileMetaData* f2) -> bool {
|
|
return (icmp->Compare(f1->smallest, f2->smallest) < 0);
|
|
});
|
|
|
|
is_l0_trivial_move_ = true;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
|
|
bool only_expand_right) {
|
|
if (start_level_inputs_.size() == 1 &&
|
|
(ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
|
|
(mutable_cf_options_.compression_per_level.empty())) {
|
|
// Only file of `index`, and it is likely a trivial move. Try to
|
|
// expand if it is still a trivial move, but not beyond
|
|
// max_compaction_bytes or 4 files, so that we don't create too
|
|
// much compaction pressure for the next level.
|
|
// Ignore if there are more than one DB path, as it would be hard
|
|
// to predict whether it is a trivial move.
|
|
const std::vector<FileMetaData*>& level_files =
|
|
vstorage_->LevelFiles(start_level_);
|
|
const size_t kMaxMultiTrivialMove = 4;
|
|
FileMetaData* initial_file = start_level_inputs_.files[0];
|
|
size_t total_size = initial_file->fd.GetFileSize();
|
|
CompactionInputFiles output_level_inputs;
|
|
output_level_inputs.level = output_level_;
|
|
// Expand towards right
|
|
for (int i = start_index + 1;
|
|
i < static_cast<int>(level_files.size()) &&
|
|
start_level_inputs_.size() < kMaxMultiTrivialMove;
|
|
i++) {
|
|
FileMetaData* next_file = level_files[i];
|
|
if (next_file->being_compacted) {
|
|
break;
|
|
}
|
|
vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
|
|
&(next_file->largest),
|
|
&output_level_inputs.files);
|
|
if (!output_level_inputs.empty()) {
|
|
break;
|
|
}
|
|
if (i < static_cast<int>(level_files.size()) - 1 &&
|
|
compaction_picker_->icmp()
|
|
->user_comparator()
|
|
->CompareWithoutTimestamp(
|
|
next_file->largest.user_key(),
|
|
level_files[i + 1]->smallest.user_key()) == 0) {
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
"LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
|
|
nullptr);
|
|
// Not a clean up after adding the next file. Skip.
|
|
break;
|
|
}
|
|
total_size += next_file->fd.GetFileSize();
|
|
if (total_size > mutable_cf_options_.max_compaction_bytes) {
|
|
break;
|
|
}
|
|
start_level_inputs_.files.push_back(next_file);
|
|
}
|
|
// Expand towards left
|
|
if (!only_expand_right) {
|
|
for (int i = start_index - 1;
|
|
i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
|
|
FileMetaData* next_file = level_files[i];
|
|
if (next_file->being_compacted) {
|
|
break;
|
|
}
|
|
vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
|
|
&(initial_file->largest),
|
|
&output_level_inputs.files);
|
|
if (!output_level_inputs.empty()) {
|
|
break;
|
|
}
|
|
if (i > 0 && compaction_picker_->icmp()
|
|
->user_comparator()
|
|
->CompareWithoutTimestamp(
|
|
next_file->smallest.user_key(),
|
|
level_files[i - 1]->largest.user_key()) == 0) {
|
|
// Not a clean up after adding the next file. Skip.
|
|
break;
|
|
}
|
|
total_size += next_file->fd.GetFileSize();
|
|
if (total_size > mutable_cf_options_.max_compaction_bytes) {
|
|
break;
|
|
}
|
|
// keep `files` sorted in increasing order by key range
|
|
start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
|
|
next_file);
|
|
}
|
|
}
|
|
return start_level_inputs_.size() > 1;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool LevelCompactionBuilder::PickFileToCompact() {
|
|
// level 0 files are overlapping. So we cannot pick more
|
|
// than one concurrent compactions at this level. This
|
|
// could be made better by looking at key-ranges that are
|
|
// being compacted at level 0.
|
|
if (start_level_ == 0 &&
|
|
!compaction_picker_->level0_compactions_in_progress()->empty()) {
|
|
if (PickSizeBasedIntraL0Compaction()) {
|
|
return true;
|
|
}
|
|
TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
|
|
return false;
|
|
}
|
|
|
|
start_level_inputs_.clear();
|
|
start_level_inputs_.level = start_level_;
|
|
|
|
assert(start_level_ >= 0);
|
|
|
|
if (TryPickL0TrivialMove()) {
|
|
return true;
|
|
}
|
|
if (start_level_ == 0 && PickSizeBasedIntraL0Compaction()) {
|
|
return true;
|
|
}
|
|
|
|
const std::vector<FileMetaData*>& level_files =
|
|
vstorage_->LevelFiles(start_level_);
|
|
|
|
// Pick the file with the highest score in this level that is not already
|
|
// being compacted.
|
|
const std::vector<int>& file_scores =
|
|
vstorage_->FilesByCompactionPri(start_level_);
|
|
|
|
unsigned int cmp_idx;
|
|
for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
|
|
cmp_idx < file_scores.size(); cmp_idx++) {
|
|
int index = file_scores[cmp_idx];
|
|
auto* f = level_files[index];
|
|
|
|
// do not pick a file to compact if it is being compacted
|
|
// from n-1 level.
|
|
if (f->being_compacted) {
|
|
if (ioptions_.compaction_pri == kRoundRobin) {
|
|
// TODO(zichen): this file may be involved in one compaction from
|
|
// an upper level, cannot advance the cursor for round-robin policy.
|
|
// Currently, we do not pick any file to compact in this case. We
|
|
// should fix this later to ensure a compaction is picked but the
|
|
// cursor shall not be advanced.
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
start_level_inputs_.files.push_back(f);
|
|
if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
|
|
&start_level_inputs_) ||
|
|
compaction_picker_->FilesRangeOverlapWithCompaction(
|
|
{start_level_inputs_}, output_level_,
|
|
Compaction::EvaluatePenultimateLevel(
|
|
vstorage_, ioptions_, start_level_, output_level_))) {
|
|
// A locked (pending compaction) input-level file was pulled in due to
|
|
// user-key overlap.
|
|
start_level_inputs_.clear();
|
|
|
|
if (ioptions_.compaction_pri == kRoundRobin) {
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Now that input level is fully expanded, we check whether any output
|
|
// files are locked due to pending compaction.
|
|
//
|
|
// Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
|
|
// level files are locked, not just the extra ones pulled in for user-key
|
|
// overlap.
|
|
InternalKey smallest, largest;
|
|
compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
|
|
CompactionInputFiles output_level_inputs;
|
|
output_level_inputs.level = output_level_;
|
|
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
|
|
&output_level_inputs.files);
|
|
if (output_level_inputs.empty()) {
|
|
if (start_level_ > 0 &&
|
|
TryExtendNonL0TrivialMove(index,
|
|
ioptions_.compaction_pri ==
|
|
kRoundRobin /* only_expand_right */)) {
|
|
break;
|
|
}
|
|
} else {
|
|
if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
|
|
&output_level_inputs)) {
|
|
start_level_inputs_.clear();
|
|
if (ioptions_.compaction_pri == kRoundRobin) {
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
base_index_ = index;
|
|
break;
|
|
}
|
|
|
|
// store where to start the iteration in the next call to PickCompaction
|
|
if (ioptions_.compaction_pri != kRoundRobin) {
|
|
vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
|
|
}
|
|
return start_level_inputs_.size() > 0;
|
|
}
|
|
|
|
bool LevelCompactionBuilder::PickIntraL0Compaction() {
|
|
start_level_inputs_.clear();
|
|
const std::vector<FileMetaData*>& level_files =
|
|
vstorage_->LevelFiles(0 /* level */);
|
|
if (level_files.size() <
|
|
static_cast<size_t>(
|
|
mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
|
|
level_files[0]->being_compacted) {
|
|
// If L0 isn't accumulating much files beyond the regular trigger, don't
|
|
// resort to L0->L0 compaction yet.
|
|
return false;
|
|
}
|
|
return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
|
|
std::numeric_limits<uint64_t>::max(),
|
|
mutable_cf_options_.max_compaction_bytes,
|
|
&start_level_inputs_);
|
|
}
|
|
|
|
bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
|
|
assert(start_level_ == 0);
|
|
int base_level = vstorage_->base_level();
|
|
if (base_level <= 0) {
|
|
return false;
|
|
}
|
|
const std::vector<FileMetaData*>& l0_files =
|
|
vstorage_->LevelFiles(/*level=*/0);
|
|
size_t min_num_file =
|
|
std::max(2, mutable_cf_options_.level0_file_num_compaction_trigger);
|
|
if (l0_files.size() < min_num_file) {
|
|
return false;
|
|
}
|
|
uint64_t l0_size = 0;
|
|
for (const auto& file : l0_files) {
|
|
assert(file->compensated_file_size >= file->fd.GetFileSize());
|
|
// Compact down L0s with more deletions.
|
|
l0_size += file->compensated_file_size;
|
|
}
|
|
|
|
// Avoid L0->Lbase compactions that are inefficient for write-amp.
|
|
const double kMultiplier =
|
|
std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
|
|
const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
|
|
assert(min_lbase_size >= l0_size);
|
|
const std::vector<FileMetaData*>& lbase_files =
|
|
vstorage_->LevelFiles(/*level=*/base_level);
|
|
uint64_t lbase_size = 0;
|
|
for (const auto& file : lbase_files) {
|
|
lbase_size += file->fd.GetFileSize();
|
|
if (lbase_size > min_lbase_size) {
|
|
break;
|
|
}
|
|
}
|
|
if (lbase_size <= min_lbase_size) {
|
|
return false;
|
|
}
|
|
|
|
start_level_inputs_.clear();
|
|
start_level_inputs_.level = 0;
|
|
for (const auto& file : l0_files) {
|
|
if (file->being_compacted) {
|
|
break;
|
|
}
|
|
start_level_inputs_.files.push_back(file);
|
|
}
|
|
if (start_level_inputs_.files.size() < min_num_file) {
|
|
start_level_inputs_.clear();
|
|
return false;
|
|
}
|
|
output_level_ = 0;
|
|
return true /* picked an intra-L0 compaction */;
|
|
}
|
|
} // namespace
|
|
|
|
Compaction* LevelCompactionPicker::PickCompaction(
|
|
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
|
|
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
|
|
LogBuffer* log_buffer) {
|
|
LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
|
|
mutable_cf_options, ioptions_,
|
|
mutable_db_options);
|
|
return builder.PickCompaction();
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|