Mirror of https://github.com/facebook/rocksdb.git (synced 2024-11-25 22:44:05 +00:00)
229297d1b8
Summary: A second attempt after https://github.com/facebook/rocksdb/issues/10802, with bug fixes and refactoring. This PR updates the compaction logic to take range tombstones into account when determining whether to cut the current compaction output file (https://github.com/facebook/rocksdb/issues/4811). Before this change, only point keys were considered, and range tombstones could cause large compactions. For example, if the current compaction output is a range tombstone [a, b) and 2 point keys y, z, they would be added to the same file, which may overlap with too many files in the next level and cause a large compaction in the future.

This PR also includes ajkr's effort to simplify the logic for adding range tombstones to compaction output files in `AddRangeDels()` (https://github.com/facebook/rocksdb/pull/11078#issuecomment-1386078861). The main change is for `CompactionIterator` to emit range tombstone start keys to be processed by `CompactionOutputs`. A new class `CompactionMergingIterator` is introduced to replace `MergingIterator` under `CompactionIterator` to enable emitting of range tombstone start keys.

Further improvements after this PR include cutting compaction output at a grandparent boundary key (instead of the next output key) when cutting within a range tombstone, to reduce overlap with grandparents.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11113

Test Plan:
* added unit test in db_range_del_test
* crash test with a small key range: `python3 tools/db_crashtest.py blackbox --simple --max_key=100 --interval=600 --write_buffer_size=262144 --target_file_size_base=256 --max_bytes_for_level_base=262144 --block_size=128 --value_size_mult=33 --subcompactions=10 --use_multiget=1 --delpercent=3 --delrangepercent=2 --verify_iterator_with_expected_state_one_in=2 --num_iterations=10`

Reviewed By: ajkr

Differential Revision: D42655709

Pulled By: cbi42

fbshipit-source-id: 8367e36ef5640e8f21c14a3855d4a8d6e360a34c
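For orientation, here is a minimal sketch (not part of the PR) of the workload pattern described above: one wide range tombstone plus a couple of point keys, written through the public API. The database path and key/value names are illustrative assumptions only.

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  // Illustrative path, not taken from the PR.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/range_del_cutting_example", &db);
  assert(s.ok());

  // One wide range tombstone [a, b) ...
  s = db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(), "a",
                      "b");
  assert(s.ok());

  // ... plus two point keys y and z. Before this change, a compaction output
  // file holding all three entries could overlap many files in the next level
  // and later trigger a large compaction.
  s = db->Put(rocksdb::WriteOptions(), "y", "value_y");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "z", "value_z");
  assert(s.ok());

  // Flush so the entries reach an SST file and become compaction input.
  s = db->Flush(rocksdb::FlushOptions());
  assert(s.ok());

  delete db;
  return 0;
}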
221 lines | 8.9 KiB | C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <optional>

#include "db/blob/blob_file_addition.h"
#include "db/blob/blob_garbage_meter.h"
#include "db/compaction/compaction.h"
#include "db/compaction/compaction_iterator.h"
#include "db/compaction/compaction_outputs.h"
#include "db/internal_stats.h"
#include "db/output_validator.h"
#include "db/range_del_aggregator.h"

namespace ROCKSDB_NAMESPACE {

// Maintains state and outputs for each sub-compaction.
// It contains two `CompactionOutputs`:
// 1. one for the normal output files
// 2. another for the penultimate level outputs
// A `current` pointer tracks the active output group. When `AddToOutput()` is
// called, it checks the current compaction_iterator key and points `current`
// to the target output group. By default it points to the normal
// compaction_outputs; if the compaction_iterator key should be placed on the
// penultimate level, `current` is changed to point to
// `penultimate_level_outputs`.
// Later operations use `Current()` to get the target group.
//
//  +----------+          +-----------------------------+      +---------+
//  | *current |--------> | compaction_outputs          |----->| output  |
//  +----------+          +-----------------------------+      +---------+
//       |                                                     | output  |
//       |                                                     +---------+
//       |                                                     |  ...    |
//       |
//       |                +-----------------------------+      +---------+
//       +--------------->| penultimate_level_outputs   |----->| output  |
//                        +-----------------------------+      +---------+
//                                                             |  ...    |

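// A rough sketch of the typical call sequence (for orientation only; the
// authoritative flow is in db/compaction/compaction_job.cc):
//   1. CompactionJob constructs one SubcompactionState per sub-compaction
//      key range.
//   2. AssignRangeDelAggregator() hands over the range tombstone aggregator.
//   3. AddToOutput() is called for each entry emitted by the compaction
//      iterator and writes it to whichever group `Current()` points to.
//   4. CloseCompactionFiles() closes both output groups and
//      RemoveLastEmptyOutput() drops trailing empty files.
//   5. AddOutputsEdit() records the new files in a VersionEdit, and
//      AggregateCompactionStats() folds the stats into the job totals.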
class SubcompactionState {
 public:
  const Compaction* compaction;

  // The boundaries of the key-range this compaction is interested in. No two
  // sub-compactions may have overlapping key-ranges.
  // 'start' is inclusive, 'end' is exclusive, and an unset value means
  // unbounded.
  const std::optional<Slice> start, end;

  // The return status of this sub-compaction
  Status status;

  // The return IO Status of this sub-compaction
  IOStatus io_status;

  // Notify on sub-compaction completion only if the listener was notified on
  // sub-compaction begin.
  bool notify_on_subcompaction_completion = false;

  // Compaction job stats for this sub-compaction
  CompactionJobStats compaction_job_stats;

  // Sub-compaction job id, used to identify different sub-compactions within
  // the same compaction job.
  const uint32_t sub_job_id;

  Slice SmallestUserKey() const;

  Slice LargestUserKey() const;

  // Get all outputs from the sub-compaction. For per_key_placement compaction,
  // it returns both the last level outputs and the penultimate level outputs.
  OutputIterator GetOutputs() const;

  // Assign the range-del aggregator. Each range tombstone can only be assigned
  // to one output level; for per_key_placement, that is the penultimate level.
  // TODO: This does not work for the per_key_placement + user-defined
  //  timestamp + DeleteRange() combo. If user-defined timestamp is enabled,
  //  it is possible for a range tombstone to belong to the bottommost level
  //  (seqno < earliest snapshot) without being dropped (garbage collection
  //  for user-defined timestamp).
  void AssignRangeDelAggregator(
      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
    if (compaction->SupportsPerKeyPlacement()) {
      penultimate_level_outputs_.AssignRangeDelAggregator(
          std::move(range_del_agg));
    } else {
      compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
    }
  }

  void RemoveLastEmptyOutput() {
    compaction_outputs_.RemoveLastEmptyOutput();
    penultimate_level_outputs_.RemoveLastEmptyOutput();
  }

  void BuildSubcompactionJobInfo(
      SubcompactionJobInfo& subcompaction_job_info) const {
    const Compaction* c = compaction;
    const ColumnFamilyData* cfd = c->column_family_data();

    subcompaction_job_info.cf_id = cfd->GetID();
    subcompaction_job_info.cf_name = cfd->GetName();
    subcompaction_job_info.status = status;
    subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
    subcompaction_job_info.base_input_level = c->start_level();
    subcompaction_job_info.output_level = c->output_level();
    subcompaction_job_info.stats = compaction_job_stats;
  }

  SubcompactionState() = delete;
  SubcompactionState(const SubcompactionState&) = delete;
  SubcompactionState& operator=(const SubcompactionState&) = delete;

  SubcompactionState(Compaction* c, const std::optional<Slice> _start,
                     const std::optional<Slice> _end, uint32_t _sub_job_id)
      : compaction(c),
        start(_start),
        end(_end),
        sub_job_id(_sub_job_id),
        compaction_outputs_(c, /*is_penultimate_level=*/false),
        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
    assert(compaction != nullptr);
    // Set the output split key (used for the RoundRobin feature) only for the
    // normal compaction_outputs; the output-to-penultimate-level feature does
    // not support RoundRobin (and may never support it, because with
    // RoundRobin the data is mostly naturally sorted by time, so there is no
    // need for per-key placement with output_to_penultimate_level).
    compaction_outputs_.SetOutputSlitKey(start, end);
  }

  SubcompactionState(SubcompactionState&& state) noexcept
      : compaction(state.compaction),
        start(state.start),
        end(state.end),
        status(std::move(state.status)),
        io_status(std::move(state.io_status)),
        notify_on_subcompaction_completion(
            state.notify_on_subcompaction_completion),
        compaction_job_stats(std::move(state.compaction_job_stats)),
        sub_job_id(state.sub_job_id),
        compaction_outputs_(std::move(state.compaction_outputs_)),
        penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
        is_current_penultimate_level_(state.is_current_penultimate_level_),
        has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
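    // `current_outputs_` points into this object's own members, so it is
    // re-derived from the moved flag below rather than copied from `state`
    // (a copied pointer would still reference the moved-from object).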
    current_outputs_ = is_current_penultimate_level_
                           ? &penultimate_level_outputs_
                           : &compaction_outputs_;
  }

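  // True if the penultimate-level group has produced any output file, or
  // holds range tombstones destined for a penultimate-level output.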
  bool HasPenultimateLevelOutputs() const {
    return has_penultimate_level_outputs_ ||
           penultimate_level_outputs_.HasRangeDel();
  }

  bool IsCurrentPenultimateLevel() const {
    return is_current_penultimate_level_;
  }

  // Add all the new files from this compaction to version_edit
  void AddOutputsEdit(VersionEdit* out_edit) const {
    for (const auto& file : penultimate_level_outputs_.outputs_) {
      out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
    }
    for (const auto& file : compaction_outputs_.outputs_) {
      out_edit->AddFile(compaction->output_level(), file.meta);
    }
  }

  void Cleanup(Cache* cache);

  void AggregateCompactionStats(
      InternalStats::CompactionStatsFull& compaction_stats) const;

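  // The output group that entries are currently being routed to: the normal
  // compaction_outputs_ by default, or penultimate_level_outputs_ when the
  // current key is being placed on the penultimate level.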
  CompactionOutputs& Current() const {
    assert(current_outputs_);
    return *current_outputs_;
  }

  // Add compaction_iterator key/value to the `Current` output group.
  Status AddToOutput(const CompactionIterator& iter,
                     const CompactionFileOpenFunc& open_file_func,
                     const CompactionFileCloseFunc& close_file_func);

  // Close all compaction output files, both output_to_penultimate_level
  // outputs and normal outputs.
  Status CloseCompactionFiles(const Status& curr_status,
                              const CompactionFileOpenFunc& open_file_func,
                              const CompactionFileCloseFunc& close_file_func) {
    // Call FinishCompactionOutputFile() even if status is not ok: it needs to
    // close the output file.
    // CloseOutput() may open new compaction output files.
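    // is_current_penultimate_level_ is toggled around each CloseOutput() call
    // so that any file opened through open_file_func during the close is
    // attributed to the matching output group via Current().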
    is_current_penultimate_level_ = true;
    Status s = penultimate_level_outputs_.CloseOutput(
        curr_status, open_file_func, close_file_func);
    is_current_penultimate_level_ = false;
    s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
    return s;
  }

 private:
  // State kept for output being generated
  CompactionOutputs compaction_outputs_;
  CompactionOutputs penultimate_level_outputs_;
  CompactionOutputs* current_outputs_ = &compaction_outputs_;
  bool is_current_penultimate_level_ = false;
  bool has_penultimate_level_outputs_ = false;
};

}  // namespace ROCKSDB_NAMESPACE