mirror of https://github.com/facebook/rocksdb.git
Tiered Compaction: per key placement support (#9964)
Summary: Support per_key_placement for last level compaction, which will be used for tiered compaction. * compaction iterator reports which level a key should output to; * compaction get the output level information and check if it's safe to output the data to penultimate level; * all compaction output files will be installed. * extra internal compaction stats added for penultimate level. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9964 Test Plan: * Unittest * db_bench, no significate difference: https://gist.github.com/jay-zhuang/3645f8fb97ec0ab47c10704bb39fd6e4 * microbench manual compaction no significate difference: https://gist.github.com/jay-zhuang/ba679b3e89e24992615ee9eef310e6dd * run the db_stress multiple times (not covering the new feature) looks good (internal: https://fburl.com/sandcastle/9w84pp2m) Reviewed By: ajkr Differential Revision: D36249494 Pulled By: jay-zhuang fbshipit-source-id: a96da57c8031c1df83e4a7a8567b657a112b80a3
This commit is contained in:
parent
7e1b417824
commit
6ce0b2ca34
|
@ -627,7 +627,11 @@ set(SOURCES
|
|||
db/compaction/compaction_picker_fifo.cc
|
||||
db/compaction/compaction_picker_level.cc
|
||||
db/compaction/compaction_picker_universal.cc
|
||||
db/compaction/compaction_service_job.cc
|
||||
db/compaction/compaction_state.cc
|
||||
db/compaction/compaction_outputs.cc
|
||||
db/compaction/sst_partitioner.cc
|
||||
db/compaction/subcompaction_state.cc
|
||||
db/convenience.cc
|
||||
db/db_filesnapshot.cc
|
||||
db/db_impl/compacted_db_impl.cc
|
||||
|
@ -1231,6 +1235,7 @@ if(WITH_TESTS)
|
|||
db/compaction/compaction_iterator_test.cc
|
||||
db/compaction/compaction_picker_test.cc
|
||||
db/compaction/compaction_service_test.cc
|
||||
db/compaction/tiered_compaction_test.cc
|
||||
db/comparator_db_test.cc
|
||||
db/corruption_test.cc
|
||||
db/cuckoo_table_db_test.cc
|
||||
|
|
3
Makefile
3
Makefile
|
@ -1783,6 +1783,9 @@ write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unpre
|
|||
timestamped_snapshot_test: $(OBJ_DIR)/utilities/transactions/timestamped_snapshot_test.o $(TEST_LIBRARY) $(LIBRARY)
|
||||
$(AM_LINK)
|
||||
|
||||
tiered_compaction_test: $(OBJ_DIR)/db/compaction/tiered_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
|
||||
$(AM_LINK)
|
||||
|
||||
sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY)
|
||||
$(AM_LINK)
|
||||
|
||||
|
|
14
TARGETS
14
TARGETS
|
@ -38,11 +38,15 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
|
|||
"db/compaction/compaction.cc",
|
||||
"db/compaction/compaction_iterator.cc",
|
||||
"db/compaction/compaction_job.cc",
|
||||
"db/compaction/compaction_outputs.cc",
|
||||
"db/compaction/compaction_picker.cc",
|
||||
"db/compaction/compaction_picker_fifo.cc",
|
||||
"db/compaction/compaction_picker_level.cc",
|
||||
"db/compaction/compaction_picker_universal.cc",
|
||||
"db/compaction/compaction_service_job.cc",
|
||||
"db/compaction/compaction_state.cc",
|
||||
"db/compaction/sst_partitioner.cc",
|
||||
"db/compaction/subcompaction_state.cc",
|
||||
"db/convenience.cc",
|
||||
"db/db_filesnapshot.cc",
|
||||
"db/db_impl/compacted_db_impl.cc",
|
||||
|
@ -368,11 +372,15 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
|
|||
"db/compaction/compaction.cc",
|
||||
"db/compaction/compaction_iterator.cc",
|
||||
"db/compaction/compaction_job.cc",
|
||||
"db/compaction/compaction_outputs.cc",
|
||||
"db/compaction/compaction_picker.cc",
|
||||
"db/compaction/compaction_picker_fifo.cc",
|
||||
"db/compaction/compaction_picker_level.cc",
|
||||
"db/compaction/compaction_picker_universal.cc",
|
||||
"db/compaction/compaction_service_job.cc",
|
||||
"db/compaction/compaction_state.cc",
|
||||
"db/compaction/sst_partitioner.cc",
|
||||
"db/compaction/subcompaction_state.cc",
|
||||
"db/convenience.cc",
|
||||
"db/db_filesnapshot.cc",
|
||||
"db/db_impl/compacted_db_impl.cc",
|
||||
|
@ -5764,6 +5772,12 @@ cpp_unittest_wrapper(name="thread_local_test",
|
|||
extra_compiler_flags=[])
|
||||
|
||||
|
||||
cpp_unittest_wrapper(name="tiered_compaction_test",
|
||||
srcs=["db/compaction/tiered_compaction_test.cc"],
|
||||
deps=[":rocksdb_test_lib"],
|
||||
extra_compiler_flags=[])
|
||||
|
||||
|
||||
cpp_unittest_wrapper(name="timer_queue_test",
|
||||
srcs=["util/timer_queue_test.cc"],
|
||||
deps=[":rocksdb_test_lib"],
|
||||
|
|
|
@ -77,11 +77,11 @@ void Compaction::SetInputVersion(Version* _input_version) {
|
|||
void Compaction::GetBoundaryKeys(
|
||||
VersionStorageInfo* vstorage,
|
||||
const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
|
||||
Slice* largest_user_key) {
|
||||
Slice* largest_user_key, int exclude_level) {
|
||||
bool initialized = false;
|
||||
const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
if (inputs[i].files.empty()) {
|
||||
if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
|
||||
continue;
|
||||
}
|
||||
if (inputs[i].level == 0) {
|
||||
|
@ -257,7 +257,9 @@ Compaction::Compaction(
|
|||
_blob_garbage_collection_age_cutoff < 0 ||
|
||||
_blob_garbage_collection_age_cutoff > 1
|
||||
? mutable_cf_options()->blob_garbage_collection_age_cutoff
|
||||
: _blob_garbage_collection_age_cutoff) {
|
||||
: _blob_garbage_collection_age_cutoff),
|
||||
penultimate_level_(EvaluatePenultimateLevel(
|
||||
immutable_options_, start_level_, output_level_)) {
|
||||
MarkFilesBeingCompacted(true);
|
||||
if (is_manual_compaction_) {
|
||||
compaction_reason_ = CompactionReason::kManualCompaction;
|
||||
|
@ -303,6 +305,18 @@ Compaction::Compaction(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
PopulatePenultimateLevelOutputRange();
|
||||
}
|
||||
|
||||
void Compaction::PopulatePenultimateLevelOutputRange() {
|
||||
if (!SupportsPerKeyPlacement()) {
|
||||
return;
|
||||
}
|
||||
|
||||
GetBoundaryKeys(input_vstorage_, inputs_,
|
||||
&penultimate_level_smallest_user_key_,
|
||||
&penultimate_level_largest_user_key_, number_levels_ - 1);
|
||||
}
|
||||
|
||||
Compaction::~Compaction() {
|
||||
|
@ -314,6 +328,37 @@ Compaction::~Compaction() {
|
|||
}
|
||||
}
|
||||
|
||||
bool Compaction::SupportsPerKeyPlacement() const {
|
||||
return penultimate_level_ != kInvalidLevel;
|
||||
}
|
||||
|
||||
int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
|
||||
|
||||
bool Compaction::OverlapPenultimateLevelOutputRange(
|
||||
const Slice& smallest_key, const Slice& largest_key) const {
|
||||
if (!SupportsPerKeyPlacement()) {
|
||||
return false;
|
||||
}
|
||||
const Comparator* ucmp =
|
||||
input_vstorage_->InternalComparator()->user_comparator();
|
||||
|
||||
return ucmp->Compare(smallest_key, penultimate_level_largest_user_key_) <=
|
||||
0 &&
|
||||
ucmp->Compare(largest_key, penultimate_level_smallest_user_key_) >= 0;
|
||||
}
|
||||
|
||||
bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
|
||||
if (!SupportsPerKeyPlacement()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const Comparator* ucmp =
|
||||
input_vstorage_->InternalComparator()->user_comparator();
|
||||
|
||||
return ucmp->Compare(key, penultimate_level_smallest_user_key_) >= 0 &&
|
||||
ucmp->Compare(key, penultimate_level_largest_user_key_) <= 0;
|
||||
}
|
||||
|
||||
bool Compaction::InputCompressionMatchesOutput() const {
|
||||
int base_level = input_vstorage_->base_level();
|
||||
bool matches =
|
||||
|
@ -677,8 +722,36 @@ uint64_t Compaction::MinInputFileOldestAncesterTime(
|
|||
return min_oldest_ancester_time;
|
||||
}
|
||||
|
||||
int Compaction::GetInputBaseLevel() const {
|
||||
return input_vstorage_->base_level();
|
||||
int Compaction::EvaluatePenultimateLevel(
|
||||
const ImmutableOptions& immutable_options, const int start_level,
|
||||
const int output_level) {
|
||||
// TODO: currently per_key_placement feature only support level and universal
|
||||
// compaction
|
||||
if (immutable_options.compaction_style != kCompactionStyleLevel &&
|
||||
immutable_options.compaction_style != kCompactionStyleUniversal) {
|
||||
return kInvalidLevel;
|
||||
}
|
||||
if (output_level != immutable_options.num_levels - 1) {
|
||||
return kInvalidLevel;
|
||||
}
|
||||
|
||||
int penultimate_level = output_level - 1;
|
||||
assert(penultimate_level < immutable_options.num_levels);
|
||||
if (penultimate_level <= 0 || penultimate_level < start_level) {
|
||||
return kInvalidLevel;
|
||||
}
|
||||
|
||||
// TODO: will add public like `options.preclude_last_level_data_seconds` for
|
||||
// per_key_placement feature, will check that option here. Currently, only
|
||||
// set by unittest
|
||||
bool supports_per_key_placement = false;
|
||||
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
|
||||
&supports_per_key_placement);
|
||||
if (!supports_per_key_placement) {
|
||||
return kInvalidLevel;
|
||||
}
|
||||
|
||||
return penultimate_level;
|
||||
}
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -302,7 +302,25 @@ class Compaction {
|
|||
|
||||
Slice GetLargestUserKey() const { return largest_user_key_; }
|
||||
|
||||
int GetInputBaseLevel() const;
|
||||
// Return true if the compaction supports per_key_placement
|
||||
bool SupportsPerKeyPlacement() const;
|
||||
|
||||
// Get per_key_placement penultimate output level, which is `last_level - 1`
|
||||
// if per_key_placement feature is supported. Otherwise, return -1.
|
||||
int GetPenultimateLevel() const;
|
||||
|
||||
// Return true if the given range is overlap with penultimate level output
|
||||
// range.
|
||||
bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
|
||||
const Slice& largest_key) const;
|
||||
|
||||
// Return true if the key is within penultimate level output range for
|
||||
// per_key_placement feature, which is safe to place the key to the
|
||||
// penultimate level. different compaction strategy has different rules.
|
||||
// If per_key_placement is not supported, always return false.
|
||||
// TODO: currently it doesn't support moving data from the last level to the
|
||||
// penultimate level
|
||||
bool WithinPenultimateLevelOutputRange(const Slice& key) const;
|
||||
|
||||
CompactionReason compaction_reason() const { return compaction_reason_; }
|
||||
|
||||
|
@ -339,6 +357,15 @@ class Compaction {
|
|||
return notify_on_compaction_completion_;
|
||||
}
|
||||
|
||||
static constexpr int kInvalidLevel = -1;
|
||||
// Evaluate penultimate output level. If the compaction supports
|
||||
// per_key_placement feature, it returns the penultimate level number.
|
||||
// Otherwise, it's set to kInvalidLevel (-1), which means
|
||||
// output_to_penultimate_level is not supported.
|
||||
static int EvaluatePenultimateLevel(const ImmutableOptions& immutable_options,
|
||||
const int start_level,
|
||||
const int output_level);
|
||||
|
||||
private:
|
||||
// mark (or clear) all files that are being compacted
|
||||
void MarkFilesBeingCompacted(bool mark_as_compacted);
|
||||
|
@ -346,7 +373,18 @@ class Compaction {
|
|||
// get the smallest and largest key present in files to be compacted
|
||||
static void GetBoundaryKeys(VersionStorageInfo* vstorage,
|
||||
const std::vector<CompactionInputFiles>& inputs,
|
||||
Slice* smallest_key, Slice* largest_key);
|
||||
Slice* smallest_key, Slice* largest_key,
|
||||
int exclude_level = -1);
|
||||
|
||||
// populate penultimate level output range, which will be used to determine if
|
||||
// a key is safe to output to the penultimate level (details see
|
||||
// `Compaction::WithinPenultimateLevelOutputRange()`.
|
||||
// TODO: Currently the penultimate level output range is the min/max keys of
|
||||
// non-last-level input files. Which is only good if there's no key moved
|
||||
// from the last level to the penultimate level. For a more complicated per
|
||||
// key placement which may move data from the last level to the penultimate
|
||||
// level, it needs extra check.
|
||||
void PopulatePenultimateLevelOutputRange();
|
||||
|
||||
// Get the atomic file boundaries for all files in the compaction. Necessary
|
||||
// in order to avoid the scenario described in
|
||||
|
@ -444,8 +482,36 @@ class Compaction {
|
|||
|
||||
// Blob garbage collection age cutoff.
|
||||
double blob_garbage_collection_age_cutoff_;
|
||||
|
||||
// only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
|
||||
// means not supported.
|
||||
const int penultimate_level_;
|
||||
|
||||
// Key range for penultimate level output
|
||||
Slice penultimate_level_smallest_user_key_;
|
||||
Slice penultimate_level_largest_user_key_;
|
||||
};
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Helper struct only for tests, which contains the data to decide if a key
|
||||
// should be output to the penultimate level.
|
||||
// TODO: remove this when the public feature knob is available
|
||||
struct PerKeyPlacementContext {
|
||||
const int level;
|
||||
const Slice key;
|
||||
const Slice value;
|
||||
const SequenceNumber seq_num;
|
||||
|
||||
bool output_to_penultimate_level;
|
||||
|
||||
PerKeyPlacementContext(int _level, Slice _key, Slice _value,
|
||||
SequenceNumber _seq_num)
|
||||
: level(_level), key(_key), value(_value), seq_num(_seq_num) {
|
||||
output_to_penultimate_level = false;
|
||||
}
|
||||
};
|
||||
#endif /* !NDEBUG */
|
||||
|
||||
// Return sum of sizes of all files in `files`.
|
||||
extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
|
||||
|
||||
|
|
|
@ -1075,6 +1075,52 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
|
|||
}
|
||||
}
|
||||
|
||||
void CompactionIterator::DecideOutputLevel() {
|
||||
#ifndef NDEBUG
|
||||
// TODO: will be set by sequence number or key range, for now, it will only be
|
||||
// set by unittest
|
||||
PerKeyPlacementContext context(level_, ikey_.user_key, value_,
|
||||
ikey_.sequence);
|
||||
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
|
||||
&context);
|
||||
output_to_penultimate_level_ = context.output_to_penultimate_level;
|
||||
#endif /* !NDEBUG */
|
||||
|
||||
// if the key is within the earliest snapshot, it has to output to the
|
||||
// penultimate level.
|
||||
if (ikey_.sequence > earliest_snapshot_) {
|
||||
output_to_penultimate_level_ = true;
|
||||
}
|
||||
|
||||
if (output_to_penultimate_level_) {
|
||||
// If it's decided to output to the penultimate level, but unsafe to do so,
|
||||
// still output to the last level. For example, moving the data from a lower
|
||||
// level to a higher level outside of the higher-level input key range is
|
||||
// considered unsafe, because the key may conflict with higher-level SSTs
|
||||
// not from this compaction.
|
||||
// TODO: add statistic for declined output_to_penultimate_level
|
||||
bool safe_to_penultimate_level =
|
||||
compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
|
||||
if (!safe_to_penultimate_level) {
|
||||
output_to_penultimate_level_ = false;
|
||||
// It could happen when disable/enable `bottommost_temperature` while
|
||||
// holding a snapshot. When `bottommost_temperature` is not set
|
||||
// (==kUnknown), the data newer than any snapshot is pushed to the last
|
||||
// level, but when the per_key_placement feature is enabled on the fly,
|
||||
// the data later than the snapshot has to be moved to the penultimate
|
||||
// level, which may or may not be safe. So the user needs to make sure all
|
||||
// snapshot is released before enabling `bottommost_temperature` feature
|
||||
// We will migrate the feature to `last_level_temperature` and maybe make
|
||||
// it not dynamically changeable.
|
||||
if (ikey_.sequence > earliest_snapshot_) {
|
||||
status_ = Status::Corruption(
|
||||
"Unsafe to store Seq later than snapshot in the last level if "
|
||||
"per_key_placement is enabled");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CompactionIterator::PrepareOutput() {
|
||||
if (valid_) {
|
||||
if (ikey_.type == kTypeValue) {
|
||||
|
@ -1083,6 +1129,10 @@ void CompactionIterator::PrepareOutput() {
|
|||
GarbageCollectBlobIfNeeded();
|
||||
}
|
||||
|
||||
if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
|
||||
DecideOutputLevel();
|
||||
}
|
||||
|
||||
// Zeroing out the sequence number leads to better compression.
|
||||
// If this is the bottommost level (no files in lower levels)
|
||||
// and the earliest snapshot is larger than this seqno
|
||||
|
@ -1097,7 +1147,8 @@ void CompactionIterator::PrepareOutput() {
|
|||
if (valid_ && compaction_ != nullptr &&
|
||||
!compaction_->allow_ingest_behind() && bottommost_level_ &&
|
||||
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
|
||||
ikey_.type != kTypeMerge && current_key_committed_) {
|
||||
ikey_.type != kTypeMerge && current_key_committed_ &&
|
||||
!output_to_penultimate_level_) {
|
||||
if (ikey_.type == kTypeDeletion ||
|
||||
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
|
||||
ROCKS_LOG_FATAL(
|
||||
|
|
|
@ -105,6 +105,10 @@ class CompactionIterator {
|
|||
virtual bool DoesInputReferenceBlobFiles() const = 0;
|
||||
|
||||
virtual const Compaction* real_compaction() const = 0;
|
||||
|
||||
virtual bool SupportsPerKeyPlacement() const = 0;
|
||||
|
||||
virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
|
||||
};
|
||||
|
||||
class RealCompaction : public CompactionProxy {
|
||||
|
@ -163,6 +167,16 @@ class CompactionIterator {
|
|||
|
||||
const Compaction* real_compaction() const override { return compaction_; }
|
||||
|
||||
bool SupportsPerKeyPlacement() const override {
|
||||
return compaction_->SupportsPerKeyPlacement();
|
||||
}
|
||||
|
||||
// Check if key is within penultimate level output range, to see if it's
|
||||
// safe to output to the penultimate level for per_key_placement feature.
|
||||
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
|
||||
return compaction_->WithinPenultimateLevelOutputRange(key);
|
||||
}
|
||||
|
||||
private:
|
||||
const Compaction* compaction_;
|
||||
};
|
||||
|
@ -227,6 +241,12 @@ class CompactionIterator {
|
|||
const Slice& user_key() const { return current_user_key_; }
|
||||
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
|
||||
uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
|
||||
// If the current key should be placed on penultimate level, only valid if
|
||||
// per_key_placement is supported
|
||||
bool output_to_penultimate_level() const {
|
||||
return output_to_penultimate_level_;
|
||||
}
|
||||
Status InputStatus() const { return input_.status(); }
|
||||
|
||||
private:
|
||||
// Processes the input stream to find the next output
|
||||
|
@ -235,6 +255,10 @@ class CompactionIterator {
|
|||
// Do final preparations before presenting the output to the callee.
|
||||
void PrepareOutput();
|
||||
|
||||
// Decide the current key should be output to the last level or penultimate
|
||||
// level, only call for compaction supports per key placement
|
||||
void DecideOutputLevel();
|
||||
|
||||
// Passes the output value to the blob file builder (if any), and replaces it
|
||||
// with the corresponding blob reference if it has been actually written to a
|
||||
// blob file (i.e. if it passed the value size check). Returns true if the
|
||||
|
@ -417,6 +441,11 @@ class CompactionIterator {
|
|||
// just been zeroed out during bottommost compaction.
|
||||
bool last_key_seq_zeroed_{false};
|
||||
|
||||
// True if the current key should be output to the penultimate level if
|
||||
// possible, compaction logic makes the final decision on which level to
|
||||
// output to.
|
||||
bool output_to_penultimate_level_{false};
|
||||
|
||||
void AdvanceInputIter() { input_.Next(); }
|
||||
|
||||
void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
|
||||
|
|
|
@ -180,11 +180,21 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
|
|||
|
||||
const Compaction* real_compaction() const override { return nullptr; }
|
||||
|
||||
bool SupportsPerKeyPlacement() const override {
|
||||
return supports_per_key_placement;
|
||||
}
|
||||
|
||||
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
|
||||
return (!key.starts_with("unsafe_pb"));
|
||||
}
|
||||
|
||||
bool key_not_exists_beyond_output_level = false;
|
||||
|
||||
bool is_bottommost_level = false;
|
||||
|
||||
bool is_allow_ingest_behind = false;
|
||||
|
||||
bool supports_per_key_placement = false;
|
||||
};
|
||||
|
||||
// A simplified snapshot checker which assumes each snapshot has a global
|
||||
|
@ -254,6 +264,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
|
|||
compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
|
||||
compaction_proxy_->key_not_exists_beyond_output_level =
|
||||
key_not_exists_beyond_output_level;
|
||||
compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
|
||||
compaction.reset(compaction_proxy_);
|
||||
}
|
||||
bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
|
||||
|
@ -295,6 +306,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
|
|||
|
||||
virtual bool AllowIngestBehind() const { return false; }
|
||||
|
||||
virtual bool SupportsPerKeyPlacement() const { return false; }
|
||||
|
||||
void RunTest(
|
||||
const std::vector<std::string>& input_keys,
|
||||
const std::vector<std::string>& input_values,
|
||||
|
@ -756,6 +769,119 @@ TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
|
|||
INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
|
||||
testing::Values(true, false));
|
||||
|
||||
class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
|
||||
public:
|
||||
bool SupportsPerKeyPlacement() const override { return true; }
|
||||
};
|
||||
|
||||
TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
|
||||
std::atomic_uint64_t latest_cold_seq = 0;
|
||||
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
||||
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
||||
context->output_to_penultimate_level =
|
||||
context->seq_num > latest_cold_seq;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
latest_cold_seq = 5;
|
||||
|
||||
InitIterators(
|
||||
{test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
|
||||
test::KeyStr("c", 5, kTypeValue)},
|
||||
{"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
|
||||
nullptr, nullptr, true);
|
||||
c_iter_->SeekToFirst();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
|
||||
// the first 2 keys are hot, which should has
|
||||
// `output_to_penultimate_level()==true` and seq num not zeroed out
|
||||
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
// `a` is cold data, which should be output to bottommost
|
||||
ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_FALSE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_OK(c_iter_->status());
|
||||
ASSERT_FALSE(c_iter_->Valid());
|
||||
}
|
||||
|
||||
TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
|
||||
AddSnapshot(5);
|
||||
|
||||
InitIterators(
|
||||
{test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
|
||||
test::KeyStr("b", 5, kTypeValue)},
|
||||
{"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
|
||||
nullptr, nullptr, true);
|
||||
c_iter_->SeekToFirst();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
|
||||
// The first key and the tombstone are within snapshot, which should output
|
||||
// to the penultimate level (and seq num cannot be zeroed out).
|
||||
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
|
||||
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
// `a` is not protected by the snapshot, the sequence number is zero out and
|
||||
// should output bottommost
|
||||
ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_FALSE(c_iter_->output_to_penultimate_level());
|
||||
c_iter_->Next();
|
||||
ASSERT_OK(c_iter_->status());
|
||||
ASSERT_FALSE(c_iter_->Valid());
|
||||
}
|
||||
|
||||
TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
|
||||
std::atomic_uint64_t latest_cold_seq = 0;
|
||||
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
||||
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
||||
context->output_to_penultimate_level =
|
||||
context->seq_num > latest_cold_seq;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
latest_cold_seq = 6;
|
||||
|
||||
AddSnapshot(5);
|
||||
|
||||
InitIterators({test::KeyStr("a", 7, kTypeValue),
|
||||
test::KeyStr("unsafe_pb", 6, kTypeValue),
|
||||
test::KeyStr("c", 5, kTypeValue)},
|
||||
{"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
|
||||
kMaxSequenceNumber, nullptr, nullptr, true);
|
||||
c_iter_->SeekToFirst();
|
||||
ASSERT_TRUE(c_iter_->Valid());
|
||||
|
||||
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
||||
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
||||
// the 2nd key is unsafe to output_to_penultimate_level, but it's within
|
||||
// snapshot so for per_key_placement feature it has to be outputted to the
|
||||
// penultimate level. which is a corruption. We should never see
|
||||
// such case as the data with seq num (within snapshot) should always come
|
||||
// from higher compaction input level, which makes it safe to
|
||||
// output_to_penultimate_level.
|
||||
c_iter_->Next();
|
||||
ASSERT_TRUE(c_iter_->status().IsCorruption());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
|
||||
PerKeyPlacementCompIteratorTest,
|
||||
testing::Values(true, false));
|
||||
|
||||
// Tests how CompactionIterator work together with SnapshotChecker.
|
||||
class CompactionIteratorWithSnapshotCheckerTest
|
||||
: public CompactionIteratorTest {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -20,6 +20,7 @@
|
|||
#include "db/blob/blob_file_completion_callback.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/compaction/compaction_iterator.h"
|
||||
#include "db/compaction/compaction_outputs.h"
|
||||
#include "db/flush_scheduler.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/job_context.h"
|
||||
|
@ -47,6 +48,7 @@
|
|||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
class Arena;
|
||||
class CompactionState;
|
||||
class ErrorHandler;
|
||||
class MemTable;
|
||||
class SnapshotChecker;
|
||||
|
@ -56,11 +58,91 @@ class Version;
|
|||
class VersionEdit;
|
||||
class VersionSet;
|
||||
|
||||
class SubcompactionState;
|
||||
|
||||
// CompactionJob is responsible for executing the compaction. Each (manual or
|
||||
// automated) compaction corresponds to a CompactionJob object, and usually
|
||||
// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
|
||||
// will divide the compaction into subcompactions and execute them in parallel
|
||||
// if needed.
|
||||
//
|
||||
// CompactionJob has 2 main stats:
|
||||
// 1. CompactionJobStats compaction_job_stats_
|
||||
// CompactionJobStats is a public data structure which is part of Compaction
|
||||
// event listener that rocksdb share the job stats with the user.
|
||||
// Internally it's an aggregation of all the compaction_job_stats from each
|
||||
// `SubcompactionState`:
|
||||
// +------------------------+
|
||||
// | SubcompactionState |
|
||||
// | |
|
||||
// +--------->| compaction_job_stats |
|
||||
// | | |
|
||||
// | +------------------------+
|
||||
// +------------------------+ |
|
||||
// | CompactionJob | | +------------------------+
|
||||
// | | | | SubcompactionState |
|
||||
// | compaction_job_stats +-----+ | |
|
||||
// | | +--------->| compaction_job_stats |
|
||||
// | | | | |
|
||||
// +------------------------+ | +------------------------+
|
||||
// |
|
||||
// | +------------------------+
|
||||
// | | SubcompactionState |
|
||||
// | | |
|
||||
// +--------->+ compaction_job_stats |
|
||||
// | | |
|
||||
// | +------------------------+
|
||||
// |
|
||||
// | +------------------------+
|
||||
// | | ... |
|
||||
// +--------->+ |
|
||||
// +------------------------+
|
||||
//
|
||||
// 2. CompactionStatsFull compaction_stats_
|
||||
// `CompactionStatsFull` is an internal stats about the compaction, which
|
||||
// is eventually sent to `ColumnFamilyData::internal_stats_` and used for
|
||||
// logging and public metrics.
|
||||
// Internally, it's an aggregation of stats_ from each `SubcompactionState`.
|
||||
// It has 2 parts, normal stats about the main compaction information and
|
||||
// the penultimate level output stats.
|
||||
// `SubcompactionState` maintains the CompactionOutputs for normal output and
|
||||
// the penultimate level output if exists, the per_level stats is
|
||||
// stored with the outputs.
|
||||
// +---------------------------+
|
||||
// | SubcompactionState |
|
||||
// | |
|
||||
// | +----------------------+ |
|
||||
// | | CompactionOutputs | |
|
||||
// | | (normal output) | |
|
||||
// +---->| stats_ | |
|
||||
// | | +----------------------+ |
|
||||
// | | |
|
||||
// | | +----------------------+ |
|
||||
// +--------------------------------+ | | | CompactionOutputs | |
|
||||
// | CompactionJob | | | | (penultimate_level) | |
|
||||
// | | +--------->| stats_ | |
|
||||
// | compaction_stats_ | | | | +----------------------+ |
|
||||
// | +-------------------------+ | | | | |
|
||||
// | |stats (normal) |------|----+ +---------------------------+
|
||||
// | +-------------------------+ | | |
|
||||
// | | | |
|
||||
// | +-------------------------+ | | | +---------------------------+
|
||||
// | |penultimate_level_stats +------+ | | SubcompactionState |
|
||||
// | +-------------------------+ | | | | |
|
||||
// | | | | | +----------------------+ |
|
||||
// | | | | | | CompactionOutputs | |
|
||||
// +--------------------------------+ | | | | (normal output) | |
|
||||
// | +---->| stats_ | |
|
||||
// | | +----------------------+ |
|
||||
// | | |
|
||||
// | | +----------------------+ |
|
||||
// | | | CompactionOutputs | |
|
||||
// | | | (penultimate_level) | |
|
||||
// +--------->| stats_ | |
|
||||
// | +----------------------+ |
|
||||
// | |
|
||||
// +---------------------------+
|
||||
|
||||
class CompactionJob {
|
||||
public:
|
||||
CompactionJob(
|
||||
|
@ -107,11 +189,6 @@ class CompactionJob {
|
|||
IOStatus io_status() const { return io_status_; }
|
||||
|
||||
protected:
|
||||
struct SubcompactionState;
|
||||
// CompactionJob state
|
||||
struct CompactionState;
|
||||
|
||||
void AggregateStatistics();
|
||||
void UpdateCompactionStats();
|
||||
void LogCompaction();
|
||||
virtual void RecordCompactionIOStats();
|
||||
|
@ -122,7 +199,7 @@ class CompactionJob {
|
|||
void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
|
||||
|
||||
CompactionState* compact_;
|
||||
InternalStats::CompactionStats compaction_stats_;
|
||||
InternalStats::CompactionStatsFull compaction_stats_;
|
||||
const ImmutableDBOptions& db_options_;
|
||||
const MutableDBOptions mutable_db_options_copy_;
|
||||
LogBuffer* log_buffer_;
|
||||
|
@ -135,6 +212,8 @@ class CompactionJob {
|
|||
|
||||
IOStatus io_status_;
|
||||
|
||||
CompactionJobStats* compaction_job_stats_;
|
||||
|
||||
private:
|
||||
friend class CompactionJobTestBase;
|
||||
|
||||
|
@ -150,15 +229,14 @@ class CompactionJob {
|
|||
|
||||
// update the thread status for starting a compaction.
|
||||
void ReportStartedCompaction(Compaction* compaction);
|
||||
void AllocateCompactionOutputFileNumbers();
|
||||
|
||||
Status FinishCompactionOutputFile(
|
||||
const Status& input_status, SubcompactionState* sub_compact,
|
||||
CompactionRangeDelAggregator* range_del_agg,
|
||||
CompactionIterationStats* range_del_out_stats,
|
||||
const Slice* next_table_min_key = nullptr);
|
||||
Status FinishCompactionOutputFile(const Status& input_status,
|
||||
SubcompactionState* sub_compact,
|
||||
CompactionOutputs& outputs,
|
||||
const Slice& next_table_min_key);
|
||||
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
|
||||
Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
|
||||
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
|
||||
CompactionOutputs& outputs);
|
||||
void UpdateCompactionJobStats(
|
||||
const InternalStats::CompactionStats& stats) const;
|
||||
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
|
||||
|
@ -167,20 +245,12 @@ class CompactionJob {
|
|||
void UpdateCompactionInputStatsHelper(
|
||||
int* num_files, uint64_t* bytes_read, int input_level);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
void BuildSubcompactionJobInfo(
|
||||
SubcompactionState* sub_compact,
|
||||
SubcompactionJobInfo* subcompaction_job_info) const;
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
|
||||
|
||||
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
|
||||
|
||||
uint32_t job_id_;
|
||||
|
||||
CompactionJobStats* compaction_job_stats_;
|
||||
|
||||
// DBImpl state
|
||||
const std::string& dbname_;
|
||||
const std::string db_id_;
|
||||
|
@ -222,14 +292,12 @@ class CompactionJob {
|
|||
bool measure_io_stats_;
|
||||
// Stores the Slices that designate the boundaries for each subcompaction
|
||||
std::vector<Slice> boundaries_;
|
||||
// Stores the approx size of keys covered in the range of each subcompaction
|
||||
std::vector<uint64_t> sizes_;
|
||||
Env::Priority thread_pri_;
|
||||
std::string full_history_ts_low_;
|
||||
std::string trim_ts_;
|
||||
BlobFileCompletionCallback* blob_callback_;
|
||||
|
||||
uint64_t GetCompactionId(SubcompactionState* sub_compact);
|
||||
uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
|
||||
|
||||
// Get table file name in where it's outputting to, which should also be in
|
||||
// `output_directory_`.
|
||||
|
@ -265,7 +333,6 @@ struct CompactionServiceInput {
|
|||
std::string begin;
|
||||
bool has_end = false;
|
||||
std::string end;
|
||||
uint64_t approx_size = 0;
|
||||
|
||||
// serialization interface to read and write the object
|
||||
static Status Read(const std::string& data_str, CompactionServiceInput* obj);
|
||||
|
@ -357,7 +424,7 @@ class CompactionServiceCompactionJob : private CompactionJob {
|
|||
const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const std::atomic<bool>& manual_compaction_canceled,
|
||||
const std::string& db_id, const std::string& db_session_id,
|
||||
const std::string& output_path,
|
||||
std::string output_path,
|
||||
const CompactionServiceInput& compaction_service_input,
|
||||
CompactionServiceResult* compaction_service_result);
|
||||
|
||||
|
|
|
@ -482,6 +482,17 @@ class CompactionJobTestBase : public testing::Test {
|
|||
cfd_ = versions_->GetColumnFamilySet()->GetDefault();
|
||||
}
|
||||
|
||||
void RunLastLevelCompaction(
|
||||
const std::vector<std::vector<FileMetaData*>>& input_files,
|
||||
std::function<void(Compaction& comp)>&& verify_func,
|
||||
const std::vector<SequenceNumber>& snapshots = {}) {
|
||||
const int kLastLevel = cf_options_.num_levels - 1;
|
||||
verify_per_key_placement_ = std::move(verify_func);
|
||||
mock::KVVector empty_map;
|
||||
RunCompaction(input_files, empty_map, snapshots, kMaxSequenceNumber,
|
||||
kLastLevel, false);
|
||||
}
|
||||
|
||||
void RunCompaction(
|
||||
const std::vector<std::vector<FileMetaData*>>& input_files,
|
||||
const mock::KVVector& expected_results,
|
||||
|
@ -571,6 +582,12 @@ class CompactionJobTestBase : public testing::Test {
|
|||
if (check_get_priority) {
|
||||
CheckGetRateLimiterPriority(compaction_job);
|
||||
}
|
||||
|
||||
if (verify_per_key_placement_) {
|
||||
// Verify per_key_placement compaction
|
||||
assert(compaction.SupportsPerKeyPlacement());
|
||||
verify_per_key_placement_(compaction);
|
||||
}
|
||||
}
|
||||
|
||||
void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
|
||||
|
@ -620,6 +637,7 @@ class CompactionJobTestBase : public testing::Test {
|
|||
std::string full_history_ts_low_;
|
||||
const std::function<std::string(uint64_t)> encode_u64_ts_;
|
||||
bool test_io_priority_;
|
||||
std::function<void(Compaction& comp)> verify_per_key_placement_;
|
||||
};
|
||||
|
||||
// TODO(icanadi) Make it simpler once we mock out VersionSet
|
||||
|
@ -1311,6 +1329,75 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) {
|
|||
/* expected_oldest_blob_file_number */ 19);
|
||||
}
|
||||
|
||||
TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
|
||||
cf_options_.bottommost_temperature = Temperature::kCold;
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
||||
auto supports_per_key_placement = static_cast<bool*>(arg);
|
||||
*supports_per_key_placement = true;
|
||||
});
|
||||
|
||||
std::atomic_uint64_t latest_cold_seq = 0;
|
||||
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
||||
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
||||
context->output_to_penultimate_level =
|
||||
context->seq_num > latest_cold_seq;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
NewDB();
|
||||
|
||||
// Add files on different levels that may overlap
|
||||
auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
|
||||
AddMockFile(file0_1);
|
||||
|
||||
auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
|
||||
{KeyStr("f", 11U, kTypeValue), "val"}});
|
||||
AddMockFile(file1_1, 1);
|
||||
auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
|
||||
{KeyStr("k", 13U, kTypeValue), "val"}});
|
||||
AddMockFile(file1_2, 1);
|
||||
auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
|
||||
{KeyStr("u", 15U, kTypeValue), "val"}});
|
||||
AddMockFile(file1_3, 1);
|
||||
|
||||
auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
|
||||
{KeyStr("h", 9U, kTypeValue), "val"}});
|
||||
AddMockFile(file2_1, 2);
|
||||
auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
|
||||
{KeyStr("p", 7U, kTypeValue), "val"}});
|
||||
AddMockFile(file2_2, 2);
|
||||
|
||||
auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
|
||||
{KeyStr("k", 3U, kTypeValue), "val"}});
|
||||
AddMockFile(file3_1, 3);
|
||||
auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
|
||||
{KeyStr("x", 5U, kTypeValue), "val"}});
|
||||
AddMockFile(file3_2, 3);
|
||||
|
||||
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
|
||||
auto files0 = cfd->current()->storage_info()->LevelFiles(0);
|
||||
auto files1 = cfd->current()->storage_info()->LevelFiles(1);
|
||||
auto files2 = cfd->current()->storage_info()->LevelFiles(2);
|
||||
auto files3 = cfd->current()->storage_info()->LevelFiles(3);
|
||||
|
||||
RunLastLevelCompaction(
|
||||
{files0, files1, files2, files3}, /*verify_func=*/[&](Compaction& comp) {
|
||||
for (char c = 'a'; c <= 'z'; c++) {
|
||||
std::string c_str;
|
||||
c_str = c;
|
||||
const Slice key(c_str);
|
||||
if (c == 'a') {
|
||||
ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
|
||||
} else {
|
||||
ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
|
||||
db_options_.enforce_single_del_contracts = false;
|
||||
NewDB();
|
||||
|
@ -1360,7 +1447,6 @@ TEST_F(CompactionJobTest, InputSerialization) {
|
|||
if (input.has_end) {
|
||||
input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
|
||||
}
|
||||
input.approx_size = rnd64.Uniform(UINT64_MAX);
|
||||
|
||||
std::string output;
|
||||
ASSERT_OK(input.Write(&output));
|
||||
|
|
|
@ -0,0 +1,314 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction/compaction_outputs.h"
|
||||
|
||||
#include "db/builder.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
|
||||
builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
|
||||
}
|
||||
|
||||
Status CompactionOutputs::Finish(const Status& intput_status) {
|
||||
FileMetaData* meta = GetMetaData();
|
||||
assert(meta != nullptr);
|
||||
Status s = intput_status;
|
||||
if (s.ok()) {
|
||||
s = builder_->Finish();
|
||||
} else {
|
||||
builder_->Abandon();
|
||||
}
|
||||
Status io_s = builder_->io_status();
|
||||
if (s.ok()) {
|
||||
s = io_s;
|
||||
} else {
|
||||
io_s.PermitUncheckedError();
|
||||
}
|
||||
const uint64_t current_bytes = builder_->FileSize();
|
||||
if (s.ok()) {
|
||||
meta->fd.file_size = current_bytes;
|
||||
meta->marked_for_compaction = builder_->NeedCompact();
|
||||
}
|
||||
current_output().finished = true;
|
||||
stats_.bytes_written += current_bytes;
|
||||
stats_.num_output_files = outputs_.size();
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
|
||||
SystemClock* clock,
|
||||
Statistics* statistics,
|
||||
bool use_fsync) {
|
||||
IOStatus io_s;
|
||||
if (input_status.ok()) {
|
||||
StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
|
||||
io_s = file_writer_->Sync(use_fsync);
|
||||
}
|
||||
if (input_status.ok() && io_s.ok()) {
|
||||
io_s = file_writer_->Close();
|
||||
}
|
||||
|
||||
if (input_status.ok() && io_s.ok()) {
|
||||
FileMetaData* meta = GetMetaData();
|
||||
meta->file_checksum = file_writer_->GetFileChecksum();
|
||||
meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
|
||||
}
|
||||
|
||||
file_writer_.reset();
|
||||
|
||||
return io_s;
|
||||
}
|
||||
|
||||
Status CompactionOutputs::AddToOutput(
|
||||
const CompactionIterator& c_iter,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func) {
|
||||
Status s;
|
||||
const Slice& key = c_iter.key();
|
||||
|
||||
if (!pending_close_ && c_iter.Valid() && partitioner_ && HasBuilder() &&
|
||||
partitioner_->ShouldPartition(
|
||||
PartitionerRequest(last_key_for_partitioner_, c_iter.user_key(),
|
||||
current_output_file_size_)) == kRequired) {
|
||||
pending_close_ = true;
|
||||
}
|
||||
|
||||
if (pending_close_) {
|
||||
s = close_file_func(*this, c_iter.InputStatus(), key);
|
||||
pending_close_ = false;
|
||||
}
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Open output file if necessary
|
||||
if (!HasBuilder()) {
|
||||
s = open_file_func(*this);
|
||||
}
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
Output& curr = current_output();
|
||||
assert(builder_ != nullptr);
|
||||
const Slice& value = c_iter.value();
|
||||
s = curr.validator.Add(key, value);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
builder_->Add(key, value);
|
||||
|
||||
stats_.num_output_records++;
|
||||
current_output_file_size_ = builder_->EstimatedFileSize();
|
||||
|
||||
if (blob_garbage_meter_) {
|
||||
s = blob_garbage_meter_->ProcessOutFlow(key, value);
|
||||
}
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
const ParsedInternalKey& ikey = c_iter.ikey();
|
||||
s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
|
||||
ikey.type);
|
||||
|
||||
// Close output file if it is big enough. Two possibilities determine it's
|
||||
// time to close it: (1) the current key should be this file's last key, (2)
|
||||
// the next key should not be in this file.
|
||||
//
|
||||
// TODO(aekmekji): determine if file should be closed earlier than this
|
||||
// during subcompactions (i.e. if output size, estimated by input size, is
|
||||
// going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
|
||||
// and 0.6MB instead of 1MB and 0.2MB)
|
||||
if (compaction_->output_level() != 0 &&
|
||||
current_output_file_size_ >= compaction_->max_output_file_size()) {
|
||||
pending_close_ = true;
|
||||
}
|
||||
|
||||
if (partitioner_) {
|
||||
last_key_for_partitioner_.assign(c_iter.user_key().data_,
|
||||
c_iter.user_key().size_);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
Status CompactionOutputs::AddRangeDels(
|
||||
const Slice* comp_start, const Slice* comp_end,
|
||||
CompactionIterationStats& range_del_out_stats, bool bottommost_level,
|
||||
const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
|
||||
const Slice& next_table_min_key) {
|
||||
assert(HasRangeDel());
|
||||
FileMetaData& meta = current_output().meta;
|
||||
const Comparator* ucmp = icmp.user_comparator();
|
||||
|
||||
Slice lower_bound_guard, upper_bound_guard;
|
||||
std::string smallest_user_key;
|
||||
const Slice *lower_bound, *upper_bound;
|
||||
bool lower_bound_from_sub_compact = false;
|
||||
|
||||
size_t output_size = outputs_.size();
|
||||
if (output_size == 1) {
|
||||
// For the first output table, include range tombstones before the min
|
||||
// key but after the subcompaction boundary.
|
||||
lower_bound = comp_start;
|
||||
lower_bound_from_sub_compact = true;
|
||||
} else if (meta.smallest.size() > 0) {
|
||||
// For subsequent output tables, only include range tombstones from min
|
||||
// key onwards since the previous file was extended to contain range
|
||||
// tombstones falling before min key.
|
||||
smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
|
||||
lower_bound_guard = Slice(smallest_user_key);
|
||||
lower_bound = &lower_bound_guard;
|
||||
} else {
|
||||
lower_bound = nullptr;
|
||||
}
|
||||
if (!next_table_min_key.empty()) {
|
||||
// This may be the last file in the subcompaction in some cases, so we
|
||||
// need to compare the end key of subcompaction with the next file start
|
||||
// key. When the end key is chosen by the subcompaction, we know that
|
||||
// it must be the biggest key in output file. Therefore, it is safe to
|
||||
// use the smaller key as the upper bound of the output file, to ensure
|
||||
// that there is no overlapping between different output files.
|
||||
upper_bound_guard = ExtractUserKey(next_table_min_key);
|
||||
if (comp_end != nullptr &&
|
||||
ucmp->Compare(upper_bound_guard, *comp_end) >= 0) {
|
||||
upper_bound = comp_end;
|
||||
} else {
|
||||
upper_bound = &upper_bound_guard;
|
||||
}
|
||||
} else {
|
||||
// This is the last file in the subcompaction, so extend until the
|
||||
// subcompaction ends.
|
||||
upper_bound = comp_end;
|
||||
}
|
||||
bool has_overlapping_endpoints;
|
||||
if (upper_bound != nullptr && meta.largest.size() > 0) {
|
||||
has_overlapping_endpoints =
|
||||
ucmp->Compare(meta.largest.user_key(), *upper_bound) == 0;
|
||||
} else {
|
||||
has_overlapping_endpoints = false;
|
||||
}
|
||||
|
||||
// The end key of the subcompaction must be bigger or equal to the upper
|
||||
// bound. If the end of subcompaction is null or the upper bound is null,
|
||||
// it means that this file is the last file in the compaction. So there
|
||||
// will be no overlapping between this file and others.
|
||||
assert(comp_end == nullptr || upper_bound == nullptr ||
|
||||
ucmp->Compare(*upper_bound, *comp_end) <= 0);
|
||||
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
|
||||
has_overlapping_endpoints);
|
||||
// Position the range tombstone output iterator. There may be tombstone
|
||||
// fragments that are entirely out of range, so make sure that we do not
|
||||
// include those.
|
||||
if (lower_bound != nullptr) {
|
||||
it->Seek(*lower_bound);
|
||||
} else {
|
||||
it->SeekToFirst();
|
||||
}
|
||||
for (; it->Valid(); it->Next()) {
|
||||
auto tombstone = it->Tombstone();
|
||||
if (upper_bound != nullptr) {
|
||||
int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
|
||||
if ((has_overlapping_endpoints && cmp < 0) ||
|
||||
(!has_overlapping_endpoints && cmp <= 0)) {
|
||||
// Tombstones starting after upper_bound only need to be included in
|
||||
// the next table. If the current SST ends before upper_bound, i.e.,
|
||||
// `has_overlapping_endpoints == false`, we can also skip over range
|
||||
// tombstones that start exactly at upper_bound. Such range
|
||||
// tombstones will be included in the next file and are not relevant
|
||||
// to the point keys or endpoints of the current file.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (bottommost_level && tombstone.seq_ <= earliest_snapshot) {
|
||||
// TODO(andrewkr): tombstones that span multiple output files are
|
||||
// counted for each compaction output file, so lots of double
|
||||
// counting.
|
||||
range_del_out_stats.num_range_del_drop_obsolete++;
|
||||
range_del_out_stats.num_record_drop_obsolete++;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto kv = tombstone.Serialize();
|
||||
assert(lower_bound == nullptr ||
|
||||
ucmp->Compare(*lower_bound, kv.second) < 0);
|
||||
// Range tombstone is not supported by output validator yet.
|
||||
builder_->Add(kv.first.Encode(), kv.second);
|
||||
InternalKey smallest_candidate = std::move(kv.first);
|
||||
if (lower_bound != nullptr &&
|
||||
ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
|
||||
// Pretend the smallest key has the same user key as lower_bound
|
||||
// (the max key in the previous table or subcompaction) in order for
|
||||
// files to appear key-space partitioned.
|
||||
//
|
||||
// When lower_bound is chosen by a subcompaction, we know that
|
||||
// subcompactions over smaller keys cannot contain any keys at
|
||||
// lower_bound. We also know that smaller subcompactions exist,
|
||||
// because otherwise the subcompaction woud be unbounded on the left.
|
||||
// As a result, we know that no other files on the output level will
|
||||
// contain actual keys at lower_bound (an output file may have a
|
||||
// largest key of lower_bound@kMaxSequenceNumber, but this only
|
||||
// indicates a large range tombstone was truncated). Therefore, it is
|
||||
// safe to use the tombstone's sequence number, to ensure that keys at
|
||||
// lower_bound at lower levels are covered by truncated tombstones.
|
||||
//
|
||||
// If lower_bound was chosen by the smallest data key in the file,
|
||||
// choose lowest seqnum so this file's smallest internal key comes
|
||||
// after the previous file's largest. The fake seqnum is OK because
|
||||
// the read path's file-picking code only considers user key.
|
||||
smallest_candidate = InternalKey(
|
||||
*lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
|
||||
kTypeRangeDeletion);
|
||||
}
|
||||
InternalKey largest_candidate = tombstone.SerializeEndKey();
|
||||
if (upper_bound != nullptr &&
|
||||
ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
|
||||
// Pretend the largest key has the same user key as upper_bound (the
|
||||
// min key in the following table or subcompaction) in order for files
|
||||
// to appear key-space partitioned.
|
||||
//
|
||||
// Choose highest seqnum so this file's largest internal key comes
|
||||
// before the next file's/subcompaction's smallest. The fake seqnum is
|
||||
// OK because the read path's file-picking code only considers the
|
||||
// user key portion.
|
||||
//
|
||||
// Note Seek() also creates InternalKey with (user_key,
|
||||
// kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
|
||||
// kTypeRangeDeletion (0xF), so the range tombstone comes before the
|
||||
// Seek() key in InternalKey's ordering. So Seek() will look in the
|
||||
// next file for the user key.
|
||||
largest_candidate =
|
||||
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
|
||||
if (meta.smallest.size() > 0) {
|
||||
smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
|
||||
}
|
||||
#endif
|
||||
meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
|
||||
tombstone.seq_, icmp);
|
||||
// The smallest key in a file is used for range tombstone truncation, so
|
||||
// it cannot have a seqnum of 0 (unless the smallest data key in a file
|
||||
// has a seqnum of 0). Otherwise, the truncated tombstone may expose
|
||||
// deleted keys at lower levels.
|
||||
assert(smallest_ikey_seqnum == 0 ||
|
||||
ExtractInternalKeyFooter(meta.smallest.Encode()) !=
|
||||
PackSequenceAndType(0, kTypeRangeDeletion));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
|
@ -0,0 +1,328 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "db/blob/blob_garbage_meter.h"
|
||||
#include "db/compaction/compaction.h"
|
||||
#include "db/compaction/compaction_iterator.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/output_validator.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
class CompactionOutputs;
|
||||
using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
|
||||
using CompactionFileCloseFunc =
|
||||
std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
|
||||
|
||||
// Files produced by subcompaction, most of the functions are used by
|
||||
// compaction_job Open/Close compaction file functions.
|
||||
class CompactionOutputs {
|
||||
public:
|
||||
// compaction output file
|
||||
struct Output {
|
||||
Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
|
||||
bool _enable_order_check, bool _enable_hash, bool _finished,
|
||||
uint64_t precalculated_hash)
|
||||
: meta(std::move(_meta)),
|
||||
validator(_icmp, _enable_order_check, _enable_hash,
|
||||
precalculated_hash),
|
||||
finished(_finished) {}
|
||||
FileMetaData meta;
|
||||
OutputValidator validator;
|
||||
bool finished;
|
||||
std::shared_ptr<const TableProperties> table_properties;
|
||||
};
|
||||
|
||||
CompactionOutputs() = delete;
|
||||
|
||||
explicit CompactionOutputs(const Compaction* compaction,
|
||||
const bool is_penultimate_level)
|
||||
: compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
|
||||
partitioner_ = compaction->output_level() == 0
|
||||
? nullptr
|
||||
: compaction->CreateSstPartitioner();
|
||||
}
|
||||
|
||||
// Add generated output to the list
|
||||
void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
|
||||
bool enable_order_check, bool enable_hash,
|
||||
bool finished = false, uint64_t precalculated_hash = 0) {
|
||||
outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
|
||||
enable_hash, finished, precalculated_hash);
|
||||
}
|
||||
|
||||
// Set new table builder for the current output
|
||||
void NewBuilder(const TableBuilderOptions& tboptions);
|
||||
|
||||
// Assign a new WritableFileWriter to the current output
|
||||
void AssignFileWriter(WritableFileWriter* writer) {
|
||||
file_writer_.reset(writer);
|
||||
}
|
||||
|
||||
// TODO: Remove it when remote compaction support tiered compaction
|
||||
void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
|
||||
void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
|
||||
|
||||
// TODO: Move the BlobDB builder into CompactionOutputs
|
||||
const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
|
||||
if (is_penultimate_level_) {
|
||||
assert(blob_file_additions_.empty());
|
||||
}
|
||||
return blob_file_additions_;
|
||||
}
|
||||
|
||||
std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
|
||||
assert(!is_penultimate_level_);
|
||||
return &blob_file_additions_;
|
||||
}
|
||||
|
||||
bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
|
||||
|
||||
BlobGarbageMeter* CreateBlobGarbageMeter() {
|
||||
assert(!is_penultimate_level_);
|
||||
blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
|
||||
return blob_garbage_meter_.get();
|
||||
}
|
||||
|
||||
BlobGarbageMeter* GetBlobGarbageMeter() const {
|
||||
if (is_penultimate_level_) {
|
||||
// blobdb doesn't support per_key_placement yet
|
||||
assert(blob_garbage_meter_ == nullptr);
|
||||
return nullptr;
|
||||
}
|
||||
return blob_garbage_meter_.get();
|
||||
}
|
||||
|
||||
void UpdateBlobStats() {
|
||||
assert(!is_penultimate_level_);
|
||||
stats_.num_output_files_blob = blob_file_additions_.size();
|
||||
for (const auto& blob : blob_file_additions_) {
|
||||
stats_.bytes_written_blob += blob.GetTotalBlobBytes();
|
||||
}
|
||||
}
|
||||
|
||||
// Finish the current output file
|
||||
Status Finish(const Status& intput_status);
|
||||
|
||||
// Update output table properties from table builder
|
||||
void UpdateTableProperties() {
|
||||
current_output().table_properties =
|
||||
std::make_shared<TableProperties>(GetTableProperties());
|
||||
}
|
||||
|
||||
IOStatus WriterSyncClose(const Status& intput_status, SystemClock* clock,
|
||||
Statistics* statistics, bool use_fsync);
|
||||
|
||||
TableProperties GetTableProperties() {
|
||||
return builder_->GetTableProperties();
|
||||
}
|
||||
|
||||
Slice SmallestUserKey() const {
|
||||
if (!outputs_.empty() && outputs_[0].finished) {
|
||||
return outputs_[0].meta.smallest.user_key();
|
||||
} else {
|
||||
return Slice{nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
Slice LargestUserKey() const {
|
||||
if (!outputs_.empty() && outputs_.back().finished) {
|
||||
return outputs_.back().meta.largest.user_key();
|
||||
} else {
|
||||
return Slice{nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
// In case the last output file is empty, which doesn't need to keep.
|
||||
void RemoveLastEmptyOutput() {
|
||||
if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
|
||||
// An error occurred, so ignore the last output.
|
||||
outputs_.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
// Remove the last output, for example the last output doesn't have data (no
|
||||
// entry and no range-dels), but file_size might not be 0, as it has SST
|
||||
// metadata.
|
||||
void RemoveLastOutput() {
|
||||
assert(!outputs_.empty());
|
||||
outputs_.pop_back();
|
||||
}
|
||||
|
||||
bool HasBuilder() const { return builder_ != nullptr; }
|
||||
|
||||
FileMetaData* GetMetaData() { return ¤t_output().meta; }
|
||||
|
||||
bool HasOutput() const { return !outputs_.empty(); }
|
||||
|
||||
uint64_t NumEntries() const { return builder_->NumEntries(); }
|
||||
|
||||
void ResetBuilder() {
|
||||
builder_.reset();
|
||||
current_output_file_size_ = 0;
|
||||
}
|
||||
|
||||
// Add range-dels from the aggregator to the current output file
|
||||
Status AddRangeDels(const Slice* comp_start, const Slice* comp_end,
|
||||
CompactionIterationStats& range_del_out_stats,
|
||||
bool bottommost_level, const InternalKeyComparator& icmp,
|
||||
SequenceNumber earliest_snapshot,
|
||||
const Slice& next_table_min_key);
|
||||
|
||||
// Is the current file is already pending for close
|
||||
bool IsPendingClose() const { return pending_close_; }
|
||||
|
||||
// Current file should close before adding a new key
|
||||
void SetPendingClose() { pending_close_ = true; }
|
||||
|
||||
// if the outputs have range delete, range delete is also data
|
||||
bool HasRangeDel() const {
|
||||
return range_del_agg_ && !range_del_agg_->IsEmpty();
|
||||
}
|
||||
|
||||
private:
|
||||
friend class SubcompactionState;
|
||||
|
||||
void Cleanup() {
|
||||
if (builder_ != nullptr) {
|
||||
// May happen if we get a shutdown call in the middle of compaction
|
||||
builder_->Abandon();
|
||||
builder_.reset();
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t GetCurrentOutputFileSize() const {
|
||||
return current_output_file_size_;
|
||||
}
|
||||
|
||||
// Add curent key from compaction_iterator to the output file. If needed
|
||||
// close and open new compaction output with the functions provided.
|
||||
Status AddToOutput(const CompactionIterator& c_iter,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func);
|
||||
|
||||
// Close the current output. `open_file_func` is needed for creating new file
|
||||
// for range-dels only output file.
|
||||
Status CloseOutput(const Status& curr_status,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func) {
|
||||
Status status = curr_status;
|
||||
// handle subcompaction containing only range deletions
|
||||
if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
|
||||
status = open_file_func(*this);
|
||||
}
|
||||
if (HasBuilder()) {
|
||||
const Slice empty_key{};
|
||||
Status s = close_file_func(*this, status, empty_key);
|
||||
if (!s.ok() && status.ok()) {
|
||||
status = s;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// This subcompaction's output could be empty if compaction was aborted before
|
||||
// this subcompaction had a chance to generate any output files. When
|
||||
// subcompactions are executed sequentially this is more likely and will be
|
||||
// particularly likely for the later subcompactions to be empty. Once they are
|
||||
// run in parallel however it should be much rarer.
|
||||
// It's caller's responsibility to make sure it's not empty.
|
||||
Output& current_output() {
|
||||
assert(!outputs_.empty());
|
||||
return outputs_.back();
|
||||
}
|
||||
|
||||
// Assign the range_del_agg to the target output level. There's only one
|
||||
// range-del-aggregator per compaction outputs, for
|
||||
// output_to_penultimate_level compaction it is only assigned to the
|
||||
// penultimate level.
|
||||
void AssignRangeDelAggregator(
|
||||
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
|
||||
assert(range_del_agg_ == nullptr);
|
||||
range_del_agg_ = std::move(range_del_agg);
|
||||
}
|
||||
|
||||
const Compaction* compaction_;
|
||||
|
||||
// The current file is pending close, which needs to run `close_file_func()`
|
||||
// first to add a new key.
|
||||
bool pending_close_ = false;
|
||||
|
||||
// current output builder and writer
|
||||
std::unique_ptr<TableBuilder> builder_;
|
||||
std::unique_ptr<WritableFileWriter> file_writer_;
|
||||
uint64_t current_output_file_size_ = 0;
|
||||
|
||||
// all the compaction outputs so far
|
||||
std::vector<Output> outputs_;
|
||||
|
||||
// BlobDB info
|
||||
std::vector<BlobFileAddition> blob_file_additions_;
|
||||
std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
|
||||
|
||||
// Basic compaction output stats for this level's outputs
|
||||
InternalStats::CompactionOutputsStats stats_;
|
||||
|
||||
// indicate if this CompactionOutputs obj for penultimate_level, should always
|
||||
// be false if per_key_placement feature is not enabled.
|
||||
const bool is_penultimate_level_;
|
||||
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
|
||||
|
||||
// partitioner information
|
||||
std::string last_key_for_partitioner_;
|
||||
std::unique_ptr<SstPartitioner> partitioner_;
|
||||
};
|
||||
|
||||
// helper struct to concatenate the last level and penultimate level outputs
|
||||
// which could be replaced by std::ranges::join_view() in c++20
|
||||
struct OutputIterator {
|
||||
public:
|
||||
explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
|
||||
const std::vector<CompactionOutputs::Output>& b)
|
||||
: a_(a), b_(b) {
|
||||
within_a = !a_.empty();
|
||||
idx_ = 0;
|
||||
}
|
||||
|
||||
OutputIterator begin() { return *this; }
|
||||
|
||||
OutputIterator end() { return *this; }
|
||||
|
||||
size_t size() { return a_.size() + b_.size(); }
|
||||
|
||||
const CompactionOutputs::Output& operator*() const {
|
||||
return within_a ? a_[idx_] : b_[idx_];
|
||||
}
|
||||
|
||||
OutputIterator& operator++() {
|
||||
idx_++;
|
||||
if (within_a && idx_ >= a_.size()) {
|
||||
within_a = false;
|
||||
idx_ = 0;
|
||||
}
|
||||
assert(within_a || idx_ <= b_.size());
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool operator!=(const OutputIterator& /*rhs*/) const {
|
||||
return within_a || idx_ < b_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
const std::vector<CompactionOutputs::Output>& a_;
|
||||
const std::vector<CompactionOutputs::Output>& b_;
|
||||
bool within_a;
|
||||
size_t idx_;
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
|
@ -214,13 +214,13 @@ void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
|
|||
}
|
||||
|
||||
void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
|
||||
InternalKey* smallest,
|
||||
InternalKey* largest) const {
|
||||
InternalKey* smallest, InternalKey* largest,
|
||||
int exclude_level) const {
|
||||
InternalKey current_smallest;
|
||||
InternalKey current_largest;
|
||||
bool initialized = false;
|
||||
for (const auto& in : inputs) {
|
||||
if (in.empty()) {
|
||||
if (in.empty() || in.level == exclude_level) {
|
||||
continue;
|
||||
}
|
||||
GetRange(in, ¤t_smallest, ¤t_largest);
|
||||
|
@ -293,6 +293,12 @@ bool CompactionPicker::RangeOverlapWithCompaction(
|
|||
// Overlap
|
||||
return true;
|
||||
}
|
||||
if (c->SupportsPerKeyPlacement()) {
|
||||
if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
|
||||
largest_user_key)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Did not overlap with any running compaction in level `level`
|
||||
return false;
|
||||
|
@ -301,9 +307,11 @@ bool CompactionPicker::RangeOverlapWithCompaction(
|
|||
bool CompactionPicker::FilesRangeOverlapWithCompaction(
|
||||
const std::vector<CompactionInputFiles>& inputs, int level) const {
|
||||
bool is_empty = true;
|
||||
int start_level = -1;
|
||||
for (auto& in : inputs) {
|
||||
if (!in.empty()) {
|
||||
is_empty = false;
|
||||
start_level = in.level; // inputs are sorted by level
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -313,7 +321,19 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction(
|
|||
}
|
||||
|
||||
InternalKey smallest, largest;
|
||||
GetRange(inputs, &smallest, &largest);
|
||||
GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
|
||||
int penultimate_level =
|
||||
Compaction::EvaluatePenultimateLevel(ioptions_, start_level, level);
|
||||
if (penultimate_level != Compaction::kInvalidLevel) {
|
||||
InternalKey penultimate_smallest, penultimate_largest;
|
||||
GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
|
||||
if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
|
||||
penultimate_largest.user_key(),
|
||||
penultimate_level)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
|
||||
level);
|
||||
}
|
||||
|
|
|
@ -154,7 +154,8 @@ class CompactionPicker {
|
|||
// in *smallest, *largest.
|
||||
// REQUIRES: inputs is not empty (at least on entry have one file)
|
||||
void GetRange(const std::vector<CompactionInputFiles>& inputs,
|
||||
InternalKey* smallest, InternalKey* largest) const;
|
||||
InternalKey* smallest, InternalKey* largest,
|
||||
int exclude_level) const;
|
||||
|
||||
int NumberLevels() const { return ioptions_.num_levels; }
|
||||
|
||||
|
|
|
@ -430,8 +430,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
|
|||
#ifndef ROCKSDB_LITE
|
||||
TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
|
||||
NewVersionStorage(1, kCompactionStyleUniversal);
|
||||
UniversalCompactionPicker universal_compaction_picker(
|
||||
ioptions_, &icmp_);
|
||||
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
|
||||
UpdateVersionStorageInfo();
|
||||
// must return false when there's no files.
|
||||
ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
|
||||
|
@ -3048,6 +3047,192 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
|
|||
ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
|
||||
}
|
||||
|
||||
class PerKeyPlacementCompactionPickerTest
|
||||
: public CompactionPickerTest,
|
||||
public testing::WithParamInterface<bool> {
|
||||
public:
|
||||
PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
|
||||
|
||||
void SetUp() override { enable_per_key_placement_ = GetParam(); }
|
||||
|
||||
protected:
|
||||
bool enable_per_key_placement_ = false;
|
||||
};
|
||||
|
||||
TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
||||
auto supports_per_key_placement = static_cast<bool*>(arg);
|
||||
*supports_per_key_placement = enable_per_key_placement_;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
int num_levels = ioptions_.num_levels;
|
||||
NewVersionStorage(num_levels, kCompactionStyleLevel);
|
||||
|
||||
Add(0, 21U, "100", "150", 60000000U);
|
||||
Add(0, 22U, "300", "350", 60000000U);
|
||||
Add(5, 40U, "200", "250", 60000000U);
|
||||
Add(6, 50U, "101", "351", 60000000U);
|
||||
UpdateVersionStorageInfo();
|
||||
|
||||
CompactionOptions comp_options;
|
||||
std::unordered_set<uint64_t> input_set;
|
||||
input_set.insert(40);
|
||||
std::vector<CompactionInputFiles> input_files;
|
||||
ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
|
||||
comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
|
||||
mutable_db_options_, 0));
|
||||
|
||||
input_set.clear();
|
||||
input_files.clear();
|
||||
input_set.insert(21);
|
||||
input_set.insert(22);
|
||||
input_set.insert(50);
|
||||
ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
ASSERT_EQ(
|
||||
enable_per_key_placement_,
|
||||
level_compaction_picker.FilesRangeOverlapWithCompaction(input_files, 6));
|
||||
}
|
||||
|
||||
TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
||||
auto supports_per_key_placement = static_cast<bool*>(arg);
|
||||
*supports_per_key_placement = enable_per_key_placement_;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
int num_levels = ioptions_.num_levels;
|
||||
NewVersionStorage(num_levels, kCompactionStyleLevel);
|
||||
|
||||
Add(0, 21U, "100", "150", 60000000U);
|
||||
Add(0, 22U, "300", "350", 60000000U);
|
||||
Add(4, 40U, "200", "220", 60000000U);
|
||||
Add(4, 41U, "230", "250", 60000000U);
|
||||
Add(6, 50U, "101", "351", 60000000U);
|
||||
UpdateVersionStorageInfo();
|
||||
|
||||
CompactionOptions comp_options;
|
||||
std::unordered_set<uint64_t> input_set;
|
||||
input_set.insert(21);
|
||||
input_set.insert(22);
|
||||
input_set.insert(50);
|
||||
std::vector<CompactionInputFiles> input_files;
|
||||
ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
|
||||
comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
|
||||
mutable_db_options_, 0));
|
||||
|
||||
input_set.clear();
|
||||
input_files.clear();
|
||||
input_set.insert(40);
|
||||
input_set.insert(41);
|
||||
ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
ASSERT_EQ(
|
||||
enable_per_key_placement_,
|
||||
level_compaction_picker.FilesRangeOverlapWithCompaction(input_files, 5));
|
||||
}
|
||||
|
||||
TEST_P(PerKeyPlacementCompactionPickerTest,
|
||||
OverlapWithNormalCompactionUniveral) {
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
||||
auto supports_per_key_placement = static_cast<bool*>(arg);
|
||||
*supports_per_key_placement = enable_per_key_placement_;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
int num_levels = ioptions_.num_levels;
|
||||
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
|
||||
NewVersionStorage(num_levels, kCompactionStyleUniversal);
|
||||
|
||||
Add(0, 21U, "100", "150", 60000000U);
|
||||
Add(0, 22U, "300", "350", 60000000U);
|
||||
Add(5, 40U, "200", "250", 60000000U);
|
||||
Add(6, 50U, "101", "351", 60000000U);
|
||||
UpdateVersionStorageInfo();
|
||||
|
||||
CompactionOptions comp_options;
|
||||
std::unordered_set<uint64_t> input_set;
|
||||
input_set.insert(40);
|
||||
std::vector<CompactionInputFiles> input_files;
|
||||
ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
|
||||
comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
|
||||
mutable_db_options_, 0));
|
||||
|
||||
input_set.clear();
|
||||
input_files.clear();
|
||||
input_set.insert(21);
|
||||
input_set.insert(22);
|
||||
input_set.insert(50);
|
||||
ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
ASSERT_EQ(enable_per_key_placement_,
|
||||
universal_compaction_picker.FilesRangeOverlapWithCompaction(
|
||||
input_files, 6));
|
||||
}
|
||||
|
||||
TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
||||
auto supports_per_key_placement = static_cast<bool*>(arg);
|
||||
*supports_per_key_placement = enable_per_key_placement_;
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
int num_levels = ioptions_.num_levels;
|
||||
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
|
||||
NewVersionStorage(num_levels, kCompactionStyleUniversal);
|
||||
|
||||
Add(0, 21U, "100", "150", 60000000U);
|
||||
Add(0, 22U, "300", "350", 60000000U);
|
||||
Add(4, 40U, "200", "220", 60000000U);
|
||||
Add(4, 41U, "230", "250", 60000000U);
|
||||
Add(6, 50U, "101", "351", 60000000U);
|
||||
UpdateVersionStorageInfo();
|
||||
|
||||
CompactionOptions comp_options;
|
||||
std::unordered_set<uint64_t> input_set;
|
||||
input_set.insert(21);
|
||||
input_set.insert(22);
|
||||
input_set.insert(50);
|
||||
std::vector<CompactionInputFiles> input_files;
|
||||
ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
|
||||
comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
|
||||
mutable_db_options_, 0));
|
||||
|
||||
input_set.clear();
|
||||
input_files.clear();
|
||||
input_set.insert(40);
|
||||
input_set.insert(41);
|
||||
ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
|
||||
&input_files, &input_set, vstorage_.get(), comp_options));
|
||||
|
||||
ASSERT_EQ(enable_per_key_placement_,
|
||||
universal_compaction_picker.FilesRangeOverlapWithCompaction(
|
||||
input_files, 5));
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
|
||||
PerKeyPlacementCompactionPickerTest, ::testing::Bool());
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -0,0 +1,825 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction/compaction_job.h"
|
||||
#include "db/compaction/compaction_state.h"
|
||||
#include "logging/logging.h"
|
||||
#include "monitoring/iostats_context_imp.h"
|
||||
#include "monitoring/thread_status_util.h"
|
||||
#include "options/options_helper.h"
|
||||
#include "rocksdb/utilities/options_type.h"
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
class SubcompactionState;
|
||||
|
||||
CompactionServiceJobStatus
|
||||
CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
||||
SubcompactionState* sub_compact) {
|
||||
assert(sub_compact);
|
||||
assert(sub_compact->compaction);
|
||||
assert(db_options_.compaction_service);
|
||||
|
||||
const Compaction* compaction = sub_compact->compaction;
|
||||
CompactionServiceInput compaction_input;
|
||||
compaction_input.output_level = compaction->output_level();
|
||||
compaction_input.db_id = db_id_;
|
||||
|
||||
const std::vector<CompactionInputFiles>& inputs =
|
||||
*(compact_->compaction->inputs());
|
||||
for (const auto& files_per_level : inputs) {
|
||||
for (const auto& file : files_per_level.files) {
|
||||
compaction_input.input_files.emplace_back(
|
||||
MakeTableFileName(file->fd.GetNumber()));
|
||||
}
|
||||
}
|
||||
compaction_input.column_family.name =
|
||||
compaction->column_family_data()->GetName();
|
||||
compaction_input.column_family.options =
|
||||
compaction->column_family_data()->GetLatestCFOptions();
|
||||
compaction_input.db_options =
|
||||
BuildDBOptions(db_options_, mutable_db_options_copy_);
|
||||
compaction_input.snapshots = existing_snapshots_;
|
||||
compaction_input.has_begin = sub_compact->start;
|
||||
compaction_input.begin =
|
||||
compaction_input.has_begin ? sub_compact->start->ToString() : "";
|
||||
compaction_input.has_end = sub_compact->end;
|
||||
compaction_input.end =
|
||||
compaction_input.has_end ? sub_compact->end->ToString() : "";
|
||||
|
||||
std::string compaction_input_binary;
|
||||
Status s = compaction_input.Write(&compaction_input_binary);
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
|
||||
std::ostringstream input_files_oss;
|
||||
bool is_first_one = true;
|
||||
for (const auto& file : compaction_input.input_files) {
|
||||
input_files_oss << (is_first_one ? "" : ", ") << file;
|
||||
is_first_one = false;
|
||||
}
|
||||
|
||||
ROCKS_LOG_INFO(
|
||||
db_options_.info_log,
|
||||
"[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
|
||||
compaction_input.column_family.name.c_str(), job_id_,
|
||||
compaction_input.output_level, input_files_oss.str().c_str());
|
||||
CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
|
||||
GetCompactionId(sub_compact), thread_pri_);
|
||||
CompactionServiceJobStatus compaction_status =
|
||||
db_options_.compaction_service->StartV2(info, compaction_input_binary);
|
||||
switch (compaction_status) {
|
||||
case CompactionServiceJobStatus::kSuccess:
|
||||
break;
|
||||
case CompactionServiceJobStatus::kFailure:
|
||||
sub_compact->status = Status::Incomplete(
|
||||
"CompactionService failed to start compaction job.");
|
||||
ROCKS_LOG_WARN(db_options_.info_log,
|
||||
"[%s] [JOB %d] Remote compaction failed to start.",
|
||||
compaction_input.column_family.name.c_str(), job_id_);
|
||||
return compaction_status;
|
||||
case CompactionServiceJobStatus::kUseLocal:
|
||||
ROCKS_LOG_INFO(
|
||||
db_options_.info_log,
|
||||
"[%s] [JOB %d] Remote compaction fallback to local by API Start.",
|
||||
compaction_input.column_family.name.c_str(), job_id_);
|
||||
return compaction_status;
|
||||
default:
|
||||
assert(false); // unknown status
|
||||
break;
|
||||
}
|
||||
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"[%s] [JOB %d] Waiting for remote compaction...",
|
||||
compaction_input.column_family.name.c_str(), job_id_);
|
||||
std::string compaction_result_binary;
|
||||
compaction_status = db_options_.compaction_service->WaitForCompleteV2(
|
||||
info, &compaction_result_binary);
|
||||
|
||||
if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"[%s] [JOB %d] Remote compaction fallback to local by API "
|
||||
"WaitForComplete.",
|
||||
compaction_input.column_family.name.c_str(), job_id_);
|
||||
return compaction_status;
|
||||
}
|
||||
|
||||
CompactionServiceResult compaction_result;
|
||||
s = CompactionServiceResult::Read(compaction_result_binary,
|
||||
&compaction_result);
|
||||
|
||||
if (compaction_status == CompactionServiceJobStatus::kFailure) {
|
||||
if (s.ok()) {
|
||||
if (compaction_result.status.ok()) {
|
||||
sub_compact->status = Status::Incomplete(
|
||||
"CompactionService failed to run the compaction job (even though "
|
||||
"the internal status is okay).");
|
||||
} else {
|
||||
// set the current sub compaction status with the status returned from
|
||||
// remote
|
||||
sub_compact->status = compaction_result.status;
|
||||
}
|
||||
} else {
|
||||
sub_compact->status = Status::Incomplete(
|
||||
"CompactionService failed to run the compaction job (and no valid "
|
||||
"result is returned).");
|
||||
compaction_result.status.PermitUncheckedError();
|
||||
}
|
||||
ROCKS_LOG_WARN(db_options_.info_log,
|
||||
"[%s] [JOB %d] Remote compaction failed.",
|
||||
compaction_input.column_family.name.c_str(), job_id_);
|
||||
return compaction_status;
|
||||
}
|
||||
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
compaction_result.status.PermitUncheckedError();
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
sub_compact->status = compaction_result.status;
|
||||
|
||||
std::ostringstream output_files_oss;
|
||||
is_first_one = true;
|
||||
for (const auto& file : compaction_result.output_files) {
|
||||
output_files_oss << (is_first_one ? "" : ", ") << file.file_name;
|
||||
is_first_one = false;
|
||||
}
|
||||
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"[%s] [JOB %d] Receive remote compaction result, output path: "
|
||||
"%s, files: %s",
|
||||
compaction_input.column_family.name.c_str(), job_id_,
|
||||
compaction_result.output_path.c_str(),
|
||||
output_files_oss.str().c_str());
|
||||
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
|
||||
for (const auto& file : compaction_result.output_files) {
|
||||
uint64_t file_num = versions_->NewFileNumber();
|
||||
auto src_file = compaction_result.output_path + "/" + file.file_name;
|
||||
auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths,
|
||||
file_num, compaction->output_path_id());
|
||||
s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
|
||||
FileMetaData meta;
|
||||
uint64_t file_size;
|
||||
s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
|
||||
file.smallest_seqno, file.largest_seqno);
|
||||
meta.smallest.DecodeFrom(file.smallest_internal_key);
|
||||
meta.largest.DecodeFrom(file.largest_internal_key);
|
||||
meta.oldest_ancester_time = file.oldest_ancester_time;
|
||||
meta.file_creation_time = file.file_creation_time;
|
||||
meta.marked_for_compaction = file.marked_for_compaction;
|
||||
meta.unique_id = file.unique_id;
|
||||
|
||||
auto cfd = compaction->column_family_data();
|
||||
sub_compact->Current().AddOutput(std::move(meta),
|
||||
cfd->internal_comparator(), false, false,
|
||||
true, file.paranoid_hash);
|
||||
}
|
||||
sub_compact->compaction_job_stats = compaction_result.stats;
|
||||
sub_compact->Current().SetNumOutputRecords(
|
||||
compaction_result.num_output_records);
|
||||
sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
|
||||
RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
|
||||
RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
|
||||
compaction_result.bytes_written);
|
||||
return CompactionServiceJobStatus::kSuccess;
|
||||
}
|
||||
|
||||
std::string CompactionServiceCompactionJob::GetTableFileName(
|
||||
uint64_t file_number) {
|
||||
return MakeTableFileName(output_path_, file_number);
|
||||
}
|
||||
|
||||
void CompactionServiceCompactionJob::RecordCompactionIOStats() {
|
||||
compaction_result_->bytes_read += IOSTATS(bytes_read);
|
||||
compaction_result_->bytes_written += IOSTATS(bytes_written);
|
||||
CompactionJob::RecordCompactionIOStats();
|
||||
}
|
||||
|
||||
CompactionServiceCompactionJob::CompactionServiceCompactionJob(
|
||||
int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
|
||||
const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
|
||||
VersionSet* versions, const std::atomic<bool>* shutting_down,
|
||||
LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
|
||||
InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
|
||||
std::vector<SequenceNumber> existing_snapshots,
|
||||
std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
|
||||
const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const std::atomic<bool>& manual_compaction_canceled,
|
||||
const std::string& db_id, const std::string& db_session_id,
|
||||
std::string output_path,
|
||||
const CompactionServiceInput& compaction_service_input,
|
||||
CompactionServiceResult* compaction_service_result)
|
||||
: CompactionJob(
|
||||
job_id, compaction, db_options, mutable_db_options, file_options,
|
||||
versions, shutting_down, log_buffer, nullptr, output_directory,
|
||||
nullptr, stats, db_mutex, db_error_handler,
|
||||
std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
|
||||
std::move(table_cache), event_logger,
|
||||
compaction->mutable_cf_options()->paranoid_file_checks,
|
||||
compaction->mutable_cf_options()->report_bg_io_stats, dbname,
|
||||
&(compaction_service_result->stats), Env::Priority::USER, io_tracer,
|
||||
manual_compaction_canceled, db_id, db_session_id,
|
||||
compaction->column_family_data()->GetFullHistoryTsLow()),
|
||||
output_path_(std::move(output_path)),
|
||||
compaction_input_(compaction_service_input),
|
||||
compaction_result_(compaction_service_result) {}
|
||||
|
||||
Status CompactionServiceCompactionJob::Run() {
|
||||
AutoThreadOperationStageUpdater stage_updater(
|
||||
ThreadStatus::STAGE_COMPACTION_RUN);
|
||||
|
||||
auto* c = compact_->compaction;
|
||||
assert(c->column_family_data() != nullptr);
|
||||
assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
|
||||
compact_->compaction->level()) > 0);
|
||||
|
||||
write_hint_ =
|
||||
c->column_family_data()->CalculateSSTWriteHint(c->output_level());
|
||||
bottommost_level_ = c->bottommost_level();
|
||||
|
||||
Slice begin = compaction_input_.begin;
|
||||
Slice end = compaction_input_.end;
|
||||
compact_->sub_compact_states.emplace_back(
|
||||
c, compaction_input_.has_begin ? &begin : nullptr,
|
||||
compaction_input_.has_end ? &end : nullptr, /*sub_job_id*/ 0);
|
||||
|
||||
log_buffer_->FlushBufferToLog();
|
||||
LogCompaction();
|
||||
const uint64_t start_micros = db_options_.clock->NowMicros();
|
||||
// Pick the only sub-compaction we should have
|
||||
assert(compact_->sub_compact_states.size() == 1);
|
||||
SubcompactionState* sub_compact = compact_->sub_compact_states.data();
|
||||
|
||||
ProcessKeyValueCompaction(sub_compact);
|
||||
|
||||
compaction_stats_.stats.micros =
|
||||
db_options_.clock->NowMicros() - start_micros;
|
||||
compaction_stats_.stats.cpu_micros =
|
||||
sub_compact->compaction_job_stats.cpu_micros;
|
||||
|
||||
RecordTimeToHistogram(stats_, COMPACTION_TIME,
|
||||
compaction_stats_.stats.micros);
|
||||
RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
|
||||
compaction_stats_.stats.cpu_micros);
|
||||
|
||||
Status status = sub_compact->status;
|
||||
IOStatus io_s = sub_compact->io_status;
|
||||
|
||||
if (io_status_.ok()) {
|
||||
io_status_ = io_s;
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
constexpr IODebugContext* dbg = nullptr;
|
||||
|
||||
if (output_directory_) {
|
||||
io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
|
||||
DirFsyncOptions());
|
||||
}
|
||||
}
|
||||
if (io_status_.ok()) {
|
||||
io_status_ = io_s;
|
||||
}
|
||||
if (status.ok()) {
|
||||
status = io_s;
|
||||
}
|
||||
if (status.ok()) {
|
||||
// TODO: Add verify_table()
|
||||
}
|
||||
|
||||
// Finish up all book-keeping to unify the subcompaction results
|
||||
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
|
||||
UpdateCompactionStats();
|
||||
RecordCompactionIOStats();
|
||||
|
||||
LogFlush(db_options_.info_log);
|
||||
compact_->status = status;
|
||||
compact_->status.PermitUncheckedError();
|
||||
|
||||
// Build compaction result
|
||||
compaction_result_->output_level = compact_->compaction->output_level();
|
||||
compaction_result_->output_path = output_path_;
|
||||
for (const auto& output_file : sub_compact->GetOutputs()) {
|
||||
auto& meta = output_file.meta;
|
||||
compaction_result_->output_files.emplace_back(
|
||||
MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
|
||||
meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
|
||||
meta.largest.Encode().ToString(), meta.oldest_ancester_time,
|
||||
meta.file_creation_time, output_file.validator.GetHash(),
|
||||
meta.marked_for_compaction, meta.unique_id);
|
||||
}
|
||||
InternalStats::CompactionStatsFull compaction_stats;
|
||||
sub_compact->AggregateCompactionStats(compaction_stats);
|
||||
compaction_result_->num_output_records =
|
||||
compaction_stats.stats.num_output_records;
|
||||
compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void CompactionServiceCompactionJob::CleanupCompaction() {
|
||||
CompactionJob::CleanupCompaction();
|
||||
}
|
||||
|
||||
// Internal binary format for the input and result data
|
||||
enum BinaryFormatVersion : uint32_t {
|
||||
kOptionsString = 1, // Use string format similar to Option string format
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
|
||||
{"name",
|
||||
{offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"options",
|
||||
{offsetof(struct ColumnFamilyDescriptor, options),
|
||||
OptionType::kConfigurable, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone,
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const std::string& value, void* addr) {
|
||||
auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
|
||||
return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
|
||||
value, cf_options);
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const void* addr, std::string* value) {
|
||||
const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
|
||||
std::string result;
|
||||
auto status =
|
||||
GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
|
||||
*value = "{" + result + "}";
|
||||
return status;
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& name, const void* addr1,
|
||||
const void* addr2, std::string* mismatch) {
|
||||
const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
|
||||
const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
|
||||
auto this_conf = CFOptionsAsConfigurable(*this_one);
|
||||
auto that_conf = CFOptionsAsConfigurable(*that_one);
|
||||
std::string mismatch_opt;
|
||||
bool result =
|
||||
this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
|
||||
if (!result) {
|
||||
*mismatch = name + "." + mismatch_opt;
|
||||
}
|
||||
return result;
|
||||
}}},
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
|
||||
{"column_family",
|
||||
OptionTypeInfo::Struct(
|
||||
"column_family", &cfd_type_info,
|
||||
offsetof(struct CompactionServiceInput, column_family),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
|
||||
{"db_options",
|
||||
{offsetof(struct CompactionServiceInput, db_options),
|
||||
OptionType::kConfigurable, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone,
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const std::string& value, void* addr) {
|
||||
auto options = static_cast<DBOptions*>(addr);
|
||||
return GetDBOptionsFromString(opts, DBOptions(), value, options);
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const void* addr, std::string* value) {
|
||||
const auto options = static_cast<const DBOptions*>(addr);
|
||||
std::string result;
|
||||
auto status = GetStringFromDBOptions(opts, *options, &result);
|
||||
*value = "{" + result + "}";
|
||||
return status;
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& name, const void* addr1,
|
||||
const void* addr2, std::string* mismatch) {
|
||||
const auto this_one = static_cast<const DBOptions*>(addr1);
|
||||
const auto that_one = static_cast<const DBOptions*>(addr2);
|
||||
auto this_conf = DBOptionsAsConfigurable(*this_one);
|
||||
auto that_conf = DBOptionsAsConfigurable(*that_one);
|
||||
std::string mismatch_opt;
|
||||
bool result =
|
||||
this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
|
||||
if (!result) {
|
||||
*mismatch = name + "." + mismatch_opt;
|
||||
}
|
||||
return result;
|
||||
}}},
|
||||
{"snapshots", OptionTypeInfo::Vector<uint64_t>(
|
||||
offsetof(struct CompactionServiceInput, snapshots),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
|
||||
{0, OptionType::kUInt64T})},
|
||||
{"input_files", OptionTypeInfo::Vector<std::string>(
|
||||
offsetof(struct CompactionServiceInput, input_files),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
|
||||
{0, OptionType::kEncodedString})},
|
||||
{"output_level",
|
||||
{offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"db_id",
|
||||
{offsetof(struct CompactionServiceInput, db_id),
|
||||
OptionType::kEncodedString}},
|
||||
{"has_begin",
|
||||
{offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"begin",
|
||||
{offsetof(struct CompactionServiceInput, begin),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"has_end",
|
||||
{offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"end",
|
||||
{offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo>
|
||||
cs_output_file_type_info = {
|
||||
{"file_name",
|
||||
{offsetof(struct CompactionServiceOutputFile, file_name),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"smallest_seqno",
|
||||
{offsetof(struct CompactionServiceOutputFile, smallest_seqno),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"largest_seqno",
|
||||
{offsetof(struct CompactionServiceOutputFile, largest_seqno),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"smallest_internal_key",
|
||||
{offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"largest_internal_key",
|
||||
{offsetof(struct CompactionServiceOutputFile, largest_internal_key),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"oldest_ancester_time",
|
||||
{offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"file_creation_time",
|
||||
{offsetof(struct CompactionServiceOutputFile, file_creation_time),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"paranoid_hash",
|
||||
{offsetof(struct CompactionServiceOutputFile, paranoid_hash),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"marked_for_compaction",
|
||||
{offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"unique_id",
|
||||
OptionTypeInfo::Array<uint64_t, 2>(
|
||||
offsetof(struct CompactionServiceOutputFile, unique_id),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
|
||||
{0, OptionType::kUInt64T})},
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo>
|
||||
compaction_job_stats_type_info = {
|
||||
{"elapsed_micros",
|
||||
{offsetof(struct CompactionJobStats, elapsed_micros),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"cpu_micros",
|
||||
{offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"num_input_records",
|
||||
{offsetof(struct CompactionJobStats, num_input_records),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_blobs_read",
|
||||
{offsetof(struct CompactionJobStats, num_blobs_read),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_input_files",
|
||||
{offsetof(struct CompactionJobStats, num_input_files),
|
||||
OptionType::kSizeT, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_input_files_at_output_level",
|
||||
{offsetof(struct CompactionJobStats, num_input_files_at_output_level),
|
||||
OptionType::kSizeT, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_output_records",
|
||||
{offsetof(struct CompactionJobStats, num_output_records),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_output_files",
|
||||
{offsetof(struct CompactionJobStats, num_output_files),
|
||||
OptionType::kSizeT, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_output_files_blob",
|
||||
{offsetof(struct CompactionJobStats, num_output_files_blob),
|
||||
OptionType::kSizeT, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"is_full_compaction",
|
||||
{offsetof(struct CompactionJobStats, is_full_compaction),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"is_manual_compaction",
|
||||
{offsetof(struct CompactionJobStats, is_manual_compaction),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_input_bytes",
|
||||
{offsetof(struct CompactionJobStats, total_input_bytes),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_blob_bytes_read",
|
||||
{offsetof(struct CompactionJobStats, total_blob_bytes_read),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_output_bytes",
|
||||
{offsetof(struct CompactionJobStats, total_output_bytes),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_output_bytes_blob",
|
||||
{offsetof(struct CompactionJobStats, total_output_bytes_blob),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_records_replaced",
|
||||
{offsetof(struct CompactionJobStats, num_records_replaced),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_input_raw_key_bytes",
|
||||
{offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_input_raw_value_bytes",
|
||||
{offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_input_deletion_records",
|
||||
{offsetof(struct CompactionJobStats, num_input_deletion_records),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_expired_deletion_records",
|
||||
{offsetof(struct CompactionJobStats, num_expired_deletion_records),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_corrupt_keys",
|
||||
{offsetof(struct CompactionJobStats, num_corrupt_keys),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"file_write_nanos",
|
||||
{offsetof(struct CompactionJobStats, file_write_nanos),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"file_range_sync_nanos",
|
||||
{offsetof(struct CompactionJobStats, file_range_sync_nanos),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"file_fsync_nanos",
|
||||
{offsetof(struct CompactionJobStats, file_fsync_nanos),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"file_prepare_write_nanos",
|
||||
{offsetof(struct CompactionJobStats, file_prepare_write_nanos),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"smallest_output_key_prefix",
|
||||
{offsetof(struct CompactionJobStats, smallest_output_key_prefix),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"largest_output_key_prefix",
|
||||
{offsetof(struct CompactionJobStats, largest_output_key_prefix),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_single_del_fallthru",
|
||||
{offsetof(struct CompactionJobStats, num_single_del_fallthru),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_single_del_mismatch",
|
||||
{offsetof(struct CompactionJobStats, num_single_del_mismatch),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
};
|
||||
|
||||
namespace {
|
||||
// this is a helper struct to serialize and deserialize class Status, because
|
||||
// Status's members are not public.
|
||||
struct StatusSerializationAdapter {
|
||||
uint8_t code;
|
||||
uint8_t subcode;
|
||||
uint8_t severity;
|
||||
std::string message;
|
||||
|
||||
StatusSerializationAdapter() = default;
|
||||
explicit StatusSerializationAdapter(const Status& s) {
|
||||
code = s.code();
|
||||
subcode = s.subcode();
|
||||
severity = s.severity();
|
||||
auto msg = s.getState();
|
||||
message = msg ? msg : "";
|
||||
}
|
||||
|
||||
Status GetStatus() const {
|
||||
return Status{static_cast<Status::Code>(code),
|
||||
static_cast<Status::SubCode>(subcode),
|
||||
static_cast<Status::Severity>(severity), message};
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo>
|
||||
status_adapter_type_info = {
|
||||
{"code",
|
||||
{offsetof(struct StatusSerializationAdapter, code),
|
||||
OptionType::kUInt8T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"subcode",
|
||||
{offsetof(struct StatusSerializationAdapter, subcode),
|
||||
OptionType::kUInt8T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"severity",
|
||||
{offsetof(struct StatusSerializationAdapter, severity),
|
||||
OptionType::kUInt8T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"message",
|
||||
{offsetof(struct StatusSerializationAdapter, message),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
|
||||
{"status",
|
||||
{offsetof(struct CompactionServiceResult, status),
|
||||
OptionType::kCustomizable, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone,
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const std::string& value, void* addr) {
|
||||
auto status_obj = static_cast<Status*>(addr);
|
||||
StatusSerializationAdapter adapter;
|
||||
Status s = OptionTypeInfo::ParseType(
|
||||
opts, value, status_adapter_type_info, &adapter);
|
||||
*status_obj = adapter.GetStatus();
|
||||
return s;
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const void* addr, std::string* value) {
|
||||
const auto status_obj = static_cast<const Status*>(addr);
|
||||
StatusSerializationAdapter adapter(*status_obj);
|
||||
std::string result;
|
||||
Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
|
||||
&adapter, &result);
|
||||
*value = "{" + result + "}";
|
||||
return s;
|
||||
},
|
||||
[](const ConfigOptions& opts, const std::string& /*name*/,
|
||||
const void* addr1, const void* addr2, std::string* mismatch) {
|
||||
const auto status1 = static_cast<const Status*>(addr1);
|
||||
const auto status2 = static_cast<const Status*>(addr2);
|
||||
|
||||
StatusSerializationAdapter adatper1(*status1);
|
||||
StatusSerializationAdapter adapter2(*status2);
|
||||
return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
|
||||
&adatper1, &adapter2, mismatch);
|
||||
}}},
|
||||
{"output_files",
|
||||
OptionTypeInfo::Vector<CompactionServiceOutputFile>(
|
||||
offsetof(struct CompactionServiceResult, output_files),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
|
||||
OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
|
||||
OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone))},
|
||||
{"output_level",
|
||||
{offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
||||
{"output_path",
|
||||
{offsetof(struct CompactionServiceResult, output_path),
|
||||
OptionType::kEncodedString, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"num_output_records",
|
||||
{offsetof(struct CompactionServiceResult, num_output_records),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"total_bytes",
|
||||
{offsetof(struct CompactionServiceResult, total_bytes),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"bytes_read",
|
||||
{offsetof(struct CompactionServiceResult, bytes_read),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"bytes_written",
|
||||
{offsetof(struct CompactionServiceResult, bytes_written),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"stats", OptionTypeInfo::Struct(
|
||||
"stats", &compaction_job_stats_type_info,
|
||||
offsetof(struct CompactionServiceResult, stats),
|
||||
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
|
||||
};
|
||||
|
||||
Status CompactionServiceInput::Read(const std::string& data_str,
|
||||
CompactionServiceInput* obj) {
|
||||
if (data_str.size() <= sizeof(BinaryFormatVersion)) {
|
||||
return Status::InvalidArgument("Invalid CompactionServiceInput string");
|
||||
}
|
||||
auto format_version = DecodeFixed32(data_str.data());
|
||||
if (format_version == kOptionsString) {
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
cf.ignore_unknown_options = true;
|
||||
return OptionTypeInfo::ParseType(
|
||||
cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
|
||||
obj);
|
||||
} else {
|
||||
return Status::NotSupported(
|
||||
"Compaction Service Input data version not supported: " +
|
||||
std::to_string(format_version));
|
||||
}
|
||||
}
|
||||
|
||||
Status CompactionServiceInput::Write(std::string* output) {
|
||||
char buf[sizeof(BinaryFormatVersion)];
|
||||
EncodeFixed32(buf, kOptionsString);
|
||||
output->append(buf, sizeof(BinaryFormatVersion));
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
|
||||
}
|
||||
|
||||
Status CompactionServiceResult::Read(const std::string& data_str,
|
||||
CompactionServiceResult* obj) {
|
||||
if (data_str.size() <= sizeof(BinaryFormatVersion)) {
|
||||
return Status::InvalidArgument("Invalid CompactionServiceResult string");
|
||||
}
|
||||
auto format_version = DecodeFixed32(data_str.data());
|
||||
if (format_version == kOptionsString) {
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
cf.ignore_unknown_options = true;
|
||||
return OptionTypeInfo::ParseType(
|
||||
cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
|
||||
obj);
|
||||
} else {
|
||||
return Status::NotSupported(
|
||||
"Compaction Service Result data version not supported: " +
|
||||
std::to_string(format_version));
|
||||
}
|
||||
}
|
||||
|
||||
Status CompactionServiceResult::Write(std::string* output) {
|
||||
char buf[sizeof(BinaryFormatVersion)];
|
||||
EncodeFixed32(buf, kOptionsString);
|
||||
output->append(buf, sizeof(BinaryFormatVersion));
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
|
||||
std::string mismatch;
|
||||
return TEST_Equals(other, &mismatch);
|
||||
}
|
||||
|
||||
bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
|
||||
std::string* mismatch) {
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
|
||||
mismatch);
|
||||
}
|
||||
|
||||
bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
|
||||
std::string mismatch;
|
||||
return TEST_Equals(other, &mismatch);
|
||||
}
|
||||
|
||||
bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
|
||||
std::string* mismatch) {
|
||||
ConfigOptions cf;
|
||||
cf.invoke_prepare_options = false;
|
||||
return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
|
||||
mismatch);
|
||||
}
|
||||
#endif // NDEBUG
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
||||
#endif // !ROCKSDB_LITE
|
|
@ -0,0 +1,46 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction/compaction_state.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
Slice CompactionState::SmallestUserKey() {
|
||||
for (const auto& sub_compact_state : sub_compact_states) {
|
||||
Slice smallest = sub_compact_state.SmallestUserKey();
|
||||
if (!smallest.empty()) {
|
||||
return smallest;
|
||||
}
|
||||
}
|
||||
// If there is no finished output, return an empty slice.
|
||||
return Slice{nullptr, 0};
|
||||
}
|
||||
|
||||
Slice CompactionState::LargestUserKey() {
|
||||
for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
|
||||
++it) {
|
||||
Slice largest = it->LargestUserKey();
|
||||
if (!largest.empty()) {
|
||||
return largest;
|
||||
}
|
||||
}
|
||||
// If there is no finished output, return an empty slice.
|
||||
return Slice{nullptr, 0};
|
||||
}
|
||||
|
||||
void CompactionState::AggregateCompactionStats(
|
||||
InternalStats::CompactionStatsFull& compaction_stats,
|
||||
CompactionJobStats& compaction_job_stats) {
|
||||
for (const auto& sc : sub_compact_states) {
|
||||
sc.AggregateCompactionStats(compaction_stats);
|
||||
compaction_job_stats.Add(sc.compaction_job_stats);
|
||||
}
|
||||
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
|
@ -0,0 +1,42 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "db/compaction/compaction.h"
|
||||
#include "db/compaction/subcompaction_state.h"
|
||||
#include "db/internal_stats.h"
|
||||
|
||||
// Data structures used for compaction_job and compaction_service_job which has
|
||||
// the list of sub_compact_states and the aggregated information for the
|
||||
// compaction.
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
// Maintains state for the entire compaction
|
||||
class CompactionState {
|
||||
public:
|
||||
Compaction* const compaction;
|
||||
|
||||
// REQUIRED: subcompaction states are stored in order of increasing key-range
|
||||
std::vector<SubcompactionState> sub_compact_states;
|
||||
Status status;
|
||||
|
||||
void AggregateCompactionStats(
|
||||
InternalStats::CompactionStatsFull& compaction_stats,
|
||||
CompactionJobStats& compaction_job_stats);
|
||||
|
||||
explicit CompactionState(Compaction* c) : compaction(c) {}
|
||||
|
||||
Slice SmallestUserKey();
|
||||
|
||||
Slice LargestUserKey();
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
|
@ -0,0 +1,223 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction/subcompaction_state.h"
|
||||
|
||||
#include "rocksdb/sst_partitioner.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
void SubcompactionState::AggregateCompactionStats(
|
||||
InternalStats::CompactionStatsFull& compaction_stats) const {
|
||||
compaction_stats.stats.Add(compaction_outputs_.stats_);
|
||||
if (HasPenultimateLevelOutputs()) {
|
||||
compaction_stats.has_penultimate_level_output = true;
|
||||
compaction_stats.penultimate_level_stats.Add(
|
||||
penultimate_level_outputs_.stats_);
|
||||
}
|
||||
}
|
||||
|
||||
void SubcompactionState::FillFilesToCutForTtl() {
|
||||
if (compaction->immutable_options()->compaction_style !=
|
||||
CompactionStyle::kCompactionStyleLevel ||
|
||||
compaction->immutable_options()->compaction_pri !=
|
||||
CompactionPri::kMinOverlappingRatio ||
|
||||
compaction->mutable_cf_options()->ttl == 0 ||
|
||||
compaction->num_input_levels() < 2 || compaction->bottommost_level()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// We define new file with the oldest ancestor time to be younger than 1/4
|
||||
// TTL, and an old one to be older than 1/2 TTL time.
|
||||
int64_t temp_current_time;
|
||||
auto get_time_status = compaction->immutable_options()->clock->GetCurrentTime(
|
||||
&temp_current_time);
|
||||
if (!get_time_status.ok()) {
|
||||
return;
|
||||
}
|
||||
auto current_time = static_cast<uint64_t>(temp_current_time);
|
||||
if (current_time < compaction->mutable_cf_options()->ttl) {
|
||||
return;
|
||||
}
|
||||
uint64_t old_age_thres =
|
||||
current_time - compaction->mutable_cf_options()->ttl / 2;
|
||||
|
||||
const std::vector<FileMetaData*>& olevel =
|
||||
*(compaction->inputs(compaction->num_input_levels() - 1));
|
||||
for (FileMetaData* file : olevel) {
|
||||
// Worth filtering out by start and end?
|
||||
uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
|
||||
// We put old files if they are not too small to prevent a flood
|
||||
// of small files.
|
||||
if (oldest_ancester_time < old_age_thres &&
|
||||
file->fd.GetFileSize() >
|
||||
compaction->mutable_cf_options()->target_file_size_base / 2) {
|
||||
files_to_cut_for_ttl_.push_back(file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OutputIterator SubcompactionState::GetOutputs() const {
|
||||
return OutputIterator(penultimate_level_outputs_.outputs_,
|
||||
compaction_outputs_.outputs_);
|
||||
}
|
||||
|
||||
void SubcompactionState::Cleanup(Cache* cache) {
|
||||
penultimate_level_outputs_.Cleanup();
|
||||
compaction_outputs_.Cleanup();
|
||||
|
||||
if (!status.ok()) {
|
||||
for (const auto& out : GetOutputs()) {
|
||||
// If this file was inserted into the table cache then remove
|
||||
// them here because this compaction was not committed.
|
||||
TableCache::Evict(cache, out.meta.fd.GetNumber());
|
||||
}
|
||||
}
|
||||
// TODO: sub_compact.io_status is not checked like status. Not sure if thats
|
||||
// intentional. So ignoring the io_status as of now.
|
||||
io_status.PermitUncheckedError();
|
||||
}
|
||||
|
||||
Slice SubcompactionState::SmallestUserKey() const {
|
||||
if (has_penultimate_level_outputs_) {
|
||||
Slice a = compaction_outputs_.SmallestUserKey();
|
||||
Slice b = penultimate_level_outputs_.SmallestUserKey();
|
||||
if (a.empty()) {
|
||||
return b;
|
||||
}
|
||||
if (b.empty()) {
|
||||
return a;
|
||||
}
|
||||
const Comparator* user_cmp =
|
||||
compaction->column_family_data()->user_comparator();
|
||||
if (user_cmp->Compare(a, b) > 0) {
|
||||
return b;
|
||||
} else {
|
||||
return a;
|
||||
}
|
||||
} else {
|
||||
return compaction_outputs_.SmallestUserKey();
|
||||
}
|
||||
}
|
||||
|
||||
Slice SubcompactionState::LargestUserKey() const {
|
||||
if (has_penultimate_level_outputs_) {
|
||||
Slice a = compaction_outputs_.LargestUserKey();
|
||||
Slice b = penultimate_level_outputs_.LargestUserKey();
|
||||
if (a.empty()) {
|
||||
return b;
|
||||
}
|
||||
if (b.empty()) {
|
||||
return a;
|
||||
}
|
||||
const Comparator* user_cmp =
|
||||
compaction->column_family_data()->user_comparator();
|
||||
if (user_cmp->Compare(a, b) < 0) {
|
||||
return b;
|
||||
} else {
|
||||
return a;
|
||||
}
|
||||
} else {
|
||||
return compaction_outputs_.LargestUserKey();
|
||||
}
|
||||
}
|
||||
|
||||
bool SubcompactionState::ShouldStopBefore(const Slice& internal_key) {
|
||||
uint64_t curr_file_size = Current().GetCurrentOutputFileSize();
|
||||
const InternalKeyComparator* icmp =
|
||||
&compaction->column_family_data()->internal_comparator();
|
||||
|
||||
// Invalid local_output_split_key indicates that we do not need to split
|
||||
if (local_output_split_key_ != nullptr && !is_split_) {
|
||||
// Split occurs when the next key is larger than/equal to the cursor
|
||||
if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
|
||||
is_split_ = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
|
||||
bool grandparant_file_switched = false;
|
||||
// Scan to find the earliest grandparent file that contains key.
|
||||
while (grandparent_index_ < grandparents.size() &&
|
||||
icmp->Compare(internal_key,
|
||||
grandparents[grandparent_index_]->largest.Encode()) >
|
||||
0) {
|
||||
if (seen_key_) {
|
||||
overlapped_bytes_ += grandparents[grandparent_index_]->fd.GetFileSize();
|
||||
grandparant_file_switched = true;
|
||||
}
|
||||
assert(grandparent_index_ + 1 >= grandparents.size() ||
|
||||
icmp->Compare(
|
||||
grandparents[grandparent_index_]->largest.Encode(),
|
||||
grandparents[grandparent_index_ + 1]->smallest.Encode()) <= 0);
|
||||
grandparent_index_++;
|
||||
}
|
||||
seen_key_ = true;
|
||||
|
||||
if (grandparant_file_switched &&
|
||||
overlapped_bytes_ + curr_file_size > compaction->max_compaction_bytes()) {
|
||||
// Too much overlap for current output; start new output
|
||||
overlapped_bytes_ = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!files_to_cut_for_ttl_.empty()) {
|
||||
if (cur_files_to_cut_for_ttl_ != -1) {
|
||||
// Previous key is inside the range of a file
|
||||
if (icmp->Compare(internal_key,
|
||||
files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
|
||||
->largest.Encode()) > 0) {
|
||||
next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
|
||||
cur_files_to_cut_for_ttl_ = -1;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// Look for the key position
|
||||
while (next_files_to_cut_for_ttl_ <
|
||||
static_cast<int>(files_to_cut_for_ttl_.size())) {
|
||||
if (icmp->Compare(internal_key,
|
||||
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
|
||||
->smallest.Encode()) >= 0) {
|
||||
if (icmp->Compare(internal_key,
|
||||
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
|
||||
->largest.Encode()) <= 0) {
|
||||
// With in the current file
|
||||
cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
|
||||
return true;
|
||||
}
|
||||
// Beyond the current file
|
||||
next_files_to_cut_for_ttl_++;
|
||||
} else {
|
||||
// Still fall into the gap
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
Status SubcompactionState::AddToOutput(
|
||||
const CompactionIterator& iter,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func) {
|
||||
// update target output first
|
||||
is_current_penultimate_level_ = iter.output_to_penultimate_level();
|
||||
current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
|
||||
: &compaction_outputs_;
|
||||
if (is_current_penultimate_level_) {
|
||||
has_penultimate_level_outputs_ = true;
|
||||
}
|
||||
|
||||
return Current().AddToOutput(iter, open_file_func, close_file_func);
|
||||
}
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
|
@ -0,0 +1,255 @@
|
|||
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
//
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "db/blob/blob_file_addition.h"
|
||||
#include "db/blob/blob_garbage_meter.h"
|
||||
#include "db/compaction/compaction.h"
|
||||
#include "db/compaction/compaction_iterator.h"
|
||||
#include "db/compaction/compaction_outputs.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/output_validator.h"
|
||||
#include "db/range_del_aggregator.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
// Maintains state and outputs for each sub-compaction
|
||||
// It contains 2 `CompactionOutputs`:
|
||||
// 1. one for the normal output files
|
||||
// 2. another for the penultimate level outputs
|
||||
// a `current` pointer maintains the current output group, when calling
|
||||
// `AddToOutput()`, it checks the output of the current compaction_iterator key
|
||||
// and point `current` to the target output group. By default, it just points to
|
||||
// normal compaction_outputs, if the compaction_iterator key should be placed on
|
||||
// the penultimate level, `current` is changed to point to
|
||||
// `penultimate_level_outputs`.
|
||||
// The later operations uses `Current()` to get the target group.
|
||||
//
|
||||
// +----------+ +-----------------------------+ +---------+
|
||||
// | *current |--------> | compaction_outputs |----->| output |
|
||||
// +----------+ +-----------------------------+ +---------+
|
||||
// | | output |
|
||||
// | +---------+
|
||||
// | | ... |
|
||||
// |
|
||||
// | +-----------------------------+ +---------+
|
||||
// +-------------> | penultimate_level_outputs |----->| output |
|
||||
// +-----------------------------+ +---------+
|
||||
// | ... |
|
||||
|
||||
class SubcompactionState {
|
||||
public:
|
||||
const Compaction* compaction;
|
||||
|
||||
// The boundaries of the key-range this compaction is interested in. No two
|
||||
// sub-compactions may have overlapping key-ranges.
|
||||
// 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
|
||||
const Slice *start, *end;
|
||||
|
||||
// The return status of this sub-compaction
|
||||
Status status;
|
||||
|
||||
// The return IO Status of this sub-compaction
|
||||
IOStatus io_status;
|
||||
|
||||
// Notify on sub-compaction completion only if listener was notified on
|
||||
// sub-compaction begin.
|
||||
bool notify_on_subcompaction_completion = false;
|
||||
|
||||
// compaction job stats for this sub-compaction
|
||||
CompactionJobStats compaction_job_stats;
|
||||
|
||||
// sub-compaction job id, which is used to identify different sub-compaction
|
||||
// within the same compaction job.
|
||||
const uint32_t sub_job_id;
|
||||
|
||||
Slice SmallestUserKey() const;
|
||||
|
||||
Slice LargestUserKey() const;
|
||||
|
||||
// Get all outputs from the subcompaction. For per_key_placement compaction,
|
||||
// it returns both the last level outputs and penultimate level outputs.
|
||||
OutputIterator GetOutputs() const;
|
||||
|
||||
// Assign range dels aggregator, for each range_del, it can only be assigned
|
||||
// to one output level, for per_key_placement, it's going to be the
|
||||
// penultimate level.
|
||||
void AssignRangeDelAggregator(
|
||||
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
|
||||
if (compaction->SupportsPerKeyPlacement()) {
|
||||
penultimate_level_outputs_.AssignRangeDelAggregator(
|
||||
std::move(range_del_agg));
|
||||
} else {
|
||||
compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
|
||||
}
|
||||
}
|
||||
|
||||
void RemoveLastEmptyOutput() {
|
||||
compaction_outputs_.RemoveLastEmptyOutput();
|
||||
penultimate_level_outputs_.RemoveLastEmptyOutput();
|
||||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
void BuildSubcompactionJobInfo(
|
||||
SubcompactionJobInfo& subcompaction_job_info) const {
|
||||
const Compaction* c = compaction;
|
||||
const ColumnFamilyData* cfd = c->column_family_data();
|
||||
|
||||
subcompaction_job_info.cf_id = cfd->GetID();
|
||||
subcompaction_job_info.cf_name = cfd->GetName();
|
||||
subcompaction_job_info.status = status;
|
||||
subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
|
||||
subcompaction_job_info.base_input_level = c->start_level();
|
||||
subcompaction_job_info.output_level = c->output_level();
|
||||
subcompaction_job_info.stats = compaction_job_stats;
|
||||
}
|
||||
#endif // !ROCKSDB_LITE
|
||||
|
||||
SubcompactionState() = delete;
|
||||
SubcompactionState(const SubcompactionState&) = delete;
|
||||
SubcompactionState& operator=(const SubcompactionState&) = delete;
|
||||
|
||||
SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
|
||||
uint32_t _sub_job_id)
|
||||
: compaction(c),
|
||||
start(_start),
|
||||
end(_end),
|
||||
sub_job_id(_sub_job_id),
|
||||
compaction_outputs_(c, /*is_penultimate_level=*/false),
|
||||
penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
|
||||
assert(compaction != nullptr);
|
||||
const InternalKeyComparator* icmp =
|
||||
&compaction->column_family_data()->internal_comparator();
|
||||
const InternalKey* output_split_key = compaction->GetOutputSplitKey();
|
||||
// Invalid output_split_key indicates that we do not need to split
|
||||
if (output_split_key != nullptr) {
|
||||
// We may only split the output when the cursor is in the range. Split
|
||||
if ((end == nullptr || icmp->user_comparator()->Compare(
|
||||
ExtractUserKey(output_split_key->Encode()),
|
||||
ExtractUserKey(*end)) < 0) &&
|
||||
(start == nullptr || icmp->user_comparator()->Compare(
|
||||
ExtractUserKey(output_split_key->Encode()),
|
||||
ExtractUserKey(*start)) > 0)) {
|
||||
local_output_split_key_ = output_split_key;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SubcompactionState(SubcompactionState&& state) noexcept
|
||||
: compaction(state.compaction),
|
||||
start(state.start),
|
||||
end(state.end),
|
||||
status(std::move(state.status)),
|
||||
io_status(std::move(state.io_status)),
|
||||
notify_on_subcompaction_completion(
|
||||
state.notify_on_subcompaction_completion),
|
||||
compaction_job_stats(std::move(state.compaction_job_stats)),
|
||||
sub_job_id(state.sub_job_id),
|
||||
files_to_cut_for_ttl_(std::move(state.files_to_cut_for_ttl_)),
|
||||
cur_files_to_cut_for_ttl_(state.cur_files_to_cut_for_ttl_),
|
||||
next_files_to_cut_for_ttl_(state.next_files_to_cut_for_ttl_),
|
||||
grandparent_index_(state.grandparent_index_),
|
||||
overlapped_bytes_(state.overlapped_bytes_),
|
||||
seen_key_(state.seen_key_),
|
||||
compaction_outputs_(std::move(state.compaction_outputs_)),
|
||||
penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
|
||||
is_current_penultimate_level_(state.is_current_penultimate_level_),
|
||||
has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
|
||||
current_outputs_ = is_current_penultimate_level_
|
||||
? &penultimate_level_outputs_
|
||||
: &compaction_outputs_;
|
||||
}
|
||||
|
||||
bool HasPenultimateLevelOutputs() const {
|
||||
return has_penultimate_level_outputs_ ||
|
||||
penultimate_level_outputs_.HasRangeDel();
|
||||
}
|
||||
|
||||
void FillFilesToCutForTtl();
|
||||
|
||||
// Returns true iff we should stop building the current output
|
||||
// before processing "internal_key".
|
||||
bool ShouldStopBefore(const Slice& internal_key);
|
||||
|
||||
bool IsCurrentPenultimateLevel() const {
|
||||
return is_current_penultimate_level_;
|
||||
}
|
||||
|
||||
// Add all the new files from this compaction to version_edit
|
||||
void AddOutputsEdit(VersionEdit* out_edit) const {
|
||||
for (const auto& file : penultimate_level_outputs_.outputs_) {
|
||||
out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
|
||||
}
|
||||
for (const auto& file : compaction_outputs_.outputs_) {
|
||||
out_edit->AddFile(compaction->output_level(), file.meta);
|
||||
}
|
||||
}
|
||||
|
||||
void Cleanup(Cache* cache);
|
||||
|
||||
void AggregateCompactionStats(
|
||||
InternalStats::CompactionStatsFull& compaction_stats) const;
|
||||
|
||||
CompactionOutputs& Current() const {
|
||||
assert(current_outputs_);
|
||||
return *current_outputs_;
|
||||
}
|
||||
|
||||
// Add compaction_iterator key/value to the `Current` output group.
|
||||
Status AddToOutput(const CompactionIterator& iter,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func);
|
||||
|
||||
// Close all compaction output files, both output_to_penultimate_level outputs
|
||||
// and normal outputs.
|
||||
Status CloseCompactionFiles(const Status& curr_status,
|
||||
const CompactionFileOpenFunc& open_file_func,
|
||||
const CompactionFileCloseFunc& close_file_func) {
|
||||
// Call FinishCompactionOutputFile() even if status is not ok: it needs to
|
||||
// close the output file.
|
||||
Status s = penultimate_level_outputs_.CloseOutput(
|
||||
curr_status, open_file_func, close_file_func);
|
||||
s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
|
||||
return s;
|
||||
}
|
||||
|
||||
private:
|
||||
// Some identified files with old oldest ancester time and the range should be
|
||||
// isolated out so that the output file(s) in that range can be merged down
|
||||
// for TTL and clear the timestamps for the range.
|
||||
std::vector<FileMetaData*> files_to_cut_for_ttl_;
|
||||
int cur_files_to_cut_for_ttl_ = -1;
|
||||
int next_files_to_cut_for_ttl_ = 0;
|
||||
|
||||
// An index that used to speed up ShouldStopBefore().
|
||||
size_t grandparent_index_ = 0;
|
||||
// The number of bytes overlapping between the current output and
|
||||
// grandparent files used in ShouldStopBefore().
|
||||
uint64_t overlapped_bytes_ = 0;
|
||||
// A flag determines whether the key has been seen in ShouldStopBefore()
|
||||
bool seen_key_ = false;
|
||||
|
||||
// A flag determines if this subcompaction has been split by the cursor
|
||||
bool is_split_ = false;
|
||||
|
||||
// We also maintain the output split key for each subcompaction to avoid
|
||||
// repetitive comparison in ShouldStopBefore()
|
||||
const InternalKey* local_output_split_key_ = nullptr;
|
||||
|
||||
// State kept for output being generated
|
||||
CompactionOutputs compaction_outputs_;
|
||||
CompactionOutputs penultimate_level_outputs_;
|
||||
CompactionOutputs* current_outputs_ = &compaction_outputs_;
|
||||
bool is_current_penultimate_level_ = false;
|
||||
bool has_penultimate_level_outputs_ = false;
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
File diff suppressed because it is too large
Load Diff
|
@ -30,10 +30,100 @@ namespace ROCKSDB_NAMESPACE {
|
|||
// SYNC_POINT is not supported in released Windows mode.
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
|
||||
class CompactionStatsCollector : public EventListener {
|
||||
public:
|
||||
CompactionStatsCollector()
|
||||
: compaction_completed_(
|
||||
static_cast<int>(CompactionReason::kNumOfReasons)) {
|
||||
for (auto& v : compaction_completed_) {
|
||||
v.store(0);
|
||||
}
|
||||
}
|
||||
|
||||
~CompactionStatsCollector() override {}
|
||||
|
||||
void OnCompactionCompleted(DB* /* db */,
|
||||
const CompactionJobInfo& info) override {
|
||||
int k = static_cast<int>(info.compaction_reason);
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
assert(k >= 0 && k < num_of_reasons);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
void OnExternalFileIngested(
|
||||
DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
|
||||
int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
|
||||
int k = static_cast<int>(CompactionReason::kFlush);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
int NumberOfCompactions(CompactionReason reason) const {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
int k = static_cast<int>(reason);
|
||||
assert(k >= 0 && k < num_of_reasons);
|
||||
return compaction_completed_.at(k).load();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::atomic<int>> compaction_completed_;
|
||||
};
|
||||
|
||||
class DBCompactionTest : public DBTestBase {
|
||||
public:
|
||||
DBCompactionTest()
|
||||
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
|
||||
|
||||
protected:
|
||||
#ifndef ROCKSDB_LITE
|
||||
uint64_t GetSstSizeHelper(Temperature temperature) {
|
||||
std::string prop;
|
||||
EXPECT_TRUE(dbfull()->GetProperty(
|
||||
DB::Properties::kLiveSstFilesSizeAtTemperature +
|
||||
std::to_string(static_cast<uint8_t>(temperature)),
|
||||
&prop));
|
||||
return static_cast<uint64_t>(std::atoi(prop.c_str()));
|
||||
}
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
/*
|
||||
* Verifies compaction stats of cfd are valid.
|
||||
*
|
||||
* For each level of cfd, its compaction stats are valid if
|
||||
* 1) sum(stat.counts) == stat.count, and
|
||||
* 2) stat.counts[i] == collector.NumberOfCompactions(i)
|
||||
*/
|
||||
void VerifyCompactionStats(ColumnFamilyData& cfd,
|
||||
const CompactionStatsCollector& collector) {
|
||||
#ifndef NDEBUG
|
||||
InternalStats* internal_stats_ptr = cfd.internal_stats();
|
||||
ASSERT_NE(internal_stats_ptr, nullptr);
|
||||
const std::vector<InternalStats::CompactionStats>& comp_stats =
|
||||
internal_stats_ptr->TEST_GetCompactionStats();
|
||||
const int num_of_reasons =
|
||||
static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
std::vector<int> counts(num_of_reasons, 0);
|
||||
// Count the number of compactions caused by each CompactionReason across
|
||||
// all levels.
|
||||
for (const auto& stat : comp_stats) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
counts[i] += stat.counts[i];
|
||||
sum += stat.counts[i];
|
||||
}
|
||||
ASSERT_EQ(sum, stat.count);
|
||||
}
|
||||
// Verify InternalStats bookkeeping matches that of
|
||||
// CompactionStatsCollector, assuming that all compactions complete.
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
|
||||
counts[i]);
|
||||
}
|
||||
#endif /* NDEBUG */
|
||||
}
|
||||
};
|
||||
|
||||
class DBCompactionTestWithParam
|
||||
|
@ -110,47 +200,6 @@ class FlushedFileCollector : public EventListener {
|
|||
std::mutex mutex_;
|
||||
};
|
||||
|
||||
class CompactionStatsCollector : public EventListener {
|
||||
public:
|
||||
CompactionStatsCollector()
|
||||
: compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
|
||||
for (auto& v : compaction_completed_) {
|
||||
v.store(0);
|
||||
}
|
||||
}
|
||||
|
||||
~CompactionStatsCollector() override {}
|
||||
|
||||
void OnCompactionCompleted(DB* /* db */,
|
||||
const CompactionJobInfo& info) override {
|
||||
int k = static_cast<int>(info.compaction_reason);
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
assert(k >= 0 && k < num_of_reasons);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
void OnExternalFileIngested(
|
||||
DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
|
||||
int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
|
||||
int k = static_cast<int>(CompactionReason::kFlush);
|
||||
compaction_completed_[k]++;
|
||||
}
|
||||
|
||||
int NumberOfCompactions(CompactionReason reason) const {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
int k = static_cast<int>(reason);
|
||||
assert(k >= 0 && k < num_of_reasons);
|
||||
return compaction_completed_.at(k).load();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::atomic<int>> compaction_completed_;
|
||||
};
|
||||
|
||||
class SstStatsCollector : public EventListener {
|
||||
public:
|
||||
SstStatsCollector() : num_ssts_creation_started_(0) {}
|
||||
|
@ -247,40 +296,6 @@ void VerifyCompactionResult(
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Verifies compaction stats of cfd are valid.
|
||||
*
|
||||
* For each level of cfd, its compaction stats are valid if
|
||||
* 1) sum(stat.counts) == stat.count, and
|
||||
* 2) stat.counts[i] == collector.NumberOfCompactions(i)
|
||||
*/
|
||||
void VerifyCompactionStats(ColumnFamilyData& cfd,
|
||||
const CompactionStatsCollector& collector) {
|
||||
#ifndef NDEBUG
|
||||
InternalStats* internal_stats_ptr = cfd.internal_stats();
|
||||
ASSERT_NE(internal_stats_ptr, nullptr);
|
||||
const std::vector<InternalStats::CompactionStats>& comp_stats =
|
||||
internal_stats_ptr->TEST_GetCompactionStats();
|
||||
const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
std::vector<int> counts(num_of_reasons, 0);
|
||||
// Count the number of compactions caused by each CompactionReason across
|
||||
// all levels.
|
||||
for (const auto& stat : comp_stats) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
counts[i] += stat.counts[i];
|
||||
sum += stat.counts[i];
|
||||
}
|
||||
ASSERT_EQ(sum, stat.count);
|
||||
}
|
||||
// Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
|
||||
// assuming that all compactions complete.
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
|
||||
}
|
||||
#endif /* NDEBUG */
|
||||
}
|
||||
|
||||
const SstFileMetaData* PickFileRandomly(
|
||||
const ColumnFamilyMetaData& cf_meta,
|
||||
Random* rand,
|
||||
|
|
|
@ -20,8 +20,6 @@
|
|||
#include "rocksdb/system_clock.h"
|
||||
#include "util/hash_containers.h"
|
||||
|
||||
class ColumnFamilyData;
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
template <class Stats>
|
||||
|
@ -140,6 +138,23 @@ class InternalStats {
|
|||
|
||||
InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
|
||||
|
||||
// Per level compaction stats
|
||||
struct CompactionOutputsStats {
|
||||
uint64_t num_output_records = 0;
|
||||
uint64_t bytes_written = 0;
|
||||
uint64_t bytes_written_blob = 0;
|
||||
uint64_t num_output_files = 0;
|
||||
uint64_t num_output_files_blob = 0;
|
||||
|
||||
void Add(const CompactionOutputsStats& stats) {
|
||||
this->num_output_records += stats.num_output_records;
|
||||
this->bytes_written += stats.bytes_written;
|
||||
this->bytes_written_blob += stats.bytes_written_blob;
|
||||
this->num_output_files += stats.num_output_files;
|
||||
this->num_output_files_blob += stats.num_output_files_blob;
|
||||
}
|
||||
};
|
||||
|
||||
// Per level compaction stats. comp_stats_[level] stores the stats for
|
||||
// compactions that produced data for the specified "level".
|
||||
struct CompactionStats {
|
||||
|
@ -184,11 +199,14 @@ class InternalStats {
|
|||
// (num input entries - num output entries) for compaction levels N and N+1
|
||||
uint64_t num_dropped_records;
|
||||
|
||||
// Total output entries from compaction
|
||||
uint64_t num_output_records;
|
||||
|
||||
// Number of compactions done
|
||||
int count;
|
||||
|
||||
// Number of compactions done per CompactionReason
|
||||
int counts[static_cast<int>(CompactionReason::kNumOfReasons)];
|
||||
int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
|
||||
|
||||
explicit CompactionStats()
|
||||
: micros(0),
|
||||
|
@ -205,6 +223,7 @@ class InternalStats {
|
|||
num_output_files_blob(0),
|
||||
num_input_records(0),
|
||||
num_dropped_records(0),
|
||||
num_output_records(0),
|
||||
count(0) {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
|
@ -227,6 +246,7 @@ class InternalStats {
|
|||
num_output_files_blob(0),
|
||||
num_input_records(0),
|
||||
num_dropped_records(0),
|
||||
num_output_records(0),
|
||||
count(c) {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
|
@ -240,7 +260,7 @@ class InternalStats {
|
|||
}
|
||||
}
|
||||
|
||||
explicit CompactionStats(const CompactionStats& c)
|
||||
CompactionStats(const CompactionStats& c)
|
||||
: micros(c.micros),
|
||||
cpu_micros(c.cpu_micros),
|
||||
bytes_read_non_output_levels(c.bytes_read_non_output_levels),
|
||||
|
@ -256,6 +276,7 @@ class InternalStats {
|
|||
num_output_files_blob(c.num_output_files_blob),
|
||||
num_input_records(c.num_input_records),
|
||||
num_dropped_records(c.num_dropped_records),
|
||||
num_output_records(c.num_output_records),
|
||||
count(c.count) {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
|
@ -279,6 +300,7 @@ class InternalStats {
|
|||
num_output_files_blob = c.num_output_files_blob;
|
||||
num_input_records = c.num_input_records;
|
||||
num_dropped_records = c.num_dropped_records;
|
||||
num_output_records = c.num_output_records;
|
||||
count = c.count;
|
||||
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
|
@ -303,6 +325,7 @@ class InternalStats {
|
|||
this->num_output_files_blob = 0;
|
||||
this->num_input_records = 0;
|
||||
this->num_dropped_records = 0;
|
||||
this->num_output_records = 0;
|
||||
this->count = 0;
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
|
@ -327,6 +350,7 @@ class InternalStats {
|
|||
this->num_output_files_blob += c.num_output_files_blob;
|
||||
this->num_input_records += c.num_input_records;
|
||||
this->num_dropped_records += c.num_dropped_records;
|
||||
this->num_output_records += c.num_output_records;
|
||||
this->count += c.count;
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i< num_of_reasons; i++) {
|
||||
|
@ -334,6 +358,15 @@ class InternalStats {
|
|||
}
|
||||
}
|
||||
|
||||
void Add(const CompactionOutputsStats& stats) {
|
||||
this->num_output_files += static_cast<int>(stats.num_output_files);
|
||||
this->num_output_records += stats.num_output_records;
|
||||
this->bytes_written += stats.bytes_written;
|
||||
this->bytes_written_blob += stats.bytes_written_blob;
|
||||
this->num_output_files_blob +=
|
||||
static_cast<int>(stats.num_output_files_blob);
|
||||
}
|
||||
|
||||
void Subtract(const CompactionStats& c) {
|
||||
this->micros -= c.micros;
|
||||
this->cpu_micros -= c.cpu_micros;
|
||||
|
@ -351,12 +384,70 @@ class InternalStats {
|
|||
this->num_output_files_blob -= c.num_output_files_blob;
|
||||
this->num_input_records -= c.num_input_records;
|
||||
this->num_dropped_records -= c.num_dropped_records;
|
||||
this->num_output_records -= c.num_output_records;
|
||||
this->count -= c.count;
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
counts[i] -= c.counts[i];
|
||||
}
|
||||
}
|
||||
|
||||
void ResetCompactionReason(CompactionReason reason) {
|
||||
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
||||
assert(count == 1); // only support update one compaction reason
|
||||
for (int i = 0; i < num_of_reasons; i++) {
|
||||
counts[i] = 0;
|
||||
}
|
||||
int r = static_cast<int>(reason);
|
||||
assert(r >= 0 && r < num_of_reasons);
|
||||
counts[r] = 1;
|
||||
}
|
||||
};
|
||||
|
||||
// Compaction stats, for per_key_placement compaction, it includes 2 levels
|
||||
// stats: the last level and the penultimate level.
|
||||
struct CompactionStatsFull {
|
||||
// the stats for the target primary output level
|
||||
CompactionStats stats;
|
||||
|
||||
// stats for penultimate level output if exist
|
||||
bool has_penultimate_level_output = false;
|
||||
CompactionStats penultimate_level_stats;
|
||||
|
||||
explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
|
||||
|
||||
explicit CompactionStatsFull(CompactionReason reason, int c)
|
||||
: stats(reason, c), penultimate_level_stats(reason, c){};
|
||||
|
||||
uint64_t TotalBytesWritten() const {
|
||||
uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
|
||||
if (has_penultimate_level_output) {
|
||||
bytes_written += penultimate_level_stats.bytes_written +
|
||||
penultimate_level_stats.bytes_written_blob;
|
||||
}
|
||||
return bytes_written;
|
||||
}
|
||||
|
||||
uint64_t DroppedRecords() {
|
||||
uint64_t output_records = stats.num_output_records;
|
||||
if (has_penultimate_level_output) {
|
||||
output_records += penultimate_level_stats.num_output_records;
|
||||
}
|
||||
if (stats.num_input_records > output_records) {
|
||||
return stats.num_input_records - output_records;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void SetMicros(uint64_t val) {
|
||||
stats.micros = val;
|
||||
penultimate_level_stats.micros = val;
|
||||
}
|
||||
|
||||
void AddCpuMicros(uint64_t val) {
|
||||
stats.cpu_micros += val;
|
||||
penultimate_level_stats.cpu_micros += val;
|
||||
}
|
||||
};
|
||||
|
||||
// For use with CacheEntryStatsCollector
|
||||
|
@ -403,6 +494,7 @@ class InternalStats {
|
|||
for (auto& comp_stat : comp_stats_) {
|
||||
comp_stat.Clear();
|
||||
}
|
||||
per_key_placement_comp_stats_.Clear();
|
||||
for (auto& h : file_read_latency_) {
|
||||
h.Clear();
|
||||
}
|
||||
|
@ -419,6 +511,15 @@ class InternalStats {
|
|||
comp_stats_by_pri_[thread_pri].Add(stats);
|
||||
}
|
||||
|
||||
void AddCompactionStats(int level, Env::Priority thread_pri,
|
||||
const CompactionStatsFull& comp_stats_full) {
|
||||
AddCompactionStats(level, thread_pri, comp_stats_full.stats);
|
||||
if (comp_stats_full.has_penultimate_level_output) {
|
||||
per_key_placement_comp_stats_.Add(
|
||||
comp_stats_full.penultimate_level_stats);
|
||||
}
|
||||
}
|
||||
|
||||
void IncBytesMoved(int level, uint64_t amount) {
|
||||
comp_stats_[level].bytes_moved += amount;
|
||||
}
|
||||
|
@ -479,6 +580,10 @@ class InternalStats {
|
|||
return comp_stats_;
|
||||
}
|
||||
|
||||
const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
|
||||
return per_key_placement_comp_stats_;
|
||||
}
|
||||
|
||||
void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
|
||||
|
||||
// Store a mapping from the user-facing DB::Properties string to our
|
||||
|
@ -518,6 +623,7 @@ class InternalStats {
|
|||
// Per-ColumnFamily/level compaction stats
|
||||
std::vector<CompactionStats> comp_stats_;
|
||||
std::vector<CompactionStats> comp_stats_by_pri_;
|
||||
CompactionStats per_key_placement_comp_stats_;
|
||||
std::vector<HistogramImpl> file_read_latency_;
|
||||
HistogramImpl blob_file_read_latency_;
|
||||
|
||||
|
@ -749,6 +855,23 @@ class InternalStats {
|
|||
InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
|
||||
ColumnFamilyData* /*cfd*/) {}
|
||||
|
||||
// Per level compaction stats
|
||||
struct CompactionOutputsStats {
|
||||
uint64_t num_output_records = 0;
|
||||
uint64_t bytes_written = 0;
|
||||
uint64_t bytes_written_blob = 0;
|
||||
uint64_t num_output_files = 0;
|
||||
uint64_t num_output_files_blob = 0;
|
||||
|
||||
void Add(const CompactionOutputsStats& stats) {
|
||||
this->num_output_records += stats.num_output_records;
|
||||
this->bytes_written += stats.bytes_written;
|
||||
this->bytes_written_blob += stats.bytes_written_blob;
|
||||
this->num_output_files += stats.num_output_files;
|
||||
this->num_output_files_blob += stats.num_output_files_blob;
|
||||
}
|
||||
};
|
||||
|
||||
struct CompactionStats {
|
||||
uint64_t micros;
|
||||
uint64_t cpu_micros;
|
||||
|
@ -764,6 +887,7 @@ class InternalStats {
|
|||
int num_output_files_blob;
|
||||
uint64_t num_input_records;
|
||||
uint64_t num_dropped_records;
|
||||
uint64_t num_output_records;
|
||||
int count;
|
||||
|
||||
explicit CompactionStats() {}
|
||||
|
@ -774,12 +898,38 @@ class InternalStats {
|
|||
|
||||
void Add(const CompactionStats& /*c*/) {}
|
||||
|
||||
void Add(const CompactionOutputsStats& /*c*/) {}
|
||||
|
||||
void Subtract(const CompactionStats& /*c*/) {}
|
||||
};
|
||||
|
||||
struct CompactionStatsFull {
|
||||
// the stats for the target primary output level (per level stats)
|
||||
CompactionStats stats;
|
||||
|
||||
// stats for output_to_penultimate_level level (per level stats)
|
||||
bool has_penultimate_level_output = false;
|
||||
CompactionStats penultimate_level_stats;
|
||||
|
||||
explicit CompactionStatsFull(){};
|
||||
|
||||
explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
|
||||
|
||||
uint64_t TotalBytesWritten() const { return 0; }
|
||||
|
||||
uint64_t DroppedRecords() { return 0; }
|
||||
|
||||
void SetMicros(uint64_t /*val*/){};
|
||||
|
||||
void AddCpuMicros(uint64_t /*val*/){};
|
||||
};
|
||||
|
||||
void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
|
||||
const CompactionStats& /*stats*/) {}
|
||||
|
||||
void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
|
||||
const CompactionStatsFull& /*unmerged_stats*/) {}
|
||||
|
||||
void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
|
||||
|
||||
void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
|
||||
|
|
|
@ -102,5 +102,7 @@ struct CompactionJobStats {
|
|||
|
||||
// number of single-deletes which meet something other than a put
|
||||
uint64_t num_single_del_mismatch;
|
||||
|
||||
// TODO: Add output_to_penultimate_level output information
|
||||
};
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
5
src.mk
5
src.mk
|
@ -33,7 +33,11 @@ LIB_SOURCES = \
|
|||
db/compaction/compaction_picker_fifo.cc \
|
||||
db/compaction/compaction_picker_level.cc \
|
||||
db/compaction/compaction_picker_universal.cc \
|
||||
db/compaction/compaction_service_job.cc \
|
||||
db/compaction/compaction_state.cc \
|
||||
db/compaction/compaction_outputs.cc \
|
||||
db/compaction/sst_partitioner.cc \
|
||||
db/compaction/subcompaction_state.cc \
|
||||
db/convenience.cc \
|
||||
db/db_filesnapshot.cc \
|
||||
db/db_impl/compacted_db_impl.cc \
|
||||
|
@ -433,6 +437,7 @@ TEST_MAIN_SOURCES = \
|
|||
db/compaction/compaction_job_stats_test.cc \
|
||||
db/compaction/compaction_picker_test.cc \
|
||||
db/compaction/compaction_service_test.cc \
|
||||
db/compaction/tiered_compaction_test.cc \
|
||||
db/comparator_db_test.cc \
|
||||
db/corruption_test.cc \
|
||||
db/cuckoo_table_db_test.cc \
|
||||
|
|
Loading…
Reference in New Issue