diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 99e5dd5ac6..e868d785a8 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -115,6 +115,42 @@ void Compaction::GetBoundaryKeys( } } +void Compaction::GetBoundaryInternalKeys( + VersionStorageInfo* vstorage, + const std::vector& inputs, InternalKey* smallest_key, + InternalKey* largest_key, int exclude_level) { + bool initialized = false; + const InternalKeyComparator* icmp = vstorage->InternalComparator(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].files.empty() || inputs[i].level == exclude_level) { + continue; + } + if (inputs[i].level == 0) { + // we need to consider all files on level 0 + for (const auto* f : inputs[i].files) { + if (!initialized || icmp->Compare(f->smallest, *smallest_key) < 0) { + *smallest_key = f->smallest; + } + if (!initialized || icmp->Compare(f->largest, *largest_key) > 0) { + *largest_key = f->largest; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + if (!initialized || + icmp->Compare(inputs[i].files[0]->smallest, *smallest_key) < 0) { + *smallest_key = inputs[i].files[0]->smallest; + } + if (!initialized || + icmp->Compare(inputs[i].files.back()->largest, *largest_key) > 0) { + *largest_key = inputs[i].files.back()->largest; + } + initialized = true; + } + } +} + std::vector Compaction::PopulateWithAtomicBoundaries( VersionStorageInfo* vstorage, std::vector inputs) { const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); @@ -399,9 +435,14 @@ void Compaction::PopulatePenultimateLevelOutputRange() { } } - GetBoundaryKeys(input_vstorage_, inputs_, - &penultimate_level_smallest_user_key_, - &penultimate_level_largest_user_key_, exclude_level); + // FIXME: should make use of `penultimate_output_range_type_`. + // FIXME: when last level's input range does not overlap with + // penultimate level, and penultimate level input is empty, + // this call will not set penultimate_level_smallest_ or + // penultimate_level_largest_. No keys will be compacted up. + GetBoundaryInternalKeys(input_vstorage_, inputs_, + &penultimate_level_smallest_, + &penultimate_level_largest_, exclude_level); } Compaction::~Compaction() { @@ -426,33 +467,39 @@ bool Compaction::OverlapPenultimateLevelOutputRange( if (!SupportsPerKeyPlacement()) { return false; } + + // See FIXME in Compaction::PopulatePenultimateLevelOutputRange(). + // We do not compact any key up in this case. + if (penultimate_level_smallest_.size() == 0 || + penultimate_level_largest_.size() == 0) { + return false; + } + const Comparator* ucmp = input_vstorage_->InternalComparator()->user_comparator(); return ucmp->CompareWithoutTimestamp( - smallest_key, penultimate_level_largest_user_key_) <= 0 && + smallest_key, penultimate_level_largest_.user_key()) <= 0 && ucmp->CompareWithoutTimestamp( - largest_key, penultimate_level_smallest_user_key_) >= 0; + largest_key, penultimate_level_smallest_.user_key()) >= 0; } // key includes timestamp if user-defined timestamp is enabled. -bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const { +bool Compaction::WithinPenultimateLevelOutputRange( + const ParsedInternalKey& ikey) const { if (!SupportsPerKeyPlacement()) { return false; } - if (penultimate_level_smallest_user_key_.empty() || - penultimate_level_largest_user_key_.empty()) { + if (penultimate_level_smallest_.size() == 0 || + penultimate_level_largest_.size() == 0) { return false; } - const Comparator* ucmp = - input_vstorage_->InternalComparator()->user_comparator(); + const InternalKeyComparator* icmp = input_vstorage_->InternalComparator(); - return ucmp->CompareWithoutTimestamp( - key, penultimate_level_smallest_user_key_) >= 0 && - ucmp->CompareWithoutTimestamp( - key, penultimate_level_largest_user_key_) <= 0; + return icmp->Compare(ikey, penultimate_level_smallest_.Encode()) >= 0 && + icmp->Compare(ikey, penultimate_level_largest_.Encode()) <= 0; } bool Compaction::InputCompressionMatchesOutput() const { diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 22ce202593..50c75f70b2 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -353,14 +353,6 @@ class Compaction { Slice GetLargestUserKey() const { return largest_user_key_; } - Slice GetPenultimateLevelSmallestUserKey() const { - return penultimate_level_smallest_user_key_; - } - - Slice GetPenultimateLevelLargestUserKey() const { - return penultimate_level_largest_user_key_; - } - PenultimateOutputRangeType GetPenultimateOutputRangeType() const { return penultimate_output_range_type_; } @@ -383,10 +375,8 @@ class Compaction { // per_key_placement feature, which is safe to place the key to the // penultimate level. different compaction strategy has different rules. // If per_key_placement is not supported, always return false. - // TODO: currently it doesn't support moving data from the last level to the - // penultimate level // key includes timestamp if user-defined timestamp is enabled. - bool WithinPenultimateLevelOutputRange(const Slice& key) const; + bool WithinPenultimateLevelOutputRange(const ParsedInternalKey& ikey) const; CompactionReason compaction_reason() const { return compaction_reason_; } @@ -456,6 +446,13 @@ class Compaction { Slice* smallest_key, Slice* largest_key, int exclude_level = -1); + // get the smallest and largest internal key present in files to be compacted + static void GetBoundaryInternalKeys( + VersionStorageInfo* vstorage, + const std::vector& inputs, + InternalKey* smallest_key, InternalKey* largest_key, + int exclude_level = -1); + // populate penultimate level output range, which will be used to determine if // a key is safe to output to the penultimate level (details see // `Compaction::WithinPenultimateLevelOutputRange()`. @@ -568,8 +565,8 @@ class Compaction { // Key range for penultimate level output // includes timestamp if user-defined timestamp is enabled. // penultimate_output_range_type_ shows the range type - Slice penultimate_level_smallest_user_key_; - Slice penultimate_level_largest_user_key_; + InternalKey penultimate_level_smallest_; + InternalKey penultimate_level_largest_; PenultimateOutputRangeType penultimate_output_range_type_ = PenultimateOutputRangeType::kNotSupported; }; diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index abfa7a692e..85d1c039bd 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -1227,7 +1227,7 @@ void CompactionIterator::DecideOutputLevel() { // not from this compaction. // TODO: add statistic for declined output_to_penultimate_level bool safe_to_penultimate_level = - compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key); + compaction_->WithinPenultimateLevelOutputRange(ikey_); if (!safe_to_penultimate_level) { output_to_penultimate_level_ = false; // It could happen when disable/enable `last_level_temperature` while @@ -1256,10 +1256,13 @@ void CompactionIterator::PrepareOutput() { } else if (ikey_.type == kTypeBlobIndex) { GarbageCollectBlobIfNeeded(); } - } - if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { - DecideOutputLevel(); + // For range del sentinel, we don't use it to cut files for bottommost + // compaction. So it should not make a difference which output level we + // decide. + if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { + DecideOutputLevel(); + } } // Zeroing out the sequence number leads to better compression. diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 15193b5871..1ff9c88692 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -119,7 +119,8 @@ class CompactionIterator { virtual bool SupportsPerKeyPlacement() const = 0; // `key` includes timestamp if user-defined timestamp is enabled. - virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0; + virtual bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey&) const = 0; }; class RealCompaction : public CompactionProxy { @@ -186,8 +187,9 @@ class CompactionIterator { // Check if key is within penultimate level output range, to see if it's // safe to output to the penultimate level for per_key_placement feature. // `key` includes timestamp if user-defined timestamp is enabled. - bool WithinPenultimateLevelOutputRange(const Slice& key) const override { - return compaction_->WithinPenultimateLevelOutputRange(key); + bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey& ikey) const override { + return compaction_->WithinPenultimateLevelOutputRange(ikey); } private: diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 20428a586d..699e629693 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -184,8 +184,9 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { return supports_per_key_placement; } - bool WithinPenultimateLevelOutputRange(const Slice& key) const override { - return (!key.starts_with("unsafe_pb")); + bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey& key) const override { + return (!key.user_key.starts_with("unsafe_pb")); } bool key_not_exists_beyond_output_level = false; diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 257848e46e..99b099759d 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1900,6 +1900,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, sub_compact->start.has_value() ? &tmp_start : nullptr, sub_compact->end.has_value() ? &tmp_end : nullptr); if (oldest_ancester_time == std::numeric_limits::max()) { + // TODO: fix DBSSTTest.GetTotalSstFilesSize and use + // kUnknownOldestAncesterTime oldest_ancester_time = current_time; } diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index c4e985c8b4..a168911100 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -1527,13 +1527,15 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { {files0, files1, files2, files3}, input_levels, /*verify_func=*/[&](Compaction& comp) { for (char c = 'a'; c <= 'z'; c++) { - std::string c_str; - c_str = c; - const Slice key(c_str); if (c == 'a') { - ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key)); + ParsedInternalKey pik("a", 0U, kTypeValue); + ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(pik)); } else { - ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key)); + std::string c_str{c}; + // WithinPenultimateLevelOutputRange checks internal key range. + // 'z' is the last key, so set seqno properly. + ParsedInternalKey pik(c_str, c == 'z' ? 12U : 0U, kTypeValue); + ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(pik)); } } }); diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index 0c6d1aa353..779b980d82 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -1202,15 +1202,120 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - ASSERT_EQ( options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), 1); + + // Tests that we only compact keys up to penultimate level + // that are within penultimate level input's internal key range. + { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + const Snapshot* temp_snap = db_->GetSnapshot(); + // Key(0) and Key(1) here are inserted with higher sequence number + // than Key(0) and Key(1) inserted above. + // Only Key(0) in last level will be compacted up, not Key(1). + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(1), "value" + std::to_string(100))); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + { + std::vector metas; + db_->GetLiveFilesMetaData(&metas); + for (const auto& f : metas) { + if (f.temperature == Temperature::kUnknown) { + // Expect Key(0), Key(0), Key(1) + ASSERT_EQ(f.num_entries, 3); + ASSERT_EQ(f.smallestkey, Key(0)); + ASSERT_EQ(f.largestkey, Key(1)); + } else { + ASSERT_EQ(f.temperature, Temperature::kCold); + // Key(2)-Key(49) and Key(100). + ASSERT_EQ(f.num_entries, 50); + } + } + } + db_->ReleaseSnapshot(temp_snap); } INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest, testing::Bool()); +TEST_P(TieredCompactionTest, CheckInternalKeyRange) { + // When compacting keys from the last level to penultimate level, + // output to penultimate level should be within internal key range + // of input files from penultimate level. + // Set up: + // L5: + // File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6] + // L6: + // File 3: [2@1, 3@2], File 4: [50@3] + // + // When File 1 and File 3 are being compacted, + // Key(3) cannot be compacted up, otherwise it causes + // inconsistency where File 3's Key(3) has a lower sequence number + // than File 2's Key(3). + const int kNumLevels = 7; + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + options.preclude_last_level_data_seconds = 10000; + DestroyAndReopen(options); + auto cmp = options.comparator; + + std::string hot_start = Key(0); + std::string hot_end = Key(0); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + cmp->Compare(context->key, hot_start) >= 0 && + cmp->Compare(context->key, hot_end) < 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + // File 1 + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Put(Key(3), "val3")); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + // File 2 + ASSERT_OK(Put(Key(50), "val50")); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + const Snapshot* snapshot = db_->GetSnapshot(); + hot_end = Key(100); + std::string start = Key(1); + std::string end = Key(3); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // File 3 + ASSERT_OK(Put(Key(3), "vall")); + ASSERT_OK(Put(Key(100), "val100")); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // Try to compact keys up + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + start = Key(1); + end = Key(2); + Slice begin_slice(start); + Slice end_slice(end); + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + // Without internal key range checking, we get the following error: + // Corruption: force_consistency_checks(DEBUG): VersionBuilder: L5 has + // overlapping ranges: file #18 largest key: '6B6579303030303033' seq:102, + // type:1 vs. file #15 smallest key: '6B6579303030303033' seq:104, type:1 + db_->ReleaseSnapshot(snapshot); +} + class PrecludeLastLevelTest : public DBTestBase { public: PrecludeLastLevelTest() diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 95ed405a22..7590aa2f11 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -1538,6 +1538,11 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) { } TEST_F(DBSSTTest, GetTotalSstFilesSize) { + // FIXME: L0 file and L1+ file also differ in size of `oldest_key_time`. + // L0 file has non-zero `oldest_key_time` while L1+ files have 0. + // The test passes since L1+ file uses current time instead of 0 + // as oldest_ancestor_time. + // // We don't propagate oldest-key-time table property on compaction and // just write 0 as default value. This affect the exact table size, since // we encode table properties as varint64. Force time to be 0 to work around