Check internal key range when compacting from last level to penultimate level (#12063)

Summary:
The test failure in https://github.com/facebook/rocksdb/issues/11909 shows that we may compact keys outside of internal key range of penultimate level input files from last level to penultimate level, which can potentially cause overlapping files in the penultimate level. This PR updates the  `Compaction::WithinPenultimateLevelOutputRange()` to check internal key range instead of user key.

Other fixes:
* skip range del sentinels when deciding output level for tiered compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12063

Test Plan:
- existing unit tests
- apply the fix to https://github.com/facebook/rocksdb/issues/11905 and run `./tiered_compaction_test --gtest_filter="*RangeDelsCauseFileEndpointsToOverlap*"`

Reviewed By: ajkr

Differential Revision: D51288985

Pulled By: cbi42

fbshipit-source-id: 70085db5f5c3b15300bcbc39057d57b83fd9902a
This commit is contained in:
Changyu Bi 2023-11-17 10:50:40 -08:00 committed by Facebook GitHub Bot
parent 2f9ea8193f
commit 4e58cc6437
9 changed files with 206 additions and 42 deletions

View File

@ -115,6 +115,42 @@ void Compaction::GetBoundaryKeys(
}
}
void Compaction::GetBoundaryInternalKeys(
VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs, InternalKey* smallest_key,
InternalKey* largest_key, int exclude_level) {
bool initialized = false;
const InternalKeyComparator* icmp = vstorage->InternalComparator();
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
continue;
}
if (inputs[i].level == 0) {
// we need to consider all files on level 0
for (const auto* f : inputs[i].files) {
if (!initialized || icmp->Compare(f->smallest, *smallest_key) < 0) {
*smallest_key = f->smallest;
}
if (!initialized || icmp->Compare(f->largest, *largest_key) > 0) {
*largest_key = f->largest;
}
initialized = true;
}
} else {
// we only need to consider the first and last file
if (!initialized ||
icmp->Compare(inputs[i].files[0]->smallest, *smallest_key) < 0) {
*smallest_key = inputs[i].files[0]->smallest;
}
if (!initialized ||
icmp->Compare(inputs[i].files.back()->largest, *largest_key) > 0) {
*largest_key = inputs[i].files.back()->largest;
}
initialized = true;
}
}
}
std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
@ -399,9 +435,14 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
}
}
GetBoundaryKeys(input_vstorage_, inputs_,
&penultimate_level_smallest_user_key_,
&penultimate_level_largest_user_key_, exclude_level);
// FIXME: should make use of `penultimate_output_range_type_`.
// FIXME: when last level's input range does not overlap with
// penultimate level, and penultimate level input is empty,
// this call will not set penultimate_level_smallest_ or
// penultimate_level_largest_. No keys will be compacted up.
GetBoundaryInternalKeys(input_vstorage_, inputs_,
&penultimate_level_smallest_,
&penultimate_level_largest_, exclude_level);
}
Compaction::~Compaction() {
@ -426,33 +467,39 @@ bool Compaction::OverlapPenultimateLevelOutputRange(
if (!SupportsPerKeyPlacement()) {
return false;
}
// See FIXME in Compaction::PopulatePenultimateLevelOutputRange().
// We do not compact any key up in this case.
if (penultimate_level_smallest_.size() == 0 ||
penultimate_level_largest_.size() == 0) {
return false;
}
const Comparator* ucmp =
input_vstorage_->InternalComparator()->user_comparator();
return ucmp->CompareWithoutTimestamp(
smallest_key, penultimate_level_largest_user_key_) <= 0 &&
smallest_key, penultimate_level_largest_.user_key()) <= 0 &&
ucmp->CompareWithoutTimestamp(
largest_key, penultimate_level_smallest_user_key_) >= 0;
largest_key, penultimate_level_smallest_.user_key()) >= 0;
}
// key includes timestamp if user-defined timestamp is enabled.
bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
bool Compaction::WithinPenultimateLevelOutputRange(
const ParsedInternalKey& ikey) const {
if (!SupportsPerKeyPlacement()) {
return false;
}
if (penultimate_level_smallest_user_key_.empty() ||
penultimate_level_largest_user_key_.empty()) {
if (penultimate_level_smallest_.size() == 0 ||
penultimate_level_largest_.size() == 0) {
return false;
}
const Comparator* ucmp =
input_vstorage_->InternalComparator()->user_comparator();
const InternalKeyComparator* icmp = input_vstorage_->InternalComparator();
return ucmp->CompareWithoutTimestamp(
key, penultimate_level_smallest_user_key_) >= 0 &&
ucmp->CompareWithoutTimestamp(
key, penultimate_level_largest_user_key_) <= 0;
return icmp->Compare(ikey, penultimate_level_smallest_.Encode()) >= 0 &&
icmp->Compare(ikey, penultimate_level_largest_.Encode()) <= 0;
}
bool Compaction::InputCompressionMatchesOutput() const {

View File

@ -353,14 +353,6 @@ class Compaction {
Slice GetLargestUserKey() const { return largest_user_key_; }
Slice GetPenultimateLevelSmallestUserKey() const {
return penultimate_level_smallest_user_key_;
}
Slice GetPenultimateLevelLargestUserKey() const {
return penultimate_level_largest_user_key_;
}
PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
return penultimate_output_range_type_;
}
@ -383,10 +375,8 @@ class Compaction {
// per_key_placement feature, which is safe to place the key to the
// penultimate level. different compaction strategy has different rules.
// If per_key_placement is not supported, always return false.
// TODO: currently it doesn't support moving data from the last level to the
// penultimate level
// key includes timestamp if user-defined timestamp is enabled.
bool WithinPenultimateLevelOutputRange(const Slice& key) const;
bool WithinPenultimateLevelOutputRange(const ParsedInternalKey& ikey) const;
CompactionReason compaction_reason() const { return compaction_reason_; }
@ -456,6 +446,13 @@ class Compaction {
Slice* smallest_key, Slice* largest_key,
int exclude_level = -1);
// get the smallest and largest internal key present in files to be compacted
static void GetBoundaryInternalKeys(
VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs,
InternalKey* smallest_key, InternalKey* largest_key,
int exclude_level = -1);
// populate penultimate level output range, which will be used to determine if
// a key is safe to output to the penultimate level (details see
// `Compaction::WithinPenultimateLevelOutputRange()`.
@ -568,8 +565,8 @@ class Compaction {
// Key range for penultimate level output
// includes timestamp if user-defined timestamp is enabled.
// penultimate_output_range_type_ shows the range type
Slice penultimate_level_smallest_user_key_;
Slice penultimate_level_largest_user_key_;
InternalKey penultimate_level_smallest_;
InternalKey penultimate_level_largest_;
PenultimateOutputRangeType penultimate_output_range_type_ =
PenultimateOutputRangeType::kNotSupported;
};

View File

@ -1227,7 +1227,7 @@ void CompactionIterator::DecideOutputLevel() {
// not from this compaction.
// TODO: add statistic for declined output_to_penultimate_level
bool safe_to_penultimate_level =
compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
compaction_->WithinPenultimateLevelOutputRange(ikey_);
if (!safe_to_penultimate_level) {
output_to_penultimate_level_ = false;
// It could happen when disable/enable `last_level_temperature` while
@ -1256,10 +1256,13 @@ void CompactionIterator::PrepareOutput() {
} else if (ikey_.type == kTypeBlobIndex) {
GarbageCollectBlobIfNeeded();
}
}
if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
DecideOutputLevel();
// For range del sentinel, we don't use it to cut files for bottommost
// compaction. So it should not make a difference which output level we
// decide.
if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
DecideOutputLevel();
}
}
// Zeroing out the sequence number leads to better compression.

View File

@ -119,7 +119,8 @@ class CompactionIterator {
virtual bool SupportsPerKeyPlacement() const = 0;
// `key` includes timestamp if user-defined timestamp is enabled.
virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
virtual bool WithinPenultimateLevelOutputRange(
const ParsedInternalKey&) const = 0;
};
class RealCompaction : public CompactionProxy {
@ -186,8 +187,9 @@ class CompactionIterator {
// Check if key is within penultimate level output range, to see if it's
// safe to output to the penultimate level for per_key_placement feature.
// `key` includes timestamp if user-defined timestamp is enabled.
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
return compaction_->WithinPenultimateLevelOutputRange(key);
bool WithinPenultimateLevelOutputRange(
const ParsedInternalKey& ikey) const override {
return compaction_->WithinPenultimateLevelOutputRange(ikey);
}
private:

View File

@ -184,8 +184,9 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
return supports_per_key_placement;
}
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
return (!key.starts_with("unsafe_pb"));
bool WithinPenultimateLevelOutputRange(
const ParsedInternalKey& key) const override {
return (!key.user_key.starts_with("unsafe_pb"));
}
bool key_not_exists_beyond_output_level = false;

View File

@ -1900,6 +1900,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
sub_compact->start.has_value() ? &tmp_start : nullptr,
sub_compact->end.has_value() ? &tmp_end : nullptr);
if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
// TODO: fix DBSSTTest.GetTotalSstFilesSize and use
// kUnknownOldestAncesterTime
oldest_ancester_time = current_time;
}

View File

@ -1527,13 +1527,15 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
{files0, files1, files2, files3}, input_levels,
/*verify_func=*/[&](Compaction& comp) {
for (char c = 'a'; c <= 'z'; c++) {
std::string c_str;
c_str = c;
const Slice key(c_str);
if (c == 'a') {
ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
ParsedInternalKey pik("a", 0U, kTypeValue);
ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(pik));
} else {
ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
std::string c_str{c};
// WithinPenultimateLevelOutputRange checks internal key range.
// 'z' is the last key, so set seqno properly.
ParsedInternalKey pik(c_str, c == 'z' ? 12U : 0U, kTypeValue);
ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(pik));
}
}
});

View File

@ -1202,15 +1202,120 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
ASSERT_EQ(
options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
1);
// Tests that we only compact keys up to penultimate level
// that are within penultimate level input's internal key range.
{
MutexLock l(&mutex);
hot_start = Key(0);
hot_end = Key(100);
}
const Snapshot* temp_snap = db_->GetSnapshot();
// Key(0) and Key(1) here are inserted with higher sequence number
// than Key(0) and Key(1) inserted above.
// Only Key(0) in last level will be compacted up, not Key(1).
ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
ASSERT_OK(Put(Key(1), "value" + std::to_string(100)));
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
{
std::vector<LiveFileMetaData> metas;
db_->GetLiveFilesMetaData(&metas);
for (const auto& f : metas) {
if (f.temperature == Temperature::kUnknown) {
// Expect Key(0), Key(0), Key(1)
ASSERT_EQ(f.num_entries, 3);
ASSERT_EQ(f.smallestkey, Key(0));
ASSERT_EQ(f.largestkey, Key(1));
} else {
ASSERT_EQ(f.temperature, Temperature::kCold);
// Key(2)-Key(49) and Key(100).
ASSERT_EQ(f.num_entries, 50);
}
}
}
db_->ReleaseSnapshot(temp_snap);
}
INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest,
testing::Bool());
TEST_P(TieredCompactionTest, CheckInternalKeyRange) {
// When compacting keys from the last level to penultimate level,
// output to penultimate level should be within internal key range
// of input files from penultimate level.
// Set up:
// L5:
// File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6]
// L6:
// File 3: [2@1, 3@2], File 4: [50@3]
//
// When File 1 and File 3 are being compacted,
// Key(3) cannot be compacted up, otherwise it causes
// inconsistency where File 3's Key(3) has a lower sequence number
// than File 2's Key(3).
const int kNumLevels = 7;
auto options = CurrentOptions();
SetColdTemperature(options);
options.level_compaction_dynamic_level_bytes = true;
options.num_levels = kNumLevels;
options.statistics = CreateDBStatistics();
options.max_subcompactions = 10;
options.preclude_last_level_data_seconds = 10000;
DestroyAndReopen(options);
auto cmp = options.comparator;
std::string hot_start = Key(0);
std::string hot_end = Key(0);
SyncPoint::GetInstance()->SetCallBack(
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
auto context = static_cast<PerKeyPlacementContext*>(arg);
context->output_to_penultimate_level =
cmp->Compare(context->key, hot_start) >= 0 &&
cmp->Compare(context->key, hot_end) < 0;
});
SyncPoint::GetInstance()->EnableProcessing();
// File 1
ASSERT_OK(Put(Key(2), "val2"));
ASSERT_OK(Put(Key(3), "val3"));
ASSERT_OK(Flush());
MoveFilesToLevel(6);
// File 2
ASSERT_OK(Put(Key(50), "val50"));
ASSERT_OK(Flush());
MoveFilesToLevel(6);
const Snapshot* snapshot = db_->GetSnapshot();
hot_end = Key(100);
std::string start = Key(1);
std::string end = Key(3);
ASSERT_OK(
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
ASSERT_OK(Flush());
MoveFilesToLevel(5);
// File 3
ASSERT_OK(Put(Key(3), "vall"));
ASSERT_OK(Put(Key(100), "val100"));
ASSERT_OK(Flush());
MoveFilesToLevel(5);
// Try to compact keys up
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
start = Key(1);
end = Key(2);
Slice begin_slice(start);
Slice end_slice(end);
ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
// Without internal key range checking, we get the following error:
// Corruption: force_consistency_checks(DEBUG): VersionBuilder: L5 has
// overlapping ranges: file #18 largest key: '6B6579303030303033' seq:102,
// type:1 vs. file #15 smallest key: '6B6579303030303033' seq:104, type:1
db_->ReleaseSnapshot(snapshot);
}
class PrecludeLastLevelTest : public DBTestBase {
public:
PrecludeLastLevelTest()

View File

@ -1538,6 +1538,11 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
}
TEST_F(DBSSTTest, GetTotalSstFilesSize) {
// FIXME: L0 file and L1+ file also differ in size of `oldest_key_time`.
// L0 file has non-zero `oldest_key_time` while L1+ files have 0.
// The test passes since L1+ file uses current time instead of 0
// as oldest_ancestor_time.
//
// We don't propagate oldest-key-time table property on compaction and
// just write 0 as default value. This affect the exact table size, since
// we encode table properties as varint64. Force time to be 0 to work around