Support tiering when file endpoints overlap (#10961)

Summary:
Enabled output to the penultimate level when file endpoints overlap. This is probably only possible when range tombstones span files; otherwise, the overlapping files would all be included in the penultimate level inputs thanks to our atomic compaction unit logic.
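
For context, a minimal, hypothetical sketch (illustrative path, key names, and option values; only the tiering options mirror the updated test below) of the kind of workload where a range tombstone spans a future file boundary, so two adjacent output files can end up with coinciding user-key endpoints even though no point keys overlap:

```cpp
#include <cassert>
#include <cstdio>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  // Tiering settings similar to the updated test; exact values are arbitrary.
  options.preclude_last_level_data_seconds = 600;
  options.last_level_temperature = rocksdb::Temperature::kCold;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/tiering_overlap_sketch", &db);
  assert(s.ok());

  // Write enough data that compaction will cut its output into several files.
  for (int i = 0; i < 20; i++) {
    char key[16];
    snprintf(key, sizeof(key), "key%06d", i);
    s = db->Put(rocksdb::WriteOptions(), key, std::string(4 << 10, 'v'));
    assert(s.ok());
  }
  // A range tombstone straddling a future file boundary: its fragments can
  // extend one output file's largest user key to the same user key as the
  // next output file's smallest key.
  s = db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                      "key000004", "key000006");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());
  assert(s.ok());

  delete db;
  return 0;
}
```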

Also corrected `penultimate_output_range_type_`, a minor fix since that field appears to be used only for logging.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10961

Test Plan: updated unit test

Reviewed By: cbi42

Differential Revision: D41370615

Pulled By: ajkr

fbshipit-source-id: 7e75ec369a3b41b8382b336446c81825a4c4f572
Authored by Andrew Kryczka on 2022-11-23 09:20:58 -08:00; committed by Facebook GitHub Bot
parent 3d0d6b8140
commit 54c2542df2
2 changed files with 112 additions and 149 deletions


@@ -332,6 +332,7 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
// the case that the penultimate level is empty).
if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
exclude_level = kInvalidLevel;
penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
std::set<uint64_t> penultimate_inputs;
for (const auto& input_lvl : inputs_) {
if (input_lvl.level == penultimate_level_) {
@@ -345,7 +346,8 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
if (penultimate_inputs.find(file->fd.GetNumber()) ==
penultimate_inputs.end()) {
exclude_level = number_levels_ - 1;
penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
penultimate_output_range_type_ =
PenultimateOutputRangeType::kNonLastRange;
break;
}
}
@@ -354,35 +356,6 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
GetBoundaryKeys(input_vstorage_, inputs_,
&penultimate_level_smallest_user_key_,
&penultimate_level_largest_user_key_, exclude_level);
// If there's a case that the penultimate level output range is overlapping
// with the existing files, disable the penultimate level output by setting
// the range to empty. One example is the range delete could have overlap
// boundary with the next file. (which is actually a false overlap)
// TODO: Exclude such false overlap, so it won't disable the penultimate
// output.
std::set<uint64_t> penultimate_inputs;
for (const auto& input_lvl : inputs_) {
if (input_lvl.level == penultimate_level_) {
for (const auto& file : input_lvl.files) {
penultimate_inputs.emplace(file->fd.GetNumber());
}
}
}
auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
for (const auto& file : penultimate_files) {
if (penultimate_inputs.find(file->fd.GetNumber()) ==
penultimate_inputs.end() &&
OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
file->largest.user_key())) {
// basically disable the penultimate range output. which should be rare
// or a false overlap caused by range del
penultimate_level_smallest_user_key_ = "";
penultimate_level_largest_user_key_ = "";
penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
}
}
}
Compaction::~Compaction() {
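
Since the flattened hunks above omit add/remove markers, here is a simplified, self-contained sketch (hypothetical stand-in types, not RocksDB's internal API) of the decision the updated code makes for universal compaction: the range stays `kFullRange` only when every file currently on the penultimate level participates in the compaction; otherwise it becomes `kNonLastRange` and the last level is excluded from the boundary-key computation.

```cpp
#include <cstdint>
#include <set>
#include <vector>

// Hypothetical stand-in for the internal file metadata used above.
struct FileMeta {
  uint64_t file_number;
};

enum class PenultimateOutputRangeType { kFullRange, kNonLastRange, kDisabled };

// Returns kFullRange when every file on the penultimate level is a compaction
// input; otherwise kNonLastRange, signalling that the last level should be
// excluded when computing the penultimate output boundaries.
PenultimateOutputRangeType PickPenultimateOutputRangeType(
    const std::vector<FileMeta>& files_on_penultimate_level,
    const std::vector<FileMeta>& penultimate_level_inputs) {
  std::set<uint64_t> input_numbers;
  for (const auto& f : penultimate_level_inputs) {
    input_numbers.insert(f.file_number);
  }
  for (const auto& f : files_on_penultimate_level) {
    if (input_numbers.find(f.file_number) == input_numbers.end()) {
      return PenultimateOutputRangeType::kNonLastRange;
    }
  }
  return PenultimateOutputRangeType::kFullRange;
}

int main() {
  std::vector<FileMeta> on_level = {{10}, {11}, {12}};
  std::vector<FileMeta> inputs = {{10}, {12}};  // file 11 is not picked
  auto type = PickPenultimateOutputRangeType(on_level, inputs);
  return type == PenultimateOutputRangeType::kNonLastRange ? 0 : 1;
}
```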


@@ -1869,155 +1869,145 @@ TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
Close();
}
struct TestPropertiesCollector : public TablePropertiesCollector {
Status AddUserKey(const Slice& key, const Slice& /*value*/,
EntryType /*type*/, SequenceNumber /*seq*/,
uint64_t /*file_size*/) override {
if (cmp->Compare(key, DBTestBase::Key(100)) == 0) {
has_key_100 = true;
}
if (cmp->Compare(key, DBTestBase::Key(200)) == 0) {
has_key_200 = true;
}
return Status::OK();
}
const char* Name() const override { return "TestTablePropertiesCollector"; }
UserCollectedProperties GetReadableProperties() const override {
UserCollectedProperties ret;
return ret;
}
Status Finish(UserCollectedProperties* /*properties*/) override {
// The LSM tree would be like:
// L5: [0,19] [20,39] [40,299]
// L6: [0, 299]
// the 3rd file @L5 has both 100 and 200, which will be marked for
// compaction
// Also avoid marking flushed SST for compaction, which won't have both 100
// and 200
if (has_key_100 && has_key_200) {
need_compact_ = true;
} else {
need_compact_ = false;
}
has_key_100 = false;
has_key_200 = false;
return Status::OK();
}
bool NeedCompact() const override { return need_compact_; }
const Comparator* cmp = BytewiseComparator();
private:
bool has_key_100 = false;
bool has_key_200 = false;
bool need_compact_ = false;
};
class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
public:
TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context /*context*/) override {
return new TestPropertiesCollector;
}
const char* Name() const override { return "TestTablePropertiesCollector"; }
};
TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) {
const int kNumTrigger = 4;
TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
const int kNumLevels = 7;
const int kKeyPerSec = 10;
const int kSecondsPerKey = 10;
const int kNumFiles = 3;
const int kValueBytes = 4 << 10;
const int kFileBytes = 4 * kValueBytes;
// `kNumKeysPerFile == 5` is determined by the current file cutting heuristics
// for this choice of `kValueBytes` and `kFileBytes`.
const int kNumKeysPerFile = 5;
const int kNumKeys = kNumFiles * kNumKeysPerFile;
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.env = mock_env_.get();
options.level0_file_num_compaction_trigger = kNumTrigger;
options.preserve_internal_time_seconds = 10000;
options.last_level_temperature = Temperature::kCold;
options.preserve_internal_time_seconds = 600;
options.preclude_last_level_data_seconds = 1;
options.num_levels = kNumLevels;
// set a small max_compaction_bytes to avoid input level expansion
options.max_compaction_bytes = 30000;
options.ignore_max_compaction_bytes_for_input = false;
options.target_file_size_base = kFileBytes;
DestroyAndReopen(options);
// pass some time first, otherwise the first a few keys write time are going
// to be zero, and internally zero has special meaning: kUnknownSeqnoTime
dbfull()->TEST_WaitForPeridicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
dbfull()->TEST_WaitForPeridicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kSecondsPerKey));
});
Random rnd(301);
for (int i = 0; i < 300; i++) {
ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
dbfull()->TEST_WaitForPeridicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
}
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// make sure all data is compacted to the last level
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
// Create 3 L5 files
auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
options.sst_partitioner_factory = factory;
// the user defined properties_collector will mark the 3rd file for compaction
auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
options.table_properties_collector_factories.resize(1);
options.table_properties_collector_factories[0] = collector_factory;
// enable tiered storage feature
options.preclude_last_level_data_seconds = 10000;
options.last_level_temperature = Temperature::kCold;
Reopen(options);
for (int i = 0; i < kNumTrigger - 2; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
// Flush an L0 file with the following contents (new to old):
//
// Range deletions [4, 6) [7, 8) [9, 11)
// --- snap2 ---
// Key(0) .. Key(14)
// --- snap1 ---
// Key(3) .. Key(17)
const auto verify_db = [&]() {
for (int i = 0; i < kNumKeys; i++) {
std::string value;
auto s = db_->Get(ReadOptions(), Key(i), &value);
if (i == 4 || i == 5 || i == 7 || i == 9 || i == 10) {
ASSERT_TRUE(s.IsNotFound());
} else {
ASSERT_OK(s);
}
}
ASSERT_OK(Flush());
};
Random rnd(301);
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(i + 3), rnd.RandomString(kValueBytes)));
dbfull()->TEST_WaitForPeridicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); });
}
auto* snap1 = db_->GetSnapshot();
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueBytes)));
dbfull()->TEST_WaitForPeridicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); });
}
auto* snap2 = db_->GetSnapshot();
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(kNumKeysPerFile - 1),
Key(kNumKeysPerFile + 1)));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(kNumKeysPerFile + 2),
Key(kNumKeysPerFile + 3)));
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(2 * kNumKeysPerFile - 1),
Key(2 * kNumKeysPerFile + 1)));
ASSERT_OK(Flush());
dbfull()->TEST_WaitForPeridicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); });
verify_db();
// make sure there is one and only one compaction supports per-key placement
// but has the penultimate level output disabled.
// Count compactions supporting per-key placement
std::atomic_int per_key_comp_num = 0;
SyncPoint::GetInstance()->SetCallBack(
"UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
auto compaction = static_cast<Compaction*>(arg);
if (compaction->SupportsPerKeyPlacement()) {
ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
Compaction::PenultimateOutputRangeType::kDisabled);
Compaction::PenultimateOutputRangeType::kNonLastRange);
per_key_comp_num++;
}
});
SyncPoint::GetInstance()->EnableProcessing();
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
}
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(32), Key(40)));
ASSERT_OK(Flush());
// Before the per-key placement compaction, the LSM tree should be like:
// L5: [0,19] [20,40] [40,299]
// L6: [0, 299]
// The 2nd file @L5 has the largest key 40 because of range del
// The `CompactRange()` writes the following files to L5.
//
// [key000000#16,kTypeValue,
// key000005#kMaxSequenceNumber,kTypeRangeDeletion]
// [key000005#21,kTypeValue,
// key000010#kMaxSequenceNumber,kTypeRangeDeletion]
// [key000010#26,kTypeValue, key000014#30,kTypeValue]
//
// And it writes the following files to L6.
//
// [key000003#1,kTypeValue, key000007#5,kTypeValue]
// [key000008#6,kTypeValue, key000012#10,kTypeValue]
// [key000013#11,kTypeValue, key000017#15,kTypeValue]
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel());
verify_db();
// Rewrite the middle file only. File endpoints should not change.
std::string begin_key_buf = Key(kNumKeysPerFile + 1),
end_key_buf = Key(kNumKeysPerFile + 2);
Slice begin_key(begin_key_buf), end_key(end_key_buf);
ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key,
&end_key));
ASSERT_OK(dbfull()->WaitForCompact(true));
ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel());
ASSERT_EQ(1, per_key_comp_num);
verify_db();
ASSERT_EQ(per_key_comp_num, 1);
// Rewrite the middle file again after releasing snap2. Still file endpoints
// should not change.
db_->ReleaseSnapshot(snap2);
ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key,
&end_key));
ASSERT_OK(dbfull()->WaitForCompact(true));
ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel());
ASSERT_EQ(2, per_key_comp_num);
verify_db();
// the compaction won't move any data to the penultimate level
// Middle file once more after releasing snap1. This time the data in the
// middle L5 file can all be compacted to the last level.
db_->ReleaseSnapshot(snap1);
ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key,
&end_key));
ASSERT_OK(dbfull()->WaitForCompact(true));
ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
ASSERT_EQ(3, per_key_comp_num);
verify_db();
// Finish off the penultimate level.
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_EQ("0,0,0,0,0,0,3", FilesPerLevel());
verify_db();
Close();
}
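
Going beyond the assertions in the updated test, a hedged sketch of one additional check a reader might add to confirm that the surviving files ended up on the cold last level, using the public metadata API (this snippet is not part of the PR and assumes it runs inside the same test fixture, where `db_` and `kNumLevels` are available):

```cpp
// Hypothetical extra check, not part of the PR's test: verify that files on
// the last level carry the cold temperature configured via
// `last_level_temperature`.
ColumnFamilyMetaData meta;
db_->GetColumnFamilyMetaData(&meta);
for (const auto& level_meta : meta.levels) {
  if (level_meta.level == kNumLevels - 1) {
    for (const auto& file_meta : level_meta.files) {
      ASSERT_EQ(file_meta.temperature, Temperature::kCold);
    }
  }
}
```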