mirror of https://github.com/facebook/rocksdb.git
Fix range tombstones written to more files than necessary (#4592)
Summary: When there's a gap between files, we do not need to output tombstones starting at the next output file's begin key to the current output file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4592 Differential Revision: D12808627 Pulled By: ajkr fbshipit-source-id: 77c8b2e7523a95b1cd6611194144092c06acb505
This commit is contained in:
parent
806ff34b61
commit
cae540ebef
|
@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
|
|||
if (lower_bound != nullptr) {
|
||||
it->Seek(*lower_bound);
|
||||
}
|
||||
|
||||
bool has_overlapping_endpoints;
|
||||
if (upper_bound != nullptr && meta->largest.size() > 0) {
|
||||
has_overlapping_endpoints =
|
||||
ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
|
||||
} else {
|
||||
has_overlapping_endpoints = false;
|
||||
}
|
||||
for (; it->Valid(); it->Next()) {
|
||||
auto tombstone = it->Tombstone();
|
||||
if (upper_bound != nullptr &&
|
||||
ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
|
||||
// Tombstones starting after upper_bound only need to be included in the
|
||||
// next table (if the SSTs overlap, then upper_bound is contained in
|
||||
// this SST and hence must be covered). Break because subsequent
|
||||
// tombstones will start even later.
|
||||
break;
|
||||
if (upper_bound != nullptr) {
|
||||
int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
|
||||
if ((has_overlapping_endpoints && cmp < 0) ||
|
||||
(!has_overlapping_endpoints && cmp <= 0)) {
|
||||
// Tombstones starting after upper_bound only need to be included in
|
||||
// the next table. If the current SST ends before upper_bound, i.e.,
|
||||
// `has_overlapping_endpoints == false`, we can also skip over range
|
||||
// tombstones that start exactly at upper_bound. Such range tombstones
|
||||
// will be included in the next file and are not relevant to the point
|
||||
// keys or endpoints of the current file.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
|
||||
|
|
|
@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
|
|||
db_->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
|
||||
TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
|
||||
// Adapted from
|
||||
// https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
|
||||
// Regression test for issue where range tombstone was written to more files
|
||||
// than necessary when it began exactly at the begin key in the next
|
||||
// compaction output file.
|
||||
const int kFileBytes = 1 << 20;
|
||||
const int kValueBytes = 4 << 10;
|
||||
Options options = CurrentOptions();
|
||||
options.compression = kNoCompression;
|
||||
options.disable_auto_compactions = true;
|
||||
// Have a bit of slack in the size limits but we enforce them more strictly
|
||||
// when manually flushing/compacting.
|
||||
options.max_compaction_bytes = 2 * kFileBytes;
|
||||
options.target_file_size_base = 2 * kFileBytes;
|
||||
options.write_buffer_size = 2 * kFileBytes;
|
||||
Reopen(options);
|
||||
|
||||
Random rnd(301);
|
||||
for (char first_char : {'a', 'b', 'c'}) {
|
||||
for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
|
||||
std::string key(1, first_char);
|
||||
key.append(Key(i));
|
||||
std::string value = RandomString(&rnd, kValueBytes);
|
||||
ASSERT_OK(Put(key, value));
|
||||
}
|
||||
db_->Flush(FlushOptions());
|
||||
MoveFilesToLevel(2);
|
||||
}
|
||||
ASSERT_EQ(0, NumTableFilesAtLevel(0));
|
||||
ASSERT_EQ(3, NumTableFilesAtLevel(2));
|
||||
|
||||
// Populate the memtable lightly while spanning the whole key-space. The
|
||||
// setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
|
||||
// files to prevent a large L1->L2 compaction later.
|
||||
ASSERT_OK(Put("a", "val"));
|
||||
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
||||
"c" + Key(1), "d"));
|
||||
// Our compaction output file cutting logic currently only considers point
|
||||
// keys. So, in order for the range tombstone to have a chance at landing at
|
||||
// the start of a new file, we need a point key at the range tombstone's
|
||||
// start.
|
||||
// TODO(ajkr): remove this `Put` after file cutting accounts for range
|
||||
// tombstones (#3977).
|
||||
ASSERT_OK(Put("c" + Key(1), "value"));
|
||||
db_->Flush(FlushOptions());
|
||||
|
||||
// Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
|
||||
// and the range tombstone is only placed in the second SST.
|
||||
std::string begin_key_storage("c" + Key(1));
|
||||
Slice begin_key(begin_key_storage);
|
||||
std::string end_key_storage("d");
|
||||
Slice end_key(end_key_storage);
|
||||
dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
|
||||
&end_key /* end */, nullptr /* column_family */,
|
||||
true /* disallow_trivial_move */);
|
||||
ASSERT_EQ(2, NumTableFilesAtLevel(1));
|
||||
|
||||
std::vector<LiveFileMetaData> all_metadata;
|
||||
std::vector<LiveFileMetaData> l1_metadata;
|
||||
db_->GetLiveFilesMetaData(&all_metadata);
|
||||
for (const auto& metadata : all_metadata) {
|
||||
if (metadata.level == 1) {
|
||||
l1_metadata.push_back(metadata);
|
||||
}
|
||||
}
|
||||
std::sort(l1_metadata.begin(), l1_metadata.end(),
|
||||
[&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
|
||||
return options.comparator->Compare(a.smallestkey, b.smallestkey) <
|
||||
0;
|
||||
});
|
||||
ASSERT_EQ("a", l1_metadata[0].smallestkey);
|
||||
ASSERT_EQ("a", l1_metadata[0].largestkey);
|
||||
ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
|
||||
ASSERT_EQ("d", l1_metadata[1].largestkey);
|
||||
|
||||
TablePropertiesCollection all_table_props;
|
||||
ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
|
||||
int64_t num_range_deletions = 0;
|
||||
for (const auto& name_and_table_props : all_table_props) {
|
||||
const auto& name = name_and_table_props.first;
|
||||
const auto& table_props = name_and_table_props.second;
|
||||
// The range tombstone should only be output to the second L1 SST.
|
||||
if (name.size() >= l1_metadata[1].name.size() &&
|
||||
name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
|
||||
ASSERT_EQ(1, table_props->num_range_deletions);
|
||||
++num_range_deletions;
|
||||
} else {
|
||||
ASSERT_EQ(0, table_props->num_range_deletions);
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(1, num_range_deletions);
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace rocksdb
|
||||
|
|
Loading…
Reference in New Issue