mirror of https://github.com/facebook/rocksdb.git
Fix range tombstones written to more files than necessary (#4592)
Summary: When there's a gap between files, we do not need to output tombstones starting at the next output file's begin key to the current output file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4592 Differential Revision: D12808627 Pulled By: ajkr fbshipit-source-id: 77c8b2e7523a95b1cd6611194144092c06acb505
This commit is contained in:
parent
806ff34b61
commit
cae540ebef
|
@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
|
||||||
if (lower_bound != nullptr) {
|
if (lower_bound != nullptr) {
|
||||||
it->Seek(*lower_bound);
|
it->Seek(*lower_bound);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool has_overlapping_endpoints;
|
||||||
|
if (upper_bound != nullptr && meta->largest.size() > 0) {
|
||||||
|
has_overlapping_endpoints =
|
||||||
|
ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
|
||||||
|
} else {
|
||||||
|
has_overlapping_endpoints = false;
|
||||||
|
}
|
||||||
for (; it->Valid(); it->Next()) {
|
for (; it->Valid(); it->Next()) {
|
||||||
auto tombstone = it->Tombstone();
|
auto tombstone = it->Tombstone();
|
||||||
if (upper_bound != nullptr &&
|
if (upper_bound != nullptr) {
|
||||||
ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
|
int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
|
||||||
// Tombstones starting after upper_bound only need to be included in the
|
if ((has_overlapping_endpoints && cmp < 0) ||
|
||||||
// next table (if the SSTs overlap, then upper_bound is contained in
|
(!has_overlapping_endpoints && cmp <= 0)) {
|
||||||
// this SST and hence must be covered). Break because subsequent
|
// Tombstones starting after upper_bound only need to be included in
|
||||||
// tombstones will start even later.
|
// the next table. If the current SST ends before upper_bound, i.e.,
|
||||||
break;
|
// `has_overlapping_endpoints == false`, we can also skip over range
|
||||||
|
// tombstones that start exactly at upper_bound. Such range tombstones
|
||||||
|
// will be included in the next file and are not relevant to the point
|
||||||
|
// keys or endpoints of the current file.
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
|
if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
|
||||||
|
|
|
@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
|
||||||
db_->ReleaseSnapshot(snapshot);
|
db_->ReleaseSnapshot(snapshot);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
|
||||||
|
// Adapted from
|
||||||
|
// https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
|
||||||
|
// Regression test for issue where range tombstone was written to more files
|
||||||
|
// than necessary when it began exactly at the begin key in the next
|
||||||
|
// compaction output file.
|
||||||
|
const int kFileBytes = 1 << 20;
|
||||||
|
const int kValueBytes = 4 << 10;
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
options.compression = kNoCompression;
|
||||||
|
options.disable_auto_compactions = true;
|
||||||
|
// Have a bit of slack in the size limits but we enforce them more strictly
|
||||||
|
// when manually flushing/compacting.
|
||||||
|
options.max_compaction_bytes = 2 * kFileBytes;
|
||||||
|
options.target_file_size_base = 2 * kFileBytes;
|
||||||
|
options.write_buffer_size = 2 * kFileBytes;
|
||||||
|
Reopen(options);
|
||||||
|
|
||||||
|
Random rnd(301);
|
||||||
|
for (char first_char : {'a', 'b', 'c'}) {
|
||||||
|
for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
|
||||||
|
std::string key(1, first_char);
|
||||||
|
key.append(Key(i));
|
||||||
|
std::string value = RandomString(&rnd, kValueBytes);
|
||||||
|
ASSERT_OK(Put(key, value));
|
||||||
|
}
|
||||||
|
db_->Flush(FlushOptions());
|
||||||
|
MoveFilesToLevel(2);
|
||||||
|
}
|
||||||
|
ASSERT_EQ(0, NumTableFilesAtLevel(0));
|
||||||
|
ASSERT_EQ(3, NumTableFilesAtLevel(2));
|
||||||
|
|
||||||
|
// Populate the memtable lightly while spanning the whole key-space. The
|
||||||
|
// setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
|
||||||
|
// files to prevent a large L1->L2 compaction later.
|
||||||
|
ASSERT_OK(Put("a", "val"));
|
||||||
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
||||||
|
"c" + Key(1), "d"));
|
||||||
|
// Our compaction output file cutting logic currently only considers point
|
||||||
|
// keys. So, in order for the range tombstone to have a chance at landing at
|
||||||
|
// the start of a new file, we need a point key at the range tombstone's
|
||||||
|
// start.
|
||||||
|
// TODO(ajkr): remove this `Put` after file cutting accounts for range
|
||||||
|
// tombstones (#3977).
|
||||||
|
ASSERT_OK(Put("c" + Key(1), "value"));
|
||||||
|
db_->Flush(FlushOptions());
|
||||||
|
|
||||||
|
// Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
|
||||||
|
// and the range tombstone is only placed in the second SST.
|
||||||
|
std::string begin_key_storage("c" + Key(1));
|
||||||
|
Slice begin_key(begin_key_storage);
|
||||||
|
std::string end_key_storage("d");
|
||||||
|
Slice end_key(end_key_storage);
|
||||||
|
dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
|
||||||
|
&end_key /* end */, nullptr /* column_family */,
|
||||||
|
true /* disallow_trivial_move */);
|
||||||
|
ASSERT_EQ(2, NumTableFilesAtLevel(1));
|
||||||
|
|
||||||
|
std::vector<LiveFileMetaData> all_metadata;
|
||||||
|
std::vector<LiveFileMetaData> l1_metadata;
|
||||||
|
db_->GetLiveFilesMetaData(&all_metadata);
|
||||||
|
for (const auto& metadata : all_metadata) {
|
||||||
|
if (metadata.level == 1) {
|
||||||
|
l1_metadata.push_back(metadata);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::sort(l1_metadata.begin(), l1_metadata.end(),
|
||||||
|
[&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
|
||||||
|
return options.comparator->Compare(a.smallestkey, b.smallestkey) <
|
||||||
|
0;
|
||||||
|
});
|
||||||
|
ASSERT_EQ("a", l1_metadata[0].smallestkey);
|
||||||
|
ASSERT_EQ("a", l1_metadata[0].largestkey);
|
||||||
|
ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
|
||||||
|
ASSERT_EQ("d", l1_metadata[1].largestkey);
|
||||||
|
|
||||||
|
TablePropertiesCollection all_table_props;
|
||||||
|
ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
|
||||||
|
int64_t num_range_deletions = 0;
|
||||||
|
for (const auto& name_and_table_props : all_table_props) {
|
||||||
|
const auto& name = name_and_table_props.first;
|
||||||
|
const auto& table_props = name_and_table_props.second;
|
||||||
|
// The range tombstone should only be output to the second L1 SST.
|
||||||
|
if (name.size() >= l1_metadata[1].name.size() &&
|
||||||
|
name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
|
||||||
|
ASSERT_EQ(1, table_props->num_range_deletions);
|
||||||
|
++num_range_deletions;
|
||||||
|
} else {
|
||||||
|
ASSERT_EQ(0, table_props->num_range_deletions);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ASSERT_EQ(1, num_range_deletions);
|
||||||
|
}
|
||||||
|
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
Loading…
Reference in New Issue