Fix range tombstones written to more files than necessary (#4592)

Summary:
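When there is a gap between the largest key of the current compaction output file and the begin key of the next output file, range tombstones that start exactly at the next file's begin key do not need to be written to the current output file; they only need to be written to the next one.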
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4592

Differential Revision: D12808627

Pulled By: ajkr

fbshipit-source-id: 77c8b2e7523a95b1cd6611194144092c06acb505
This commit is contained in:
Andrew Kryczka 2018-10-29 19:21:24 -07:00 committed by Facebook Github Bot
parent 806ff34b61
commit cae540ebef
2 changed files with 114 additions and 7 deletions

View File

@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
}
bool has_overlapping_endpoints;
if (upper_bound != nullptr && meta->largest.size() > 0) {
has_overlapping_endpoints =
ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
} else {
has_overlapping_endpoints = false;
}
for (; it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
if (upper_bound != nullptr &&
ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
// Tombstones starting after upper_bound only need to be included in the
// next table (if the SSTs overlap, then upper_bound is contained in
// this SST and hence must be covered). Break because subsequent
// tombstones will start even later.
break;
if (upper_bound != nullptr) {
int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
if ((has_overlapping_endpoints && cmp < 0) ||
(!has_overlapping_endpoints && cmp <= 0)) {
// Tombstones starting after upper_bound only need to be included in
// the next table. If the current SST ends before upper_bound, i.e.,
// `has_overlapping_endpoints == false`, we can also skip over range
// tombstones that start exactly at upper_bound. Such range tombstones
// will be included in the next file and are not relevant to the point
// keys or endpoints of the current file.
break;
}
}
if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {

View File

@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
db_->ReleaseSnapshot(snapshot);
}
// Verifies that a range tombstone which begins exactly at the first key of the
// second L0->L1 compaction output file is written only to that second file and
// not to the first output file as well.
TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
  // Adapted from
  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
  // Regression test for issue where range tombstone was written to more files
  // than necessary when it began exactly at the begin key in the next
  // compaction output file.
  const int kFileBytes = 1 << 20;
  const int kValueBytes = 4 << 10;
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.disable_auto_compactions = true;
  // Have a bit of slack in the size limits but we enforce them more strictly
  // when manually flushing/compacting.
  options.max_compaction_bytes = 2 * kFileBytes;
  options.target_file_size_base = 2 * kFileBytes;
  options.write_buffer_size = 2 * kFileBytes;
  Reopen(options);

  // Build three L2 files, one per leading character ('a', 'b', 'c'), each
  // holding kFileBytes worth of values.
  Random rnd(301);
  for (char first_char : {'a', 'b', 'c'}) {
    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
      std::string key(1, first_char);
      key.append(Key(i));
      std::string value = RandomString(&rnd, kValueBytes);
      ASSERT_OK(Put(key, value));
    }
    // Check the flush status so a failed flush is reported here rather than as
    // a file-count mismatch further down.
    ASSERT_OK(db_->Flush(FlushOptions()));
    MoveFilesToLevel(2);
  }
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(3, NumTableFilesAtLevel(2));

  // Populate the memtable lightly while spanning the whole key-space. The
  // setting of `max_compaction_bytes` will cause the L0->L1 compaction to
  // output multiple files to prevent a large L1->L2 compaction later.
  ASSERT_OK(Put("a", "val"));
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             "c" + Key(1), "d"));
  // Our compaction output file cutting logic currently only considers point
  // keys. So, in order for the range tombstone to have a chance at landing at
  // the start of a new file, we need a point key at the range tombstone's
  // start.
  // TODO(ajkr): remove this `Put` after file cutting accounts for range
  // tombstones (#3977).
  ASSERT_OK(Put("c" + Key(1), "value"));
  ASSERT_OK(db_->Flush(FlushOptions()));

  // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
  // and the range tombstone is only placed in the second SST.
  std::string begin_key_storage("c" + Key(1));
  Slice begin_key(begin_key_storage);
  std::string end_key_storage("d");
  Slice end_key(end_key_storage);
  ASSERT_OK(dbfull()->TEST_CompactRange(
      0 /* level */, &begin_key /* begin */, &end_key /* end */,
      nullptr /* column_family */, true /* disallow_trivial_move */));
  ASSERT_EQ(2, NumTableFilesAtLevel(1));

  // Collect the metadata of the L1 files and order them by smallest key so
  // that index 0 is the first output file and index 1 is the second.
  std::vector<LiveFileMetaData> all_metadata;
  std::vector<LiveFileMetaData> l1_metadata;
  db_->GetLiveFilesMetaData(&all_metadata);
  for (const auto& metadata : all_metadata) {
    if (metadata.level == 1) {
      l1_metadata.push_back(metadata);
    }
  }
  // Guard the fixed-index accesses below.
  ASSERT_EQ(2u, l1_metadata.size());
  std::sort(l1_metadata.begin(), l1_metadata.end(),
            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
              return options.comparator->Compare(a.smallestkey,
                                                 b.smallestkey) < 0;
            });
  ASSERT_EQ("a", l1_metadata[0].smallestkey);
  ASSERT_EQ("a", l1_metadata[0].largestkey);
  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
  ASSERT_EQ("d", l1_metadata[1].largestkey);

  TablePropertiesCollection all_table_props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
  // A table is identified as the second L1 SST when the key of its properties
  // entry ends with that file's metadata name.
  const std::string& second_l1_name = l1_metadata[1].name;
  int64_t num_range_deletions = 0;
  for (const auto& name_and_table_props : all_table_props) {
    const auto& name = name_and_table_props.first;
    const auto& table_props = name_and_table_props.second;
    const bool is_second_l1_file =
        name.size() >= second_l1_name.size() &&
        name.compare(name.size() - second_l1_name.size(),
                     second_l1_name.size(), second_l1_name) == 0;
    // The range tombstone should only be output to the second L1 SST.
    if (is_second_l1_file) {
      ASSERT_EQ(1, table_props->num_range_deletions);
      ++num_range_deletions;
    } else {
      ASSERT_EQ(0, table_props->num_range_deletions);
    }
  }
  ASSERT_EQ(1, num_range_deletions);
}
#endif // ROCKSDB_LITE
} // namespace rocksdb