diff --git a/db/compaction_job.cc b/db/compaction_job.cc
index a3c5488c3a..d4aa28bad6 100644
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
     if (lower_bound != nullptr) {
       it->Seek(*lower_bound);
     }
+
+    bool has_overlapping_endpoints;
+    if (upper_bound != nullptr && meta->largest.size() > 0) {
+      has_overlapping_endpoints =
+          ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
+    } else {
+      has_overlapping_endpoints = false;
+    }
     for (; it->Valid(); it->Next()) {
       auto tombstone = it->Tombstone();
-      if (upper_bound != nullptr &&
-          ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
-        // Tombstones starting after upper_bound only need to be included in the
-        // next table (if the SSTs overlap, then upper_bound is contained in
-        // this SST and hence must be covered). Break because subsequent
-        // tombstones will start even later.
-        break;
+      if (upper_bound != nullptr) {
+        int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+        if ((has_overlapping_endpoints && cmp < 0) ||
+            (!has_overlapping_endpoints && cmp <= 0)) {
+          // Tombstones starting after upper_bound only need to be included in
+          // the next table. If the current SST ends before upper_bound, i.e.,
+          // `has_overlapping_endpoints == false`, we can also skip over range
+          // tombstones that start exactly at upper_bound. Such range tombstones
+          // will be included in the next file and are not relevant to the point
+          // keys or endpoints of the current file.
+          break;
+        }
       }
       if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 4e97651b78..508149a340 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
   db_->ReleaseSnapshot(snapshot);
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for an issue where a range tombstone was written to more
+  // files than necessary when it began exactly at the begin key of the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Leave a bit of slack in the size limits; we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    db_->Flush(FlushOptions());
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 compaction to
+  // output multiple files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+  ASSERT_OK(Put("c" + Key(1), "value"));
+  db_->Flush(FlushOptions());
+
+  // Ensure the manual L0->L1 compaction cuts the outputs before the range
+  // tombstone and that the tombstone is only placed in the second SST.
+  std::string begin_key_storage("c" + Key(1));
+  Slice begin_key(begin_key_storage);
+  std::string end_key_storage("d");
+  Slice end_key(end_key_storage);
+  dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
+                              &end_key /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> all_metadata;
+  std::vector<LiveFileMetaData> l1_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (const auto& metadata : all_metadata) {
+    if (metadata.level == 1) {
+      l1_metadata.push_back(metadata);
+    }
+  }
+  std::sort(l1_metadata.begin(), l1_metadata.end(),
+            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+                     0;
+            });
+  ASSERT_EQ("a", l1_metadata[0].smallestkey);
+  ASSERT_EQ("a", l1_metadata[0].largestkey);
+  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+  ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+  TablePropertiesCollection all_table_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+  int64_t num_range_deletions = 0;
+  for (const auto& name_and_table_props : all_table_props) {
+    const auto& name = name_and_table_props.first;
+    const auto& table_props = name_and_table_props.second;
+    // The range tombstone should only be output to the second L1 SST.
+    if (name.size() >= l1_metadata[1].name.size() &&
+        name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
+      ASSERT_EQ(1, table_props->num_range_deletions);
+      ++num_range_deletions;
+    } else {
+      ASSERT_EQ(0, table_props->num_range_deletions);
+    }
+  }
+  ASSERT_EQ(1, num_range_deletions);
+}
+
 #endif  // ROCKSDB_LITE
 
 }  // namespace rocksdb
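
For reference, the relaxed cut-off rule added in compaction_job.cc above can be exercised on its own. The snippet below is a minimal standalone sketch, not RocksDB code: StopAddingTombstones is a hypothetical helper, and std::string::compare stands in for ucmp->Compare (a bytewise comparator is assumed).

// Minimal sketch of the new cut-off condition for copying range tombstones
// into a compaction output file. Hypothetical helper; std::string::compare
// stands in for the user comparator (ucmp->Compare).
#include <cassert>
#include <string>

// Returns true once range tombstones no longer need to be added to the
// current output file, which is bounded above by `upper_bound`.
bool StopAddingTombstones(const std::string& tombstone_start,
                          const std::string& upper_bound,
                          bool has_overlapping_endpoints) {
  int cmp = upper_bound.compare(tombstone_start);
  // When the file's largest point key equals upper_bound (overlapping
  // endpoints), a tombstone starting exactly at upper_bound still covers that
  // key and must be kept; otherwise it can be deferred to the next file.
  return (has_overlapping_endpoints && cmp < 0) ||
         (!has_overlapping_endpoints && cmp <= 0);
}

int main() {
  // Tombstone starts exactly at upper_bound: kept only when endpoints overlap.
  assert(!StopAddingTombstones("c", "c", /*has_overlapping_endpoints=*/true));
  assert(StopAddingTombstones("c", "c", /*has_overlapping_endpoints=*/false));
  // Tombstone starts strictly after upper_bound: always deferred.
  assert(StopAddingTombstones("d", "c", true));
  assert(StopAddingTombstones("d", "c", false));
  return 0;
}

The second case is the one the new test exercises: the first L1 output file's largest point key ("a") is strictly below the cut point, so the tombstone that begins at the cut point should be written only to the second file (num_range_deletions == 1 there, 0 elsewhere).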