Fix range tombstones written to more files than necessary (#4592)

Summary: When there's a gap between files, we do not need to output tombstones starting at the next output file's begin key to the current output file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4592 Differential Revision: D12808627 Pulled By: ajkr fbshipit-source-id: 77c8b2e7523a95b1cd6611194144092c06acb505
2018-10-29 19:21:24 -07:00 · 2018-10-29 19:21:24 -07:00 · cae540ebef
parent 806ff34b61
commit cae540ebef
2 changed files with 114 additions and 7 deletions
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@ -1209,15 +1209,28 @@ Status CompactionJob::FinishCompactionOutputFile(
    if (lower_bound != nullptr) {
      it->Seek(*lower_bound);
    }
    bool has_overlapping_endpoints;
    if (upper_bound != nullptr && meta->largest.size() > 0) {
      has_overlapping_endpoints =
          ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
    } else {
      has_overlapping_endpoints = false;
    }
    for (; it->Valid(); it->Next()) {
      auto tombstone = it->Tombstone();
-      if (upper_bound != nullptr &&
+      if (upper_bound != nullptr) {
-          ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
+        int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
-        // Tombstones starting after upper_bound only need to be included in the
+        if ((has_overlapping_endpoints && cmp < 0) ||
-        // next table (if the SSTs overlap, then upper_bound is contained in
+            (!has_overlapping_endpoints && cmp <= 0)) {
-        // this SST and hence must be covered). Break because subsequent
+          // Tombstones starting after upper_bound only need to be included in
-        // tombstones will start even later.
+          // the next table. If the current SST ends before upper_bound, i.e.,
-        break;
+          // `has_overlapping_endpoints == false`, we can also skip over range
          // tombstones that start exactly at upper_bound. Such range tombstones
          // will be included in the next file and are not relevant to the point
          // keys or endpoints of the current file.
          break;
        }
      }
      if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@ -1389,6 +1389,100 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
  db_->ReleaseSnapshot(snapshot);
 }
 TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
  // Adapted from
  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
  // Regression test for issue where range tombstone was written to more files
  // than necessary when it began exactly at the begin key in the next
  // compaction output file.
  const int kFileBytes = 1 << 20;
  const int kValueBytes = 4 << 10;
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.disable_auto_compactions = true;
  // Have a bit of slack in the size limits but we enforce them more strictly
  // when manually flushing/compacting.
  options.max_compaction_bytes = 2 * kFileBytes;
  options.target_file_size_base = 2 * kFileBytes;
  options.write_buffer_size = 2 * kFileBytes;
  Reopen(options);
  Random rnd(301);
  for (char first_char : {'a', 'b', 'c'}) {
    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
      std::string key(1, first_char);
      key.append(Key(i));
      std::string value = RandomString(&rnd, kValueBytes);
      ASSERT_OK(Put(key, value));
    }
    db_->Flush(FlushOptions());
    MoveFilesToLevel(2);
  }
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(3, NumTableFilesAtLevel(2));
  // Populate the memtable lightly while spanning the whole key-space. The
  // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
  // files to prevent a large L1->L2 compaction later.
  ASSERT_OK(Put("a", "val"));
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             "c" + Key(1), "d"));
  // Our compaction output file cutting logic currently only considers point
  // keys. So, in order for the range tombstone to have a chance at landing at
  // the start of a new file, we need a point key at the range tombstone's
  // start.
  // TODO(ajkr): remove this `Put` after file cutting accounts for range
  // tombstones (#3977).
  ASSERT_OK(Put("c" + Key(1), "value"));
  db_->Flush(FlushOptions());
  // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
  // and the range tombstone is only placed in the second SST.
  std::string begin_key_storage("c" + Key(1));
  Slice begin_key(begin_key_storage);
  std::string end_key_storage("d");
  Slice end_key(end_key_storage);
  dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
                              &end_key /* end */, nullptr /* column_family */,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(2, NumTableFilesAtLevel(1));
  std::vector<LiveFileMetaData> all_metadata;
  std::vector<LiveFileMetaData> l1_metadata;
  db_->GetLiveFilesMetaData(&all_metadata);
  for (const auto& metadata : all_metadata) {
    if (metadata.level == 1) {
      l1_metadata.push_back(metadata);
    }
  }
  std::sort(l1_metadata.begin(), l1_metadata.end(),
            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
                     0;
            });
  ASSERT_EQ("a", l1_metadata[0].smallestkey);
  ASSERT_EQ("a", l1_metadata[0].largestkey);
  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
  ASSERT_EQ("d", l1_metadata[1].largestkey);
  TablePropertiesCollection all_table_props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
  int64_t num_range_deletions = 0;
  for (const auto& name_and_table_props : all_table_props) {
    const auto& name = name_and_table_props.first;
    const auto& table_props = name_and_table_props.second;
    // The range tombstone should only be output to the second L1 SST.
    if (name.size() >= l1_metadata[1].name.size() &&
        name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
      ASSERT_EQ(1, table_props->num_range_deletions);
      ++num_range_deletions;
    } else {
      ASSERT_EQ(0, table_props->num_range_deletions);
    }
  }
  ASSERT_EQ(1, num_range_deletions);
 }
 #endif  // ROCKSDB_LITE
 }  // namespace rocksdb