From e6b6e741541301cd7905c43d8e301338f8740d43 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 21 Dec 2022 15:41:10 -0800 Subject: [PATCH] Make CompactRange() more aware of SstPartitionerFactory (#11032) Summary: Some users are at least considering using SstPartitioner to support efficient physical migration of specific key ranges between RocksDB instances. One might expect manual `CompactRange()` over a narrow key range across some partition to enforce partitioning of any SST files crossing that partition boundary, but that currently only works if there are keys within that range. This change makes the overlap logic in CompactRange more aware of the partitioner to automatically select relevant files crossing a partition boundary, even when they otherwise would not be selected due to the compaction range falling in a gap between entries. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11032 Test Plan: unit test included Reviewed By: hx235 Differential Revision: D41981380 Pulled By: pdillinger fbshipit-source-id: 2fe445bdddc73c00276c20f295cc1fa33d15b05a --- HISTORY.md | 3 ++ db/db_compaction_test.cc | 64 ++++++++++++++++++++++++++ db/db_impl/db_impl_compaction_flush.cc | 58 +++++++++++++++++++++-- 3 files changed, 122 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index de1d720ad0..45cf403ad6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,9 @@ * Fixed a bug in LockWAL() leading to re-locking mutex (#11020). * Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache. +### New Features +* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction. + ## 7.9.0 (11/21/2022) ### Performance Improvements * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877). diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index ed9a5a7aef..0227c06e41 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -1026,6 +1026,70 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) { ASSERT_EQ("B", Get("bbbb1")); } +TEST_F(DBCompactionTest, CompactionSstPartitionWithManualCompaction) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("000015", "A")); + ASSERT_OK(Put("000025", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // create second file and flush to l0 + ASSERT_OK(Put("000015", "A2")); + ASSERT_OK(Put("000025", "B2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // CONTROL 1: compact without partitioner + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Check (compacted but no partitioning yet) + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(1, files.size()); + + // Install partitioner + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(5)); + options.sst_partitioner_factory = factory; + Reopen(options); + + // CONTROL 2: request compaction on range with no partition boundary and no + // overlap with actual entries + Slice from("000017"); + Slice to("000019"); + ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to)); + + // Check (no partitioning yet) + files.clear(); + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(1, files.size()); + ASSERT_EQ("A2", Get("000015")); + ASSERT_EQ("B2", Get("000025")); + + // TEST: request compaction overlapping with partition boundary but no + // actual entries + // NOTE: `to` is INCLUSIVE + from = Slice("000019"); + to = Slice("000020"); + ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to)); + + // Check (must be partitioned) + files.clear(); + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A2", Get("000015")); + ASSERT_EQ("B2", Get("000025")); +} + TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 31e2d07ce0..095336f643 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1087,6 +1087,22 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, { SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); Version* current_version = super_version->current; + + // Might need to query the partitioner + SstPartitionerFactory* partitioner_factory = + current_version->cfd()->ioptions()->sst_partitioner_factory.get(); + std::unique_ptr partitioner; + if (partitioner_factory && begin != nullptr && end != nullptr) { + SstPartitioner::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = true; + context.output_level = /*unknown*/ -1; + // Small lies about compaction range + context.smallest_user_key = *begin; + context.largest_user_key = *end; + partitioner = partitioner_factory->CreatePartitioner(context); + } + ReadOptions ro; ro.total_order_seek = true; bool overlap; @@ -1094,14 +1110,50 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, level < current_version->storage_info()->num_non_empty_levels(); level++) { overlap = true; + + // Whether to look at specific keys within files for overlap with + // compaction range, other than largest and smallest keys of the file + // known in Version metadata. + bool check_overlap_within_file = false; if (begin != nullptr && end != nullptr) { + // Typically checking overlap within files in this case + check_overlap_within_file = true; + // WART: Not known why we don't check within file in one-sided bound + // cases + if (partitioner) { + // Especially if the partitioner is new, the manual compaction + // might be used to enforce the partitioning. Checking overlap + // within files might miss cases where compaction is needed to + // partition the files, as in this example: + // * File has two keys "001" and "111" + // * Compaction range is ["011", "101") + // * Partition boundary at "100" + // In cases like this, file-level overlap with the compaction + // range is sufficient to force any partitioning that is needed + // within the compaction range. + // + // But if there's no partitioning boundary within the compaction + // range, we can be sure there's no need to fix partitioning + // within that range, thus safe to check overlap within file. + // + // Use a hypothetical trivial move query to check for partition + // boundary in range. (NOTE: in defiance of all conventions, + // `begin` and `end` here are both INCLUSIVE bounds, which makes + // this analogy to CanDoTrivialMove() accurate even when `end` is + // the first key in a partition.) + if (!partitioner->CanDoTrivialMove(*begin, *end)) { + check_overlap_within_file = false; + } + } + } + if (check_overlap_within_file) { Status status = current_version->OverlapWithLevelIterator( ro, file_options_, *begin, *end, level, &overlap); if (!status.ok()) { - overlap = current_version->storage_info()->OverlapInLevel( - level, begin, end); + check_overlap_within_file = false; } - } else { + } + if (!check_overlap_within_file) { overlap = current_version->storage_info()->OverlapInLevel(level, begin, end); }