mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-29 09:36:17 +00:00
Make CompactRange() more aware of SstPartitionerFactory (#11032)
Summary: Some users are at least considering using SstPartitioner to support efficient physical migration of specific key ranges between RocksDB instances. One might expect manual `CompactRange()` over a narrow key range across some partition to enforce partitioning of any SST files crossing that partition boundary, but that currently only works if there are keys within that range. This change makes the overlap logic in CompactRange more aware of the partitioner to automatically select relevant files crossing a partition boundary, even when they otherwise would not be selected due to the compaction range falling in a gap between entries. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11032 Test Plan: unit test included Reviewed By: hx235 Differential Revision: D41981380 Pulled By: pdillinger fbshipit-source-id: 2fe445bdddc73c00276c20f295cc1fa33d15b05a
This commit is contained in:
parent
f8969ad7d4
commit
e6b6e74154
|
@ -15,6 +15,9 @@
|
|||
* Fixed a bug in LockWAL() leading to re-locking mutex (#11020).
|
||||
* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
|
||||
|
||||
### New Features
|
||||
* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
|
||||
|
||||
## 7.9.0 (11/21/2022)
|
||||
### Performance Improvements
|
||||
* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
|
||||
|
|
|
@ -1026,6 +1026,70 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) {
|
|||
ASSERT_EQ("B", Get("bbbb1"));
|
||||
}
|
||||
|
||||
TEST_F(DBCompactionTest, CompactionSstPartitionWithManualCompaction) {
|
||||
Options options = CurrentOptions();
|
||||
options.compaction_style = kCompactionStyleLevel;
|
||||
options.level0_file_num_compaction_trigger = 3;
|
||||
|
||||
DestroyAndReopen(options);
|
||||
|
||||
// create first file and flush to l0
|
||||
ASSERT_OK(Put("000015", "A"));
|
||||
ASSERT_OK(Put("000025", "B"));
|
||||
ASSERT_OK(Flush());
|
||||
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
||||
|
||||
// create second file and flush to l0
|
||||
ASSERT_OK(Put("000015", "A2"));
|
||||
ASSERT_OK(Put("000025", "B2"));
|
||||
ASSERT_OK(Flush());
|
||||
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
||||
|
||||
// CONTROL 1: compact without partitioner
|
||||
CompactRangeOptions compact_options;
|
||||
compact_options.bottommost_level_compaction =
|
||||
BottommostLevelCompaction::kForceOptimized;
|
||||
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
||||
|
||||
// Check (compacted but no partitioning yet)
|
||||
std::vector<LiveFileMetaData> files;
|
||||
dbfull()->GetLiveFilesMetaData(&files);
|
||||
ASSERT_EQ(1, files.size());
|
||||
|
||||
// Install partitioner
|
||||
std::shared_ptr<SstPartitionerFactory> factory(
|
||||
NewSstPartitionerFixedPrefixFactory(5));
|
||||
options.sst_partitioner_factory = factory;
|
||||
Reopen(options);
|
||||
|
||||
// CONTROL 2: request compaction on range with no partition boundary and no
|
||||
// overlap with actual entries
|
||||
Slice from("000017");
|
||||
Slice to("000019");
|
||||
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
|
||||
|
||||
// Check (no partitioning yet)
|
||||
files.clear();
|
||||
dbfull()->GetLiveFilesMetaData(&files);
|
||||
ASSERT_EQ(1, files.size());
|
||||
ASSERT_EQ("A2", Get("000015"));
|
||||
ASSERT_EQ("B2", Get("000025"));
|
||||
|
||||
// TEST: request compaction overlapping with partition boundary but no
|
||||
// actual entries
|
||||
// NOTE: `to` is INCLUSIVE
|
||||
from = Slice("000019");
|
||||
to = Slice("000020");
|
||||
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
|
||||
|
||||
// Check (must be partitioned)
|
||||
files.clear();
|
||||
dbfull()->GetLiveFilesMetaData(&files);
|
||||
ASSERT_EQ(2, files.size());
|
||||
ASSERT_EQ("A2", Get("000015"));
|
||||
ASSERT_EQ("B2", Get("000025"));
|
||||
}
|
||||
|
||||
TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
|
||||
Options options = CurrentOptions();
|
||||
options.compaction_style = kCompactionStyleLevel;
|
||||
|
|
|
@ -1087,6 +1087,22 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
|
|||
{
|
||||
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
|
||||
Version* current_version = super_version->current;
|
||||
|
||||
// Might need to query the partitioner
|
||||
SstPartitionerFactory* partitioner_factory =
|
||||
current_version->cfd()->ioptions()->sst_partitioner_factory.get();
|
||||
std::unique_ptr<SstPartitioner> partitioner;
|
||||
if (partitioner_factory && begin != nullptr && end != nullptr) {
|
||||
SstPartitioner::Context context;
|
||||
context.is_full_compaction = false;
|
||||
context.is_manual_compaction = true;
|
||||
context.output_level = /*unknown*/ -1;
|
||||
// Small lies about compaction range
|
||||
context.smallest_user_key = *begin;
|
||||
context.largest_user_key = *end;
|
||||
partitioner = partitioner_factory->CreatePartitioner(context);
|
||||
}
|
||||
|
||||
ReadOptions ro;
|
||||
ro.total_order_seek = true;
|
||||
bool overlap;
|
||||
|
@ -1094,14 +1110,50 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
|
|||
level < current_version->storage_info()->num_non_empty_levels();
|
||||
level++) {
|
||||
overlap = true;
|
||||
|
||||
// Whether to look at specific keys within files for overlap with
|
||||
// compaction range, other than largest and smallest keys of the file
|
||||
// known in Version metadata.
|
||||
bool check_overlap_within_file = false;
|
||||
if (begin != nullptr && end != nullptr) {
|
||||
// Typically checking overlap within files in this case
|
||||
check_overlap_within_file = true;
|
||||
// WART: Not known why we don't check within file in one-sided bound
|
||||
// cases
|
||||
if (partitioner) {
|
||||
// Especially if the partitioner is new, the manual compaction
|
||||
// might be used to enforce the partitioning. Checking overlap
|
||||
// within files might miss cases where compaction is needed to
|
||||
// partition the files, as in this example:
|
||||
// * File has two keys "001" and "111"
|
||||
// * Compaction range is ["011", "101")
|
||||
// * Partition boundary at "100"
|
||||
// In cases like this, file-level overlap with the compaction
|
||||
// range is sufficient to force any partitioning that is needed
|
||||
// within the compaction range.
|
||||
//
|
||||
// But if there's no partitioning boundary within the compaction
|
||||
// range, we can be sure there's no need to fix partitioning
|
||||
// within that range, thus safe to check overlap within file.
|
||||
//
|
||||
// Use a hypothetical trivial move query to check for partition
|
||||
// boundary in range. (NOTE: in defiance of all conventions,
|
||||
// `begin` and `end` here are both INCLUSIVE bounds, which makes
|
||||
// this analogy to CanDoTrivialMove() accurate even when `end` is
|
||||
// the first key in a partition.)
|
||||
if (!partitioner->CanDoTrivialMove(*begin, *end)) {
|
||||
check_overlap_within_file = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (check_overlap_within_file) {
|
||||
Status status = current_version->OverlapWithLevelIterator(
|
||||
ro, file_options_, *begin, *end, level, &overlap);
|
||||
if (!status.ok()) {
|
||||
overlap = current_version->storage_info()->OverlapInLevel(
|
||||
level, begin, end);
|
||||
check_overlap_within_file = false;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if (!check_overlap_within_file) {
|
||||
overlap = current_version->storage_info()->OverlapInLevel(level,
|
||||
begin, end);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue