From 8d8eb0e77e13a3902d23fbda742dc47aa7bc418f Mon Sep 17 00:00:00 2001 From: "mayue.fight" Date: Thu, 18 May 2023 13:25:01 -0700 Subject: [PATCH] Support Clip DB to KeyRange (#11379) Summary: This PR is part of the request https://github.com/facebook/rocksdb/issues/11317. (Another part is https://github.com/facebook/rocksdb/pull/11378) ClipDB() will clip the entries in the CF according to the range [begin_key, end_key). All the entries outside this range will be completely deleted (including tombstones). This feature is mainly used to ensure that there is no overlapping Key when calling CreateColumnFamilyWithImports() to import multiple CFs. When Calling ClipDB [begin, end), there are the following steps 1. Quickly and directly delete files without overlap DeleteFilesInRanges(nullptr, begin) + DeleteFilesInRanges(end, nullptr) 2. Delete the Key outside the range Delete[smallest_key, begin) + Delete[end, largest_key] 3. Delete the tombstone through Manul Compact CompactRange(option, nullptr, nullptr) Pull Request resolved: https://github.com/facebook/rocksdb/pull/11379 Reviewed By: ajkr Differential Revision: D45840358 Pulled By: cbi42 fbshipit-source-id: 54152e8a45fd8ede137f99787eb252f0b51440a4 --- CMakeLists.txt | 1 + HISTORY.md | 1 + Makefile | 3 + TARGETS | 6 + db/db_clip_test.cc | 142 +++++++++++++++++++++++ db/db_impl/compacted_db_impl.h | 7 ++ db/db_impl/db_impl.cc | 76 +++++++++++- db/db_impl/db_impl.h | 5 + db/db_impl/db_impl_readonly.h | 7 ++ db/db_test.cc | 7 ++ db/version_set.cc | 43 +++++++ db/version_set.h | 3 + include/rocksdb/db.h | 19 +++ include/rocksdb/utilities/stackable_db.h | 7 ++ src.mk | 1 + 15 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 db/db_clip_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 109981c1b4..7797bde2cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1290,6 +1290,7 @@ if(WITH_TESTS) db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc + db/db_clip_test.cc db/db_dynamic_level_test.cc db/db_encryption_test.cc db/db_flush_test.cc diff --git a/HISTORY.md b/HISTORY.md index d76c99e59a..51533d204e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open. ### Public API Changes +* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the key in CF to a certain range. It will physically deletes all keys outside the range including tombstones. * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. ### Behavior changes diff --git a/Makefile b/Makefile index 75a5d73591..b499e8be13 100644 --- a/Makefile +++ b/Makefile @@ -1480,6 +1480,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 52a29b7747..ff5321312c 100644 --- a/TARGETS +++ b/TARGETS @@ -4750,6 +4750,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_clip_test", + srcs=["db/db_clip_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_compaction_filter_test", srcs=["db/db_compaction_filter_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/db_clip_test.cc b/db/db_clip_test.cc new file mode 100644 index 0000000000..fd0bb57170 --- /dev/null +++ b/db/db_clip_test.cc @@ -0,0 +1,142 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBClipTest : public DBTestBase { + public: + DBClipTest() : DBTestBase("db_clip_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBClipTest, TestClipRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 3; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + auto begin_key = Key(251), end_key = Key(751); + ASSERT_OK( + db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key, end_key)); + + for (auto i = 0; i < 251; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 251; i < 751; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 751; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (auto& md : all_metadata) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + + if (options.comparator->Compare(begin_key, md.smallestkey) <= 0 && + options.comparator->Compare(end_key, md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } + + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + for (auto i = 0; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,3", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,3", FilesPerLevel(0)); + + for (auto i = 1; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,5,3", FilesPerLevel(0)); + + auto begin_key_2 = Key(222), end_key_2 = Key(888); + + ASSERT_OK(db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key_2, + end_key_2)); + + for (auto i = 0; i < 222; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 222; i < 888; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 888; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata_2; + db_->GetLiveFilesMetaData(&all_metadata_2); + for (auto& md : all_metadata_2) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + if (begin_key_2.compare(md.smallestkey) <= 0 && + end_key_2.compare(md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e0e0d56649..9879d81b61 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -128,6 +128,13 @@ class CompactedDBImpl : public DBImpl { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + // FIXME: some missing overrides for more "write" functions // Share with DBImplReadOnly? diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 07aee535db..2ed0caf7f2 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4522,7 +4522,6 @@ void DBImpl::GetAllColumnFamilyMetaData( } } - Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; @@ -5705,6 +5704,81 @@ Status DBImpl::CreateColumnFamilyWithImport( return status; } +Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + assert(column_family); + Status status; + // Flush memtable + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (immutable_db_options_.atomic_flush) { + status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles, + {} /* provided_candidate_cfds */, + false /* entered_write_thread */); + } else { + status = FlushMemTable(cfd, flush_opts, FlushReason::kDeleteFiles, + false /* entered_write_thread */); + } + + if (status.ok()) { + // DeleteFilesInRanges non-overlap files except L0 + std::vector ranges; + ranges.push_back(RangePtr(nullptr, &begin_key)); + ranges.push_back(RangePtr(&end_key, nullptr)); + status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); + } + + // DeleteRange the remaining overlapping keys + bool empty_after_delete = false; + if (status.ok()) { + Slice smallest_user_key, largest_user_key; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key, + &largest_user_key); + } + // all the files has been deleted after DeleteFilesInRanges; + if (smallest_user_key.empty() && largest_user_key.empty()) { + empty_after_delete = true; + } else { + const Comparator* const ucmp = column_family->GetComparator(); + WriteOptions wo; + // Delete [smallest_user_key, clip_begin_key) + if (ucmp->Compare(smallest_user_key, begin_key) < 0) { + status = DeleteRange(wo, column_family, smallest_user_key, begin_key); + } + + if (status.ok()) { + // Delete [clip_end_key, largest_use_key] + if (ucmp->Compare(end_key, largest_user_key) < 0) { + status = DeleteRange(wo, column_family, end_key, largest_user_key); + if (status.ok()) { + status = Delete(wo, column_family, largest_user_key); + } + } + } + } + } + + if (status.ok() && !empty_after_delete) { + // CompactRange delete all the tombstones + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + // We could just compact the ranges [null, clip_begin_key] and + // [clip_end_key, null]. But due to how manual compaction calculates the + // last level to compact to and that range tombstones are not dropped + // during non-bottommost compactions, calling CompactRange() on these two + // ranges may not clear all range tombstones. + status = CompactRange(compact_options, nullptr, nullptr); + } + return status; +} + Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 8f782a34dc..f2da7467db 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -534,6 +534,11 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_options) override; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 1cc3741981..a694acc003 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -142,6 +142,13 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + // FIXME: some missing overrides for more "write" functions protected: diff --git a/db/db_test.cc b/db/db_test.cc index fdf82153c9..d23daa55df 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3107,6 +3107,13 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; Status GetPropertiesOfAllTables( ColumnFamilyHandle* /*column_family*/, diff --git a/db/version_set.cc b/db/version_set.cc index b9610331ef..674c0e4aa9 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1826,6 +1826,49 @@ uint64_t Version::GetSstFilesSize() { return sst_files_size; } +void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key) { + smallest_user_key->clear(); + largest_user_key->clear(); + bool initialized = false; + const Comparator* ucmp = storage_info_.user_comparator_; + for (int level = 0; level < cfd_->NumberLevels(); level++) { + if (storage_info_.LevelFiles(level).size() == 0) { + continue; + } + if (level == 0) { + // we need to consider all files on level 0 + for (const auto& file : storage_info_.LevelFiles(level)) { + const Slice& start_user_key = file->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = file->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = + storage_info_.LevelFiles(level)[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = + storage_info_.LevelFiles(level).back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { uint64_t oldest_time = std::numeric_limits::max(); for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { diff --git a/db/version_set.h b/db/version_set.h index e7e96bc6cd..25ad365837 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -992,6 +992,9 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + void GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key); + uint64_t GetSstFilesSize(); // Retrieves the file_creation_time of the oldest file in the DB. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 6539eb8aeb..a42f5b8b6d 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1763,6 +1763,25 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // EXPERIMENTAL + // ClipColumnFamily() will clip the entries in the CF according to the range + // [begin_key, + // end_key). + // Returns OK on success, and a non-OK status on error. + // Any entries outside this range will be completely deleted (including + // tombstones). + // The main difference between ClipColumnFamily(begin, end) and + // DeleteRange(begin, end) + // is that the former physically deletes all keys outside the range, but is + // more heavyweight than the latter. + // This feature is mainly used to ensure that there is no overlapping Key when + // calling + // CreateColumnFamilyWithImports() to import multiple CFs. + // Note that: concurrent updates cannot be performed during Clip. + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of // table files are checked. virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index abb365ad37..1a87a11366 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -178,6 +178,13 @@ class StackableDB : public DB { import_options, metadata, handle); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { + return db_->ClipColumnFamily(column_family, begin_key, end_key); + } + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_opts) override { return db_->VerifyFileChecksums(read_opts); diff --git a/src.mk b/src.mk index baaea29be6..eb70ac04b7 100644 --- a/src.mk +++ b/src.mk @@ -460,6 +460,7 @@ TEST_MAIN_SOURCES = \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ + db/db_clip_test.cc \ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ db/db_flush_test.cc \