diff --git a/.github/workflows/benchmark-linux.yml b/.github/workflows/benchmark-linux.yml
index ae09c01b5c..bdf26c8661 100644
--- a/.github/workflows/benchmark-linux.yml
+++ b/.github/workflows/benchmark-linux.yml
@@ -1,13 +1,13 @@
 name: facebook/rocksdb/benchmark-linux
 on: workflow_dispatch
-jobs:
-  # FIXME: when this job is fixed, it should be given a cron schedule like
+permissions: {}
+  # FIXME: Disabled temporarily
   # schedule:
-  # - cron: 0 * * * *
-  # workflow_dispatch:
+  # - cron: 7 */2 * * * # At minute 7 past every 2nd hour
+jobs:
   benchmark-linux:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/build-for-benchmarks"
diff --git a/.github/workflows/nightly-candidate.yml b/.github/workflows/nightly-candidate.yml
index 28a2d3405b..bfc15d1b5c 100644
--- a/.github/workflows/nightly-candidate.yml
+++ b/.github/workflows/nightly-candidate.yml
@@ -1,5 +1,6 @@
 name: facebook/rocksdb/nightly
 on: workflow_dispatch
+permissions: {}
 jobs:
   # These jobs would be in nightly but are failing or otherwise broken for
   # some reason.
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 3c0ad9f255..0bf3436390 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -3,6 +3,7 @@ on:
   schedule:
   - cron: 0 9 * * *
   workflow_dispatch:
+permissions: {}
 jobs:
   build-format-compatible:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -59,12 +60,15 @@ jobs:
     container:
       image: zjay437/rocksdb:0.6
       options: --shm-size=16gb
+    env:
+      CC: clang-13
+      CXX: clang++-13
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
     - uses: "./.github/actions/build-folly"
-    - run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
+    - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-valgrind:
     if: ${{ github.repository_owner == 'facebook' }}
diff --git a/.github/workflows/pr-jobs-candidate.yml b/.github/workflows/pr-jobs-candidate.yml
index 5c8e968422..c6e280fdff 100644
--- a/.github/workflows/pr-jobs-candidate.yml
+++ b/.github/workflows/pr-jobs-candidate.yml
@@ -1,5 +1,6 @@
 name: facebook/rocksdb/pr-jobs-candidate
 on: workflow_dispatch
+permissions: {}
 jobs:
   # These jobs would be in pr-jobs but are failing or otherwise broken for
   # some reason.
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 627b4dfc03..9d89a111e5 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -1,5 +1,6 @@
 name: facebook/rocksdb/pr-jobs
 on: [push, pull_request]
+permissions: {}
 jobs:
   # NOTE: multiple workflows would be recommended, but the current GHA UI in
   # PRs doesn't make it clear when there's an overall error with a workflow,
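For readers unfamiliar with the setting being added across all five workflows: an empty `permissions: {}` block at the workflow level revokes every default scope from the `GITHUB_TOKEN`, so these workflows now run with a least-privilege token, and any job that later needs a scope (e.g. `contents: read`) has to request it explicitly.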
diff --git a/HISTORY.md b/HISTORY.md
index cf060aa419..f3ad192da6 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,28 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 9.6.0 (08/19/2024)
+### New Features
+* Best efforts recovery now supports recovering to an incomplete Version with a clean seqno cut that presents a valid point-in-time view from the user's perspective, if the versioning history doesn't include atomic flush.
+* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
+* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation of data integrity during reads/scans. Currently, the skip-list-based memtable validates key ordering during lookups and scans.
+
+### Public API Changes
+* Add ticker stats to count file read retries due to checksum mismatch.
+* Add an optional installation callback function for remote compaction.
+
+### Behavior Changes
+* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size, and we reduce the threshold for the L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
+
+### Bug Fixes
+* Make `DestroyDB` support slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
+* Fixed a bug where we set `unprep_seqs_` even when `WriteImpl()` fails. This was caught by stress test write fault injection in `WriteImpl()`. It may have incorrectly caused iterator creation failures for unvalidated writes or returned wrong results from `WriteUnpreparedTxn::GetUnpreparedSequenceNumbers()`.
+* Fixed a bug where a successful write finishing right after error recovery for the last failed write could cause duplicate WAL entries.
+* Fixed a data race involving the background error status in `unordered_write` mode.
+* Fix a bug where file snapshot functions like backup and checkpoint may attempt to copy a non-existent manifest file. #12882
+* Fix a bug where per-KV checksum corruption may be ignored in MultiGet().
+* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
+
 ## 9.5.0 (07/19/2024)
 ### Public API Changes
 * Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch
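Both opt-in features called out above are plain options, so adoption is a one-liner each. A minimal usage sketch (mine, not part of the patch; the option names come from the changelog entries, while header paths and the surrounding boilerplate are assumptions):

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

rocksdb::Options MakeOptions() {
  rocksdb::Options options;
  // New mutable CF option: extra data-integrity validation on reads/scans.
  options.paranoid_memory_checks = true;

  rocksdb::BlockBasedTableOptions bbto;
  bbto.partition_filters = true;
  bbto.index_type =
      rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  // New opt-in option: filter partitions are no longer forced to cut at
  // index-partition boundaries, so both can target metadata_block_size.
  bbto.decouple_partitioned_filters = true;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10.0));
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  return options;
}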
diff --git a/db/column_family.cc b/db/column_family.cc
index b17571254f..06e2b4365d 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1652,6 +1652,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
   }
   for (const Slice& table_newest_udt :
        imm()->GetTablesNewestUDT(max_memtable_id)) {
+    if (table_newest_udt.empty()) {
+      continue;
+    }
     assert(table_newest_udt.size() == full_history_ts_low.size());
     // Checking the newest UDT contained in MemTable with ascending ID up to
     // `max_memtable_id`. Return immediately on finding the first MemTable that
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index f845ad05e9..d7751992b5 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -3067,12 +3067,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
   WaitForCompaction();
   AssertFilesPerLevel("0,1", 0 /* cf */);
 
+  // We should calculate the limit by obtaining the number of env background
+  // threads, because the current test case shares the same env with another
+  // case that may have already increased the number of background threads
+  // beyond kParallelismLimit.
+  const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
+
   // Block the compaction thread pool so marked files accumulate in L0.
-  test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit];
-  for (int i = 0; i < kParallelismLimit; i++) {
+  std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
+  for (int i = 0; i < limit; i++) {
+    sleeping_tasks.emplace_back(
+        std::make_shared<test::SleepingBackgroundTask>());
     env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
-                   &sleeping_tasks[i], Env::Priority::LOW);
-    sleeping_tasks[i].WaitUntilSleeping();
+                   sleeping_tasks[i].get(), Env::Priority::LOW);
+    sleeping_tasks[i]->WaitUntilSleeping();
   }
 
   // Zero marked upper-level files. No speedup.
@@ -3091,9 +3099,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
   ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
   AssertFilesPerLevel("2,1", 0 /* cf */);
 
-  for (int i = 0; i < kParallelismLimit; i++) {
-    sleeping_tasks[i].WakeUp();
-    sleeping_tasks[i].WaitUntilDone();
+  for (int i = 0; i < limit; i++) {
+    sleeping_tasks[i]->WakeUp();
+    sleeping_tasks[i]->WaitUntilDone();
   }
 }
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 11a757fd68..bbc0fe4cf3 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -552,7 +552,8 @@ class CompactionJobTestBase : public testing::Test {
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
     compaction_job_stats_.Reset();
-    ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
+    ASSERT_OK(
+        SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
 
     VersionEdit new_db;
     new_db.SetLogNumber(0);
@@ -575,7 +576,8 @@ class CompactionJobTestBase : public testing::Test {
     }
     ASSERT_OK(s);
     // Make "CURRENT" file that points to the new manifest file.
-    s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
+    s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
+                       Temperature::kUnknown, nullptr);
 
     ASSERT_OK(s);
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 92cf865016..ae289ac3fb 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -925,11 +925,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
   }
   uint64_t l0_size = 0;
   for (const auto& file : l0_files) {
-    l0_size += file->fd.GetFileSize();
+    assert(file->compensated_file_size >= file->fd.GetFileSize());
+    // Compact down L0s with more deletions.
+    l0_size += file->compensated_file_size;
   }
-  const uint64_t min_lbase_size =
-      l0_size * static_cast<uint64_t>(std::max(
-                    10.0, mutable_cf_options_.max_bytes_for_level_multiplier));
-
+
+  // Avoid L0->Lbase compactions that are inefficient for write-amp.
+  const double kMultiplier =
+      std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
+  const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
   assert(min_lbase_size >= l0_size);
   const std::vector<FileMetaData*>& lbase_files =
       vstorage_->LevelFiles(/*level=*/base_level);
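To make the new gating arithmetic concrete, a worked example with the default multiplier (my numbers, chosen to mirror the picker test that follows):

// Four L0 files, each with compensated_file_size = 1000:
//   l0_size        = 4 * 1000            = 4000
//   kMultiplier    = max(10.0, 10.0) * 2 = 20
//   min_lbase_size = 4000 * 20           = 80000
// Intra-L0 compaction is considered only when Lbase already holds at least
// 80000 bytes, i.e. when L0 <= Lbase / multiplier / 2. Relative to the old
// x10 threshold, the extra x2 halves the L0 size that still qualifies, so
// fewer small-L0 cases are diverted into intra-L0 work, while
// tombstone-heavy files qualify sooner because compensated sizes inflate
// l0_size.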
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 70f59a8765..b58f5d0100 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -214,7 +214,10 @@ class CompactionPickerTest : public CompactionPickerTestBase {
   explicit CompactionPickerTest()
       : CompactionPickerTestBase(BytewiseComparator()) {}
 
-  ~CompactionPickerTest() override = default;
+  ~CompactionPickerTest() override {
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
 };
 
 class CompactionPickerU64TsTest : public CompactionPickerTestBase {
@@ -4284,27 +4287,28 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
     SCOPED_TRACE("lbase_size_multiplier=" +
                  std::to_string(lbase_size_multiplier));
     NewVersionStorage(6, kCompactionStyleLevel);
-    // When L0 size is <= Lbase size / max_bytes_for_level_multiplier,
+    // When L0 size is <= Lbase size / max_bytes_for_level_multiplier / 2,
     // intra-L0 compaction is picked. Otherwise, L0->L1
     // compaction is picked.
+    // compensated_file_size will be used to compute total L0 size.
     Add(/*level=*/0, /*file_number=*/1U, /*smallest=*/"100",
-        /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+        /*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
        /*smallest_seq=*/10, /*largest_seq=*/11,
        /*compensated_file_size=*/1000);
     Add(/*level=*/0, /*file_number=*/2U, /*smallest=*/"100",
-        /*largest=*/"100", /*file_size=*/1000, /*path_id=*/0,
+        /*largest=*/"100", /*file_size=*/10, /*path_id=*/0,
        /*smallest_seq=*/20, /*largest_seq=*/21,
        /*compensated_file_size=*/1000);
     Add(/*level=*/0, /*file_number=*/3U, /*smallest=*/"100",
-        /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+        /*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
        /*smallest_seq=*/30, /*largest_seq=*/31,
        /*compensated_file_size=*/1000);
     Add(/*level=*/0, /*file_number=*/4U, /*smallest=*/"100",
-        /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+        /*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
        /*smallest_seq=*/40, /*largest_seq=*/41,
        /*compensated_file_size=*/1000);
     const uint64_t l0_size = 4000;
-    const uint64_t lbase_size = l0_size * lbase_size_multiplier;
+    const uint64_t lbase_size = l0_size * lbase_size_multiplier * 2;
     Add(/*level=*/1, /*file_number=*/5U, /*smallest=*/"100",
         /*largest=*/"200", /*file_size=*/lbase_size, /*path_id=*/0,
         /*smallest_seq=*/0, /*largest_seq=*/0,
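Note how the test numbers line up with the picker change: each L0 file reports a physical file_size of 10 but a compensated_file_size of 1000, so the picker's l0_size comes out to 4000 from compensated sizes alone, and the Lbase file is sized at l0_size * lbase_size_multiplier * 2 to sit exactly on the new decision boundary. A tombstone-dense file therefore "weighs" far more than its physical size when deciding whether L0 should be compacted down.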
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 7f6b872cb2..3b56d057b4 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -140,9 +140,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     return compaction_status;
   }
 
+  // CompactionServiceJobStatus::kSuccess was returned, but somehow we failed
+  // to read the result. Consider this an installation failure.
   if (!s.ok()) {
     sub_compact->status = s;
     compaction_result.status.PermitUncheckedError();
+    db_options_.compaction_service->OnInstallation(
+        response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
     return CompactionServiceJobStatus::kFailure;
   }
   sub_compact->status = compaction_result.status;
@@ -154,18 +158,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     is_first_one = false;
   }
 
-  ROCKS_LOG_INFO(db_options_.info_log,
-                 "[%s] [JOB %d] Receive remote compaction result, output path: "
-                 "%s, files: %s",
-                 compaction_input.column_family.name.c_str(), job_id_,
-                 compaction_result.output_path.c_str(),
-                 output_files_oss.str().c_str());
-
-  if (!s.ok()) {
-    sub_compact->status = s;
-    return CompactionServiceJobStatus::kFailure;
-  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "[%s] [JOB %d] Received remote compaction result, output path: "
+      "%s, files: %s",
+      compaction_input.column_family.name.c_str(), job_id_,
+      compaction_result.output_path.c_str(), output_files_oss.str().c_str());
 
+  // Installation starts.
   for (const auto& file : compaction_result.output_files) {
     uint64_t file_num = versions_->NewFileNumber();
     auto src_file = compaction_result.output_path + "/" + file.file_name;
@@ -174,6 +174,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
     if (!s.ok()) {
       sub_compact->status = s;
+      db_options_.compaction_service->OnInstallation(
+          response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
       return CompactionServiceJobStatus::kFailure;
     }
 
@@ -182,6 +184,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
     if (!s.ok()) {
       sub_compact->status = s;
+      db_options_.compaction_service->OnInstallation(
+          response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
       return CompactionServiceJobStatus::kFailure;
     }
     meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
@@ -206,6 +210,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
   RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
              compaction_result.bytes_written);
+  db_options_.compaction_service->OnInstallation(
+      response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
   return CompactionServiceJobStatus::kSuccess;
 }
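With the changes above, OnInstallation now fires exactly once per remote compaction on the primary: kFailure on any result-read, rename, or file-size-probe error during installation, and kSuccess once the outputs are registered and stats recorded. A minimal user-side sketch (assuming an otherwise-complete CompactionService subclass; only the new override is shown, and log_installation is a hypothetical helper):

class MyRemoteCompactionService : public rocksdb::CompactionService {
 public:
  // ... Schedule/Wait plumbing elided ...
  void OnInstallation(const std::string& scheduled_job_id,
                      rocksdb::CompactionServiceJobStatus status) override {
    // A safe point to release per-job scratch space or emit metrics.
    // `status` reflects installation on the primary host, not whether the
    // remote worker itself succeeded.
    log_installation(scheduled_job_id, status);  // hypothetical helper
  }
};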
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 812a658dcf..8aacf2b6d2 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -108,6 +108,11 @@ class MyTestCompactionService : public CompactionService {
     }
   }
 
+  void OnInstallation(const std::string& /*scheduled_job_id*/,
+                      CompactionServiceJobStatus status) override {
+    final_updated_status_ = status;
+  }
+
   int GetCompactionNum() { return compaction_num_.load(); }
 
   CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
@@ -136,6 +141,10 @@ class MyTestCompactionService : public CompactionService {
 
   void SetCanceled(bool canceled) { canceled_ = canceled; }
 
+  CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
+    return final_updated_status_.load();
+  }
+
  private:
   InstrumentedMutex mutex_;
   std::atomic_int compaction_num_{0};
@@ -158,6 +167,8 @@ class MyTestCompactionService : public CompactionService {
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
       table_properties_collector_factories_;
   std::atomic_bool canceled_{false};
+  std::atomic<CompactionServiceJobStatus> final_updated_status_{
+      CompactionServiceJobStatus::kUseLocal};
 };
 
 class CompactionServiceTest : public DBTestBase {
@@ -255,6 +266,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
 
   auto my_cs = GetCompactionService();
   ASSERT_GE(my_cs->GetCompactionNum(), 1);
+  ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
+            my_cs->GetFinalCompactionServiceJobStatus());
 
   // make sure the compaction statistics is only recorded on the remote side
   ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
@@ -437,6 +450,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
   Slice end(end_str);
   Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
   ASSERT_FALSE(s.ok());
+  ASSERT_EQ(CompactionServiceJobStatus::kFailure,
+            my_cs->GetFinalCompactionServiceJobStatus());
 }
 
 TEST_F(CompactionServiceTest, SubCompaction) {
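A small design note on the harness above: final_updated_status_ is initialized to CompactionServiceJobStatus::kUseLocal, a value the installation path never reports, so it doubles as a "callback never fired" sentinel; making it a std::atomic keeps the write from the installation path and the read in the test assertions race-free.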
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 1e6c392bf0..bb8a132ae3 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -3407,6 +3407,46 @@ class TableFileListener : public EventListener {
   InstrumentedMutex mutex_;
   std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
 };
+
+class FlushTableFileListener : public EventListener {
+ public:
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    InstrumentedMutexLock lock(&mutex_);
+    if (info.reason != TableFileCreationReason::kFlush) {
+      return;
+    }
+    cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
+  }
+  std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
+    InstrumentedMutexLock lock(&mutex_);
+    return cf_to_flushed_files_[cf_name];
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::unordered_map<std::string, std::vector<std::string>>
+      cf_to_flushed_files_;
+};
+
+class FlushBlobFileListener : public EventListener {
+ public:
+  void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+    InstrumentedMutexLock lock(&mutex_);
+    if (info.reason != BlobFileCreationReason::kFlush) {
+      return;
+    }
+    cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
+  }
+  std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
+    InstrumentedMutexLock lock(&mutex_);
+    return cf_to_flushed_blobs_files_[cf_name];
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::unordered_map<std::string, std::vector<std::string>>
+      cf_to_flushed_blobs_files_;
+};
 }  // anonymous namespace
 
 TEST_F(DBBasicTest, LastSstFileNotInManifest) {
@@ -3512,6 +3552,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
   }
 }
 
+// Param 0: whether to enable blob DB.
+// Param 1: when blob DB is enabled, whether to also delete the missing L0
+// file's associated blob file.
+class BestEffortsRecoverIncompleteVersionTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  BestEffortsRecoverIncompleteVersionTest()
+      : DBTestBase("best_efforts_recover_incomplete_version_test",
+                   /*env_do_fsync=*/false) {}
+};
+
+TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
+  Options options = CurrentOptions();
+  options.enable_blob_files = std::get<0>(GetParam());
+  bool delete_blob_file_too = std::get<1>(GetParam());
+  DestroyAndReopen(options);
+  FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
+  FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
+  // Disable auto compaction to simplify SST file name tracking.
+  options.disable_auto_compactions = true;
+  options.listeners.emplace_back(flush_table_listener);
+  options.listeners.emplace_back(flush_blob_listener);
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
+                                           "eevee"};
+  int num_cfs = static_cast<int>(handles_.size());
+  ASSERT_EQ(3, num_cfs);
+  std::string start = "a";
+  Slice start_slice = start;
+  std::string end = "d";
+  Slice end_slice = end;
+  for (int cf = 0; cf != num_cfs; ++cf) {
+    ASSERT_OK(Put(cf, "a", "a_value"));
+    ASSERT_OK(Flush(cf));
+    // Compact the file to L1 to avoid a trivial file move in the next
+    // compaction.
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
+                                &start_slice, &end_slice));
+    ASSERT_OK(Put(cf, "a", "a_value_new"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(Put(cf, "b", "b_value"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(Put(cf, "f", "f_value"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
+                                &start_slice, &end_slice));
+  }
+
+  dbfull()->TEST_DeleteObsoleteFiles();
+
+  // Delete the most recent L0 file, which was flushed right before the last
+  // compaction.
+  for (int i = 0; i < num_cfs; ++i) {
+    std::vector<std::string>& files =
+        flush_table_listener->GetFlushedFiles(all_cf_names[i]);
+    ASSERT_EQ(4, files.size());
+    ASSERT_OK(env_->DeleteFile(files[files.size() - 1]));
+    if (options.enable_blob_files) {
+      std::vector<std::string>& blob_files =
+          flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
+      ASSERT_EQ(4, blob_files.size());
+      if (delete_blob_file_too) {
+        ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1]));
+      }
+    }
+  }
+  options.best_efforts_recovery = true;
+  ReopenWithColumnFamilies(all_cf_names, options);
+
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ColumnFamilyData* cfd = cfh->cfd();
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    // The L0 file flushed right before the last compaction is missing.
+    ASSERT_EQ(0, vstorage->LevelFiles(0).size());
+    // Only the output of the last compaction is available.
+    ASSERT_EQ(1, vstorage->LevelFiles(1).size());
+  }
+
+  // Verify data.
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  for (int i = 0; i < num_cfs; ++i) {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ("a", iter->key());
+    ASSERT_EQ("a_value_new", iter->value());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ("b", iter->key());
+    ASSERT_EQ("b_value", iter->value());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  // Write more data.
+  for (int cf = 0; cf < num_cfs; ++cf) {
+    ASSERT_OK(Put(cf, "g", "g_value"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+                                nullptr));
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
+    ASSERT_EQ("g_value", value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
+                        BestEffortsRecoverIncompleteVersionTest,
+                        testing::Values(std::make_tuple(false, false),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(true, true)));
+
 TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
   Options options = CurrentOptions();
   options.env = env_;
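The option this test exercises is part of the public API, so opening a damaged DB the same way is straightforward. A minimal sketch (mine; the path is illustrative):

#include "rocksdb/db.h"

rocksdb::Status OpenBestEfforts(const std::string& path, rocksdb::DB** db) {
  rocksdb::Options options;
  // Tolerate missing newest files: recover to the most recent consistent
  // point-in-time view instead of failing hard.
  options.best_efforts_recovery = true;
  return rocksdb::DB::Open(options, path, db);
}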
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 83cfe32a48..4943e99c4c 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -45,6 +45,12 @@ const std::string kStandard128Ribbon =
 const std::string kAutoBloom = BloomFilterPolicy::kClassName();
 const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
 
+enum class FilterPartitioning {
+  kUnpartitionedFilter,
+  kCoupledPartitionedFilter,
+  kDecoupledPartitionedFilter,
+};
+
 template <typename T>
 T Pop(T& var) {
   auto rv = var;
@@ -62,32 +68,61 @@ class DBBloomFilterTest : public DBTestBase {
  public:
   DBBloomFilterTest()
       : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {}
+
+  bool PartitionFilters() {
+    return filter_partitioning_ != FilterPartitioning::kUnpartitionedFilter;
+  }
+
+  void SetInTableOptions(BlockBasedTableOptions* table_options) {
+    table_options->partition_filters = PartitionFilters();
+    if (PartitionFilters()) {
+      table_options->index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+  }
+
+ protected:
+  FilterPartitioning filter_partitioning_ =
+      FilterPartitioning::kUnpartitionedFilter;
 };
 
-class DBBloomFilterTestWithParam
-    : public DBTestBase,
+class DBBloomFilterTestWithPartitioningParam
+    : public DBBloomFilterTest,
+      public testing::WithParamInterface<FilterPartitioning> {
+ public:
+  ~DBBloomFilterTestWithPartitioningParam() override = default;
+
+  void SetUp() override { filter_partitioning_ = GetParam(); }
+};
+
+class DBBloomFilterTestWithFormatParams
+    : public DBBloomFilterTest,
       public testing::WithParamInterface<
-          std::tuple<std::string, bool, uint32_t>> {
-  // public testing::WithParamInterface<std::string> {
+          std::tuple<std::string, FilterPartitioning, uint32_t>> {
  protected:
   std::string bfp_impl_;
-  bool partition_filters_;
+  double bits_per_key_;
   uint32_t format_version_;
 
  public:
-  DBBloomFilterTestWithParam()
-      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
-
-  ~DBBloomFilterTestWithParam() override = default;
+  ~DBBloomFilterTestWithFormatParams() override = default;
 
   void SetUp() override {
     bfp_impl_ = std::get<0>(GetParam());
-    partition_filters_ = std::get<1>(GetParam());
+    bits_per_key_ = 10;  // default
+    filter_partitioning_ = std::get<1>(GetParam());
     format_version_ = std::get<2>(GetParam());
   }
+
+  void SetInTableOptions(BlockBasedTableOptions* table_options) {
+    DBBloomFilterTest::SetInTableOptions(table_options);
+    table_options->filter_policy = Create(bits_per_key_, bfp_impl_);
+    table_options->format_version = format_version_;
+  }
 };
 
-class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+class DBBloomFilterTestDefFormatVersion
+    : public DBBloomFilterTestWithFormatParams {};
 
 class SliceTransformLimitedDomainGeneric : public SliceTransform {
   const char* Name() const override {
@@ -118,11 +153,11 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
     std::string value;
     anon::OptionsOverride options_override;
options_override.filter_policy = Create(20, bfp_impl_); - options_override.partition_filters = partition_filters_; + options_override.partition_filters = PartitionFilters(); options_override.metadata_block_size = 32; options_override.full_block_cache = true; Options options = CurrentOptions(options_override); - if (partition_filters_) { + if (PartitionFilters()) { auto* table_options = options.table_factory->GetOptions(); if (table_options != nullptr && @@ -190,397 +225,383 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); } -TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { - for (bool partition_filters : {true, false}) { - Options options = last_options_; - options.prefix_extractor = - std::make_shared(); - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - get_perf_context()->EnablePerLevelPerfContext(); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10)); - if (partition_filters) { - bbto.partition_filters = true; - bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - } - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); +TEST_P(DBBloomFilterTestWithPartitioningParam, + GetFilterByPrefixBloomCustomPrefixExtractor) { + Options options = last_options_; + options.prefix_extractor = + std::make_shared(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + SetInTableOptions(&bbto); + bbto.filter_policy.reset(NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); - WriteOptions wo; - ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; - ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); - ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); - ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - ASSERT_OK(dbfull()->Flush(fo)); + ASSERT_OK(dbfull()->Flush(fo)); - ASSERT_EQ("foo", Get("barbarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("foo", Get("barbarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - ASSERT_EQ("foo2", Get("barbarbar2")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - 
EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - ro.total_order_seek = true; - // NOTE: total_order_seek no longer affects Get() - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ro.total_order_seek = true; + // NOTE: total_order_seek no longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - // No bloom on extractor changed - ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + // No bloom on extractor changed + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - // No bloom on extractor changed, after re-open - options.prefix_extractor.reset(NewCappedPrefixTransform(10)); - Reopen(options); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + // No bloom on extractor changed, after re-open + options.prefix_extractor.reset(NewCappedPrefixTransform(10)); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - get_perf_context()->Reset(); - } + get_perf_context()->Reset(); } -TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { - for (bool partition_filters : {true, false}) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - get_perf_context()->EnablePerLevelPerfContext(); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10)); - if (partition_filters) { - bbto.partition_filters = true; - bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - } - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); +TEST_P(DBBloomFilterTestWithPartitioningParam, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + SetInTableOptions(&bbto); + bbto.filter_policy.reset(NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); - WriteOptions wo; - ReadOptions 
ro; - FlushOptions fo; - fo.wait = true; - std::string value; + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; - ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); - ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); - ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - ASSERT_OK(dbfull()->Flush(fo)); + ASSERT_OK(dbfull()->Flush(fo)); - ASSERT_EQ("foo", Get("barbarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - ASSERT_EQ("foo2", Get("barbarbar2")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("foo", Get("barbarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - ro.total_order_seek = true; - // NOTE: total_order_seek no longer affects Get() - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); + ro.total_order_seek = true; + // NOTE: total_order_seek no longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); - // No bloom on extractor changed - ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + // No bloom on extractor changed + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); - get_perf_context()->Reset(); - } + get_perf_context()->Reset(); } -TEST_F(DBBloomFilterTest, FilterNumEntriesCoalesce) { - for (bool partition_filters : {true, false}) { - SCOPED_TRACE("partition_filters=" + std::to_string(partition_filters)); - for (bool prefix : {true, false}) { - SCOPED_TRACE("prefix=" + std::to_string(prefix)); - for (bool whole : {true, false}) { - SCOPED_TRACE("whole=" + std::to_string(whole)); - Options options = last_options_; - options.prefix_extractor.reset(); - if 
(prefix) { - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - } - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10)); - bbto.whole_key_filtering = whole; - if (partition_filters) { - bbto.partition_filters = true; - bbto.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - } - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); +TEST_P(DBBloomFilterTestWithPartitioningParam, FilterNumEntriesCoalesce) { + for (bool prefix : {true, false}) { + SCOPED_TRACE("prefix=" + std::to_string(prefix)); + for (bool whole : {true, false}) { + SCOPED_TRACE("whole=" + std::to_string(whole)); + Options options = last_options_; + options.prefix_extractor.reset(); + if (prefix) { + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + } + BlockBasedTableOptions bbto; + SetInTableOptions(&bbto); + bbto.filter_policy.reset(NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = whole; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); - // Need a snapshot to allow keeping multiple entries for the same key - std::vector snapshots; - for (int i = 1; i <= 3; ++i) { - std::string val = "val" + std::to_string(i); - ASSERT_OK(Put("foo1", val)); - ASSERT_OK(Put("foo2", val)); - ASSERT_OK(Put("bar1", val)); - ASSERT_OK(Put("bar2", val)); - ASSERT_OK(Put("bar3", val)); - snapshots.push_back(db_->GetSnapshot()); - } - ASSERT_OK(Flush()); + // Need a snapshot to allow keeping multiple entries for the same key + std::vector snapshots; + for (int i = 1; i <= 3; ++i) { + std::string val = "val" + std::to_string(i); + ASSERT_OK(Put("foo1", val)); + ASSERT_OK(Put("foo2", val)); + ASSERT_OK(Put("bar1", val)); + ASSERT_OK(Put("bar2", val)); + ASSERT_OK(Put("bar3", val)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush()); - TablePropertiesCollection tpc; - ASSERT_OK(db_->GetPropertiesOfAllTables(&tpc)); - // sanity checks - ASSERT_EQ(tpc.size(), 1U); - auto& tp = *tpc.begin()->second; - EXPECT_EQ(tp.num_entries, 3U * 5U); + TablePropertiesCollection tpc; + ASSERT_OK(db_->GetPropertiesOfAllTables(&tpc)); + // sanity checks + ASSERT_EQ(tpc.size(), 1U); + auto& tp = *tpc.begin()->second; + EXPECT_EQ(tp.num_entries, 3U * 5U); - // test checks - unsigned ex_filter_entries = 0; - if (whole) { - ex_filter_entries += 5; // unique keys - } - if (prefix) { - ex_filter_entries += 2; // unique prefixes - } - EXPECT_EQ(tp.num_filter_entries, ex_filter_entries); + // test checks + unsigned ex_filter_entries = 0; + if (whole) { + ex_filter_entries += 5; // unique keys + } + if (prefix) { + ex_filter_entries += 2; // unique prefixes + } + EXPECT_EQ(tp.num_filter_entries, ex_filter_entries); - for (auto* sn : snapshots) { - db_->ReleaseSnapshot(sn); - } + for (auto* sn : snapshots) { + db_->ReleaseSnapshot(sn); } } } } -TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { - for (bool partition_filters : {true, false}) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - get_perf_context()->EnablePerLevelPerfContext(); +TEST_P(DBBloomFilterTestWithPartitioningParam, WholeKeyFilterProp) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); - BlockBasedTableOptions bbto; - 
bbto.filter_policy.reset(NewBloomFilterPolicy(10)); - bbto.whole_key_filtering = false; - if (partition_filters) { - bbto.partition_filters = true; - bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + BlockBasedTableOptions bbto; + SetInTableOptions(&bbto); + bbto.filter_policy.reset(NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(dbfull()->Flush(fo)); + + Reopen(options); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + + // Try to create a DB with mixed files: + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + options.prefix_extractor.reset(); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + // Try to create a DB with mixed files. + ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); + // In this case needs insert some keys to make sure files are + // not filtered out by key ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(Flush()); + + // Now we have two files: + // File 1: An older file with prefix bloom (disabled) + // File 2: A newer file with whole bloom filter. + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("bar", Get("barfoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + + // Reopen with the same setting: only whole key is used + Reopen(options); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("bar", Get("barfoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + + // Restart with both filters are allowed + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + EXPECT_EQ(PopTicker(options, 
BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + // File 1 will has it filtered out. + // File 2 will not, as prefix `foo` exists in the file. + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("bar", Get("barfoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + + // Restart with only prefix bloom is allowed. + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("foo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo", Get("foobar")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("bar", Get("barfoo")); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; } - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - - WriteOptions wo; - ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; - - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - ASSERT_OK(dbfull()->Flush(fo)); - - Reopen(options); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - - // Reopen with whole key filtering enabled and prefix extractor - // NULL. Bloom filter should be off for both of whole key and - // prefix bloom. 
- bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - options.prefix_extractor.reset(); - Reopen(options); - - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - // Write DB with only full key filtering. - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - // Reopen with both of whole key off and prefix extractor enabled. - // Still no bloom filter should be used. - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - - // Try to create a DB with mixed files: - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - options.prefix_extractor.reset(); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - - // Try to create a DB with mixed files. - ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); - // In this case needs insert some keys to make sure files are - // not filtered out by key ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - ASSERT_OK(Flush()); - - // Now we have two files: - // File 1: An older file with prefix bloom (disabled) - // File 2: A newer file with whole bloom filter. 
- EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("bar", Get("barfoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - - // Reopen with the same setting: only whole key is used - Reopen(options); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("bar", Get("barfoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - - // Restart with both filters are allowed - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - // File 1 will has it filtered out. - // File 2 will not, as prefix `foo` exists in the file. - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("bar", Get("barfoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - - // Restart with only prefix bloom is allowed. 
- options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("foo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo", Get("foobar")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("bar", Get("barfoo")); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); - EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); - uint64_t bloom_filter_useful_all_levels = 0; - for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { - if (kv.second.bloom_filter_useful > 0) { - bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; - } - } - ASSERT_EQ(12, bloom_filter_useful_all_levels); - get_perf_context()->Reset(); } + ASSERT_EQ(12, bloom_filter_useful_all_levels); + get_perf_context()->Reset(); } -TEST_P(DBBloomFilterTestWithParam, BloomFilter) { +INSTANTIATE_TEST_CASE_P( + DBBloomFilterTestWithPartitioningParam, + DBBloomFilterTestWithPartitioningParam, + ::testing::Values(FilterPartitioning::kUnpartitionedFilter, + FilterPartitioning::kCoupledPartitionedFilter, + FilterPartitioning::kDecoupledPartitionedFilter)); + +TEST_P(DBBloomFilterTestWithFormatParams, BloomFilter) { do { Options options = CurrentOptions(); env_->count_random_reads_ = true; @@ -588,20 +609,23 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { // ChangeCompactOptions() only changes compaction style, which does not // trigger reset of table_factory BlockBasedTableOptions table_options; + // When partitioned filters are coupled to index blocks, they tend to get + // extra fractional bits per key when rounding up to the next cache line + // size. Here we correct for that to get similar effective bits per key. + bits_per_key_ = table_options.decouple_partitioned_filters ? 10.5 : 10; + SetInTableOptions(&table_options); table_options.no_block_cache = true; - table_options.filter_policy = Create(10, bfp_impl_); table_options.optimize_filters_for_memory = false; - table_options.partition_filters = partition_filters_; - if (partition_filters_) { - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - } - table_options.format_version = format_version_; if (format_version_ >= 4) { // value delta encoding challenged more with index interval > 1 table_options.index_block_restart_interval = 8; } - table_options.metadata_block_size = 32; + // This test is rather sensitive to the actual filter partition block size, + // and keeping that consistent between coupled and uncoupled requires a + // different metadata block size for this example (where it controls index + // block size). + table_options.metadata_block_size = + table_options.decouple_partitioned_filters ? 
320 : 32; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); @@ -628,7 +652,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { int reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d present => %d reads\n", N, reads); ASSERT_GE(reads, N); - if (partition_filters_) { + if (PartitionFilters()) { // Without block cache, we read an extra partition filter per each // level*read and a partition index per each read ASSERT_LE(reads, 4 * N + 2 * N / 100); @@ -643,7 +667,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { } reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d missing => %d reads\n", N, reads); - if (partition_filters_) { + if (PartitionFilters()) { // With partitioned filter we read one extra filter per level per each // missed read. ASSERT_LE(reads, 2 * N + 3 * N / 100); @@ -658,7 +682,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { uint64_t nkeys = N + N / 100; uint64_t filter_size = ParseUint64(props["filter_size"]); EXPECT_LE(filter_size, - (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8); + (PartitionFilters() ? 12 : 11) * nkeys / /*bits / byte*/ 8); if (bfp_impl_ == kAutoRibbon) { // Sometimes using Ribbon filter which is more space-efficient EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8); @@ -679,15 +703,20 @@ namespace { class AlwaysTrueBitsBuilder : public FilterBitsBuilder { public: - void AddKey(const Slice&) override {} - size_t EstimateEntriesAdded() override { return 0U; } + void AddKey(const Slice&) override { ++count_; } + void AddKeyAndAlt(const Slice&, const Slice&) override { count_ += 2; } + size_t EstimateEntriesAdded() override { return count_; } Slice Finish(std::unique_ptr* /* buf */) override { + count_ = 0; // Interpreted as "always true" filter (0 probes over 1 byte of // payload, 5 bytes metadata) return Slice("\0\0\0\0\0\0", 6); } using FilterBitsBuilder::Finish; size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; } + + private: + size_t count_ = 0; }; class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { @@ -709,7 +738,7 @@ class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { } // anonymous namespace -TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { +TEST_P(DBBloomFilterTestWithFormatParams, SkipFilterOnEssentiallyZeroBpk) { constexpr int maxKey = 10; auto PutFn = [&]() { int i; @@ -740,12 +769,7 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; - table_options.partition_filters = partition_filters_; - if (partition_filters_) { - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - } - table_options.format_version = format_version_; + SetInTableOptions(&table_options); // Test 1: bits per key < 0.5 means skip filters -> no filter // constructed or read. 
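
For readers tuning these knobs outside the test: `decouple_partitioned_filters` lets filter partitions be cut independently of index partitions, so both can consistently target `metadata_block_size`. A minimal opt-in sketch with illustrative values (not taken from this patch):

```cpp
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Sketch: opting in to decoupled partitioned filters.
rocksdb::Options MakePartitionedFilterOptions() {
  rocksdb::BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10.0));
  bbto.partition_filters = true;
  bbto.index_type =
      rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  bbto.decouple_partitioned_filters = true;  // new opt-in option under test
  bbto.metadata_block_size = 4096;  // target for index and filter partitions
  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  return options;
}
```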
@@ -819,27 +843,85 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { EXPECT_EQ(props["filter_size"], "0"); } +TEST_P(DBBloomFilterTestWithFormatParams, FilterBitsBuilderDedup) { + BlockBasedTableOptions table_options; + SetInTableOptions(&table_options); + FilterBuildingContext context{table_options}; + std::unique_ptr builder{ + table_options.filter_policy->GetBuilderWithContext(context)}; + + ASSERT_EQ(builder->EstimateEntriesAdded(), 0U); + // Check for sufficient de-duplication between regular keys and alt keys + // (prefixes), keeping in mind that the key might equal its prefix. + + builder->AddKey("abc"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 1U); + builder->AddKeyAndAlt("abc1", "abc"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 2U); + builder->AddKeyAndAlt("bcd", "bcd"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 3U); + builder->AddKeyAndAlt("cde-1", "cde"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 5U); + builder->AddKeyAndAlt("cde", "cde"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 5U); + builder->AddKeyAndAlt("cde1", "cde"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 6U); + builder->AddKeyAndAlt("def-1", "def"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 8U); + builder->AddKeyAndAlt("def", "def"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 8U); + builder->AddKey("def$$"); // Like not in extractor domain + ASSERT_EQ(builder->EstimateEntriesAdded(), 9U); + builder->AddKey("def$$"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 9U); + builder->AddKeyAndAlt("efg42", "efg"); + ASSERT_EQ(builder->EstimateEntriesAdded(), 11U); + builder->AddKeyAndAlt("efg", "efg"); // Like extra "alt" on a partition + ASSERT_EQ(builder->EstimateEntriesAdded(), 11U); +} + #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, ::testing::Values( - std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), - std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), - std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); + std::make_tuple(kAutoBloom, + FilterPartitioning::kCoupledPartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, + FilterPartitioning::kDecoupledPartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter, + test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( - FormatDef, DBBloomFilterTestWithParam, + FormatDef, DBBloomFilterTestWithFormatParams, ::testing::Values( - std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion), - std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion), - std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion))); + std::make_tuple(kAutoBloom, + FilterPartitioning::kCoupledPartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, + FilterPartitioning::kDecoupledPartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter, + test::kDefaultFormatVersion), + std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter, + test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( - FormatLatest, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion), - std::make_tuple(kAutoBloom, false, kLatestFormatVersion), - std::make_tuple(kAutoRibbon, false, - 
kLatestFormatVersion))); + FormatLatest, DBBloomFilterTestWithFormatParams, + ::testing::Values( + std::make_tuple(kAutoBloom, + FilterPartitioning::kCoupledPartitionedFilter, + kLatestFormatVersion), + std::make_tuple(kAutoBloom, + FilterPartitioning::kDecoupledPartitionedFilter, + kLatestFormatVersion), + std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter, + kLatestFormatVersion), + std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter, + kLatestFormatVersion))); #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -957,18 +1039,17 @@ TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { using FilterConstructionReserveMemoryHash = uint64_t; class ChargeFilterConstructionTestWithParam - : public DBTestBase, - public testing::WithParamInterface> { + : public DBBloomFilterTest, + public testing::WithParamInterface< + std::tuple> { public: ChargeFilterConstructionTestWithParam() - : DBTestBase("db_bloom_filter_tests", - /*env_do_fsync=*/true), - num_key_(0), + : num_key_(0), charge_filter_construction_(std::get<0>(GetParam())), policy_(std::get<1>(GetParam())), - partition_filters_(std::get<2>(GetParam())), detect_filter_construct_corruption_(std::get<3>(GetParam())) { + filter_partitioning_ = std::get<2>(GetParam()); if (charge_filter_construction_ == CacheEntryRoleOptions::Decision::kDisabled || policy_ == kLegacyBloom) { @@ -976,7 +1057,7 @@ class ChargeFilterConstructionTestWithParam // cache charging happens instead of its accuracy. Therefore we don't // need many keys. num_key_ = 5; - } else if (partition_filters_) { + } else if (PartitionFilters()) { // For PartitionFilter case, since we set // table_options.metadata_block_size big enough such that each partition // trigger at least 1 dummy entry reservation each for hash entries and @@ -1013,6 +1094,7 @@ class ChargeFilterConstructionTestWithParam BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions table_options; + SetInTableOptions(&table_options); // We set cache capacity big enough to prevent cache full for convenience in // calculation. @@ -1022,10 +1104,7 @@ class ChargeFilterConstructionTestWithParam {CacheEntryRole::kFilterConstruction, {/*.charged = */ charge_filter_construction_}}); table_options.filter_policy = Create(10, policy_); - table_options.partition_filters = partition_filters_; if (table_options.partition_filters) { - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; // We set table_options.metadata_block_size big enough so that each // partition trigger at least 1 dummy entry insertion each for hash // entries and final filter. 
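
The parameterization being reworked here reduces to one table-options override plus a filter policy. A minimal sketch of the charged-filter-construction configuration, assuming an ample LRU cache:

```cpp
#include "rocksdb/cache.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

// Sketch: charge filter-construction memory to the block cache, so bloom
// hash entries and the final filter reserve space as cache entries.
rocksdb::BlockBasedTableOptions bbto;
bbto.block_cache = rocksdb::NewLRUCache(1 << 30);  // ample capacity
bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10.0));
bbto.cache_usage_options.options_overrides.insert(
    {rocksdb::CacheEntryRole::kFilterConstruction,
     {/*.charged = */ rocksdb::CacheEntryRoleOptions::Decision::kEnabled}});
```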
@@ -1054,8 +1133,6 @@ class ChargeFilterConstructionTestWithParam std::string GetFilterPolicy() { return policy_; } - bool PartitionFilters() { return partition_filters_; } - std::shared_ptr< TargetCacheChargeTrackingCache> GetCache() { @@ -1066,7 +1143,6 @@ class ChargeFilterConstructionTestWithParam std::size_t num_key_; CacheEntryRoleOptions::Decision charge_filter_construction_; std::string policy_; - bool partition_filters_; std::shared_ptr< TargetCacheChargeTrackingCache> cache_; @@ -1078,28 +1154,43 @@ INSTANTIATE_TEST_CASE_P( ChargeFilterConstructionTestWithParam, ::testing::Values( std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled, - kFastLocalBloom, false, false), + kFastLocalBloom, + FilterPartitioning::kUnpartitionedFilter, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kFastLocalBloom, false, false), + kFastLocalBloom, + FilterPartitioning::kUnpartitionedFilter, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kFastLocalBloom, false, true), + kFastLocalBloom, + FilterPartitioning::kUnpartitionedFilter, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kFastLocalBloom, true, false), + kFastLocalBloom, + FilterPartitioning::kCoupledPartitionedFilter, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kFastLocalBloom, true, true), + kFastLocalBloom, + FilterPartitioning::kCoupledPartitionedFilter, true), + std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, + kFastLocalBloom, + FilterPartitioning::kDecoupledPartitionedFilter, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kStandard128Ribbon, false, false), + kStandard128Ribbon, + FilterPartitioning::kUnpartitionedFilter, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kStandard128Ribbon, false, true), + kStandard128Ribbon, + FilterPartitioning::kUnpartitionedFilter, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kStandard128Ribbon, true, false), + kStandard128Ribbon, + FilterPartitioning::kCoupledPartitionedFilter, false), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, - kStandard128Ribbon, true, true), + kStandard128Ribbon, + FilterPartitioning::kCoupledPartitionedFilter, true), + std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, + kStandard128Ribbon, + FilterPartitioning::kDecoupledPartitionedFilter, true), std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom, - false, false))); + FilterPartitioning::kUnpartitionedFilter, false))); // TODO: Speed up this test, and reduce disk space usage (~700MB) // The current test inserts many keys (on the scale of dummy entry size) @@ -1160,7 +1251,6 @@ TEST_P(ChargeFilterConstructionTestWithParam, Basic) { bool charge_filter_construction = (ChargeFilterConstructMemory() == CacheEntryRoleOptions::Decision::kEnabled); std::string policy = GetFilterPolicy(); - bool partition_filters = PartitionFilters(); bool detect_filter_construct_corruption = table_options.detect_filter_construct_corruption; @@ -1247,7 +1337,7 @@ TEST_P(ChargeFilterConstructionTestWithParam, Basic) { * last longer since we release hash entries reservation later. 
* */ - if (!partition_filters) { + if (!PartitionFilters()) { EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) << "Filter construction cache charging should have only 1 peak in " "case: kFastLocalBloom + FullFilter"; @@ -1369,7 +1459,7 @@ TEST_P(ChargeFilterConstructionTestWithParam, Basic) { * = hash entries + banding + final filter * */ - if (!partition_filters) { + if (!PartitionFilters()) { ASSERT_GE( std::floor( 1.0 * predicted_final_filter_cache_res / @@ -1444,29 +1534,21 @@ TEST_P(ChargeFilterConstructionTestWithParam, Basic) { } class DBFilterConstructionCorruptionTestWithParam - : public DBTestBase, + : public DBBloomFilterTest, public testing::WithParamInterface< std::tuple> { + FilterPartitioning>> { public: - DBFilterConstructionCorruptionTestWithParam() - : DBTestBase("db_bloom_filter_tests", - /*env_do_fsync=*/true) {} - - BlockBasedTableOptions GetBlockBasedTableOptions() { - BlockBasedTableOptions table_options; - table_options.detect_filter_construct_corruption = std::get<0>(GetParam()); - table_options.filter_policy = Create(10, std::get<1>(GetParam())); - table_options.partition_filters = std::get<2>(GetParam()); - if (table_options.partition_filters) { - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + void SetInTableOptions(BlockBasedTableOptions* table_options) { + table_options->detect_filter_construct_corruption = std::get<0>(GetParam()); + table_options->filter_policy = Create(10, std::get<1>(GetParam())); + filter_partitioning_ = std::get<2>(GetParam()); + DBBloomFilterTest::SetInTableOptions(table_options); + if (PartitionFilters()) { // We set table_options.metadata_block_size small enough so we can // trigger filter partitioning with GetNumKey() amount of keys - table_options.metadata_block_size = 10; + table_options->metadata_block_size = 10; } - - return table_options; } // Return an appropriate amount of keys for testing @@ -1477,15 +1559,26 @@ class DBFilterConstructionCorruptionTestWithParam INSTANTIATE_TEST_CASE_P( DBFilterConstructionCorruptionTestWithParam, DBFilterConstructionCorruptionTestWithParam, - ::testing::Values(std::make_tuple(false, kFastLocalBloom, false), - std::make_tuple(true, kFastLocalBloom, false), - std::make_tuple(true, kFastLocalBloom, true), - std::make_tuple(true, kStandard128Ribbon, false), - std::make_tuple(true, kStandard128Ribbon, true))); + ::testing::Values( + std::make_tuple(false, kFastLocalBloom, + FilterPartitioning::kUnpartitionedFilter), + std::make_tuple(true, kFastLocalBloom, + FilterPartitioning::kUnpartitionedFilter), + std::make_tuple(true, kFastLocalBloom, + FilterPartitioning::kCoupledPartitionedFilter), + std::make_tuple(true, kFastLocalBloom, + FilterPartitioning::kDecoupledPartitionedFilter), + std::make_tuple(true, kStandard128Ribbon, + FilterPartitioning::kUnpartitionedFilter), + std::make_tuple(true, kStandard128Ribbon, + FilterPartitioning::kCoupledPartitionedFilter), + std::make_tuple(true, kStandard128Ribbon, + FilterPartitioning::kDecoupledPartitionedFilter))); TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { Options options = CurrentOptions(); - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + BlockBasedTableOptions table_options; + SetInTableOptions(&table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.create_if_missing = true; options.disable_auto_compactions = true; @@ -1572,7 +1665,8 @@ TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { 
TEST_P(DBFilterConstructionCorruptionTestWithParam,
       DynamicallyTurnOnAndOffDetectConstructCorruption) {
   Options options = CurrentOptions();
-  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  BlockBasedTableOptions table_options;
+  SetInTableOptions(&table_options);
   // We intend to turn on
   // table_options.detect_filter_construct_corruption dynamically
   // therefore we override this test parameter's value
@@ -2309,11 +2403,12 @@ static const std::string kPlainTable = "test_PlainTableBloom";
 class BloomStatsTestWithParam
     : public DBBloomFilterTest,
-      public testing::WithParamInterface<std::tuple<std::string, bool>> {
+      public testing::WithParamInterface<
+          std::tuple<std::string, FilterPartitioning>> {
  public:
   BloomStatsTestWithParam() {
     bfp_impl_ = std::get<0>(GetParam());
-    partition_filters_ = std::get<1>(GetParam());
+    filter_partitioning_ = std::get<1>(GetParam());
     options_.create_if_missing = true;
     options_.prefix_extractor.reset(
@@ -2321,13 +2416,13 @@ class BloomStatsTestWithParam
     options_.memtable_prefix_bloom_size_ratio =
         8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
     if (bfp_impl_ == kPlainTable) {
-      assert(!partition_filters_);  // not supported in plain table
+      assert(!PartitionFilters());  // not supported in plain table
       PlainTableOptions table_options;
       options_.table_factory.reset(NewPlainTableFactory(table_options));
     } else {
       BlockBasedTableOptions table_options;
-      if (partition_filters_) {
-        table_options.partition_filters = partition_filters_;
+      if (PartitionFilters()) {
+        table_options.partition_filters = PartitionFilters();
         table_options.index_type =
             BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
       }
@@ -2350,7 +2445,6 @@ class BloomStatsTestWithParam
   static void TearDownTestCase() {}

   std::string bfp_impl_;
-  bool partition_filters_;
   Options options_;
 };
@@ -2463,11 +2557,20 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
 INSTANTIATE_TEST_CASE_P(
     BloomStatsTestWithParam, BloomStatsTestWithParam,
-    ::testing::Values(std::make_tuple(kLegacyBloom, false),
-                      std::make_tuple(kLegacyBloom, true),
-                      std::make_tuple(kFastLocalBloom, false),
-                      std::make_tuple(kFastLocalBloom, true),
-                      std::make_tuple(kPlainTable, false)));
+    ::testing::Values(
+        std::make_tuple(kLegacyBloom, FilterPartitioning::kUnpartitionedFilter),
+        std::make_tuple(kLegacyBloom,
+                        FilterPartitioning::kCoupledPartitionedFilter),
+        std::make_tuple(kLegacyBloom,
+                        FilterPartitioning::kDecoupledPartitionedFilter),
+        std::make_tuple(kFastLocalBloom,
+                        FilterPartitioning::kUnpartitionedFilter),
+        std::make_tuple(kFastLocalBloom,
+                        FilterPartitioning::kCoupledPartitionedFilter),
+        std::make_tuple(kFastLocalBloom,
+                        FilterPartitioning::kDecoupledPartitionedFilter),
+        std::make_tuple(kPlainTable,
+                        FilterPartitioning::kUnpartitionedFilter)));

 namespace {
 void PrefixScanInit(DBBloomFilterTest* dbtest) {
diff --git a/db/db_follower_test.cc b/db/db_follower_test.cc
index febc5ae4a4..a0f35a46b6 100644
--- a/db/db_follower_test.cc
+++ b/db/db_follower_test.cc
@@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
       {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
       {"DBImpl::BackgroundCompaction:Start",
        "DBImplFollower::TryCatchupWithLeader:Begin2"},
-      {"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
+      {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
+       "Begin1",
       "DBImpl::BackgroundCompaction:BeforeCompaction"},
      {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
-      "VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:" + "Begin2"}, {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"}, }); SyncPoint::GetInstance()->EnableProcessing(); @@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) { SyncPoint::GetInstance()->LoadDependency({ {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"}, {"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"}, - {"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1", + {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:" + "Begin1", "Leader::Done"}, {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", - "VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"}, + "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:" + "Begin2"}, {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup:1"}, }); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 123d953725..392cbac41b 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -2475,7 +2476,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, RecordTick(stats_, MEMTABLE_HIT); } } - if (!done && !s.ok() && !s.IsMergeInProgress()) { + if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) { + assert(done); ReturnAndCleanupSuperVersion(cfd, sv); return s; } @@ -3141,10 +3143,11 @@ Status DBImpl::MultiGetImpl( StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); assert(sorted_keys); + assert(start_key + num_keys <= sorted_keys->size()); // Clear the timestamps for returning results so that we can distinguish // between tombstone or key that has never been written - for (auto* kctx : *sorted_keys) { - assert(kctx); + for (size_t i = start_key; i < start_key + num_keys; ++i) { + KeyContext* kctx = (*sorted_keys)[i]; if (kctx->timestamp) { kctx->timestamp->clear(); } @@ -5240,6 +5243,14 @@ Status DestroyDB(const std::string& dbname, const Options& options, Env* env = soptions.env; std::vector filenames; bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); + auto sfm = static_cast_with_check( + options.sst_file_manager.get()); + // Allocate a separate trash bucket to be used by all the to be deleted + // files, so we can later wait for this bucket to be empty before return. + std::optional bucket; + if (sfm) { + bucket = sfm->NewTrashBucket(); + } // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal @@ -5251,6 +5262,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, /*IODebugContext*=*/nullptr) .PermitUncheckedError(); + std::set paths_to_delete; FileLock* lock; const std::string lockname = LockFileName(dbname); Status result = env->LockFile(lockname, &lock); @@ -5267,10 +5279,9 @@ Status DestroyDB(const std::string& dbname, const Options& options, del = DestroyDB(path_to_delete, options); } else if (type == kTableFile || type == kWalFile || type == kBlobFile) { - del = DeleteDBFile( - &soptions, path_to_delete, dbname, - /*force_bg=*/false, - /*force_fg=*/(type == kWalFile) ? 
!wal_in_db_path : false); + del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, + /*force_fg=*/false, bucket); } else { del = env->DeleteFile(path_to_delete); } @@ -5279,6 +5290,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, } } } + paths_to_delete.insert(dbname); std::set paths; for (const DbPath& db_path : options.db_paths) { @@ -5300,18 +5312,19 @@ Status DestroyDB(const std::string& dbname, const Options& options, (type == kTableFile || type == kBlobFile)) { // Lock file will be deleted at end std::string file_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, file_path, dbname, - /*force_bg=*/false, /*force_fg=*/false); + Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname, + /*force_bg=*/false, + /*force_fg=*/false, bucket); if (!del.ok() && result.ok()) { result = del; } } } - // TODO: Should we return an error if we cannot delete the directory? - env->DeleteDir(path).PermitUncheckedError(); } } + paths_to_delete.merge(paths); + std::vector walDirFiles; std::string archivedir = ArchivalDirectory(dbname); bool wal_dir_exists = false; @@ -5335,46 +5348,49 @@ Status DestroyDB(const std::string& dbname, const Options& options, // Delete archival files. for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kWalFile) { - Status del = - DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, - /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + Status del = DeleteUnaccountedDBFile( + &soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket); if (!del.ok() && result.ok()) { result = del; } } } - // Ignore error in case dir contains other files - env->DeleteDir(archivedir).PermitUncheckedError(); + paths_to_delete.insert(archivedir); } // Delete log files in the WAL dir if (wal_dir_exists) { for (const auto& file : walDirFiles) { if (ParseFileName(file, &number, &type) && type == kWalFile) { - Status del = - DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), - soptions.wal_dir, /*force_bg=*/false, - /*force_fg=*/!wal_in_db_path); + Status del = DeleteUnaccountedDBFile( + &soptions, LogFileName(soptions.wal_dir, number), + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path, bucket); if (!del.ok() && result.ok()) { result = del; } } } - // Ignore error in case dir contains other files - env->DeleteDir(soptions.wal_dir).PermitUncheckedError(); + paths_to_delete.insert(soptions.wal_dir); } // Ignore error since state is already gone env->UnlockFile(lock).PermitUncheckedError(); env->DeleteFile(lockname).PermitUncheckedError(); + // Make sure trash files are all cleared before return. + if (sfm && bucket.has_value()) { + sfm->WaitForEmptyTrashBucket(bucket.value()); + } // sst_file_manager holds a ref to the logger. Make sure the logger is // gone before trying to remove the directory. 
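
The bucket-drain step above is what turns `DestroyDB` into a blocking, rate-limited operation whenever an `SstFileManager` is configured. A usage sketch under that assumption (the 8 MB/s rate is illustrative):

```cpp
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

// Sketch: destroy a DB with file deletions throttled through the manager's
// trash mechanism; DestroyDB returns only after its trash bucket drains.
rocksdb::Status SlowDestroy(const std::string& dbname) {
  rocksdb::Options options;
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  options.sst_file_manager->SetDeleteRateBytesPerSecond(8 << 20);  // 8 MB/s
  return rocksdb::DestroyDB(dbname, options);
}
```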
soptions.sst_file_manager.reset(); // Ignore error in case dir contains other files - env->DeleteDir(dbname).PermitUncheckedError(); - ; + for (const auto& path_to_delete : paths_to_delete) { + env->DeleteDir(path_to_delete).PermitUncheckedError(); + } } return result; } @@ -5820,11 +5836,6 @@ Status DBImpl::IngestExternalFiles( "write_global_seqno is deprecated and does not work with " "allow_db_generated_files."); } - if (ingest_opts.move_files) { - return Status::NotSupported( - "Options move_files and allow_db_generated_files are not " - "compatible."); - } } } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index fd64f0cd66..e3eb3253e6 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1226,6 +1226,8 @@ class DBImpl : public DB { return logs_.back().number; } + void TEST_DeleteObsoleteFiles(); + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { return files_grabbed_for_purge_; } diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index bb28b7f632..5ebe921cfe 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -314,6 +314,11 @@ const autovector& DBImpl::TEST_GetFilesToQuarantine() const { return error_handler_.GetFilesToQuarantine(); } +void DBImpl::TEST_DeleteObsoleteFiles() { + InstrumentedMutexLock l(&mutex_); + DeleteObsoleteFiles(); +} + size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { InstrumentedMutexLock l(&const_cast(this)->stats_history_mutex_); return EstimateInMemoryStatsHistorySize(); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index dd4bf411cd..0db7293682 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -970,7 +970,9 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, } // Persist it to IDENTITY file if allowed if (!read_only) { - s = SetIdentityFile(write_options, env_, dbname_, db_id_); + s = SetIdentityFile(write_options, env_, dbname_, + immutable_db_options_.metadata_write_temperature, + db_id_); } return s; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index aac85c9aa2..a58a142d71 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -295,7 +295,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; const WriteOptions write_options(Env::IOActivity::kDBOpen); - Status s = SetIdentityFile(write_options, env_, dbname_); + Status s = SetIdentityFile(write_options, env_, dbname_, + immutable_db_options_.metadata_write_temperature); if (!s.ok()) { return s; } @@ -319,6 +320,12 @@ Status DBImpl::NewDB(std::vector* new_filenames) { } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); + // DB option takes precedence when not kUnknown + if (immutable_db_options_.metadata_write_temperature != + Temperature::kUnknown) { + file_options.temperature = + immutable_db_options_.metadata_write_temperature; + } s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; @@ -344,6 +351,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. 
s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, + immutable_db_options_.metadata_write_temperature, directories_.GetDbDir()); if (new_filenames) { new_filenames->emplace_back( @@ -530,6 +538,12 @@ Status DBImpl::Recover( /*no_error_if_files_missing=*/false, is_retry, &desc_status); desc_status.PermitUncheckedError(); + if (is_retry) { + RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT); + if (desc_status.ok()) { + RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); + } + } if (can_retry) { // If we're opening for the first time and the failure is likely due to // a corrupt MANIFEST file (could result in either the log::Reader @@ -1930,6 +1944,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, BuildDBOptions(immutable_db_options_, mutable_db_options_); FileOptions opt_file_options = fs_->OptimizeForLogWrite(file_options_, db_options); + // DB option takes precedence when not kUnknown + if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) { + opt_file_options.temperature = immutable_db_options_.wal_write_temperature; + } std::string wal_dir = immutable_db_options_.GetWalDir(); std::string log_fname = LogFileName(wal_dir, log_file_num); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index ec5283ad94..d6899502ae 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -969,21 +969,17 @@ Status DBImpl::WriteImplWALOnly( assert(w.state == WriteThread::STATE_GROUP_LEADER); if (publish_last_seq == kDoPublishLastSeq) { - Status status; - // Currently we only use kDoPublishLastSeq in unordered_write assert(immutable_db_options_.unordered_write); - WriteContext write_context; - if (error_handler_.IsDBStopped()) { - status = error_handler_.GetBGError(); - } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them // without paying the cost of obtaining the mutex. 
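
The `is_retry` accounting added in `DBImpl::Recover` above is what feeds the corruption-retry tests below. Reading the new tickers from an application is a one-liner each; a sketch, assuming statistics are enabled on the DB:

```cpp
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Sketch: observing the new tickers after reads that hit checksum
// mismatches and were retried.
rocksdb::Options options;
options.statistics = rocksdb::CreateDBStatistics();
// ... open the DB with these options and perform reads ...
uint64_t retried = options.statistics->getTickerCount(
    rocksdb::FILE_READ_CORRUPTION_RETRY_COUNT);
uint64_t recovered = options.statistics->getTickerCount(
    rocksdb::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
// recovered <= retried; the difference is reads that stayed corrupt.
```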
- if (status.ok()) { - LogContext log_context; - status = PreprocessWrite(write_options, &log_context, &write_context); - WriteStatusCheckOnLocked(status); - } + LogContext log_context; + WriteContext write_context; + Status status = + PreprocessWrite(write_options, &log_context, &write_context); + WriteStatusCheckOnLocked(status); + if (!status.ok()) { WriteThread::WriteGroup write_group; write_thread->EnterAsBatchGroupLeader(&w, &write_group); diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index c570c6c2c7..9826ab6680 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -705,6 +705,7 @@ class DBIOCorruptionTest DBIOCorruptionTest() : DBIOFailureTest() { BlockBasedTableOptions bbto; options_ = CurrentOptions(); + options_.statistics = CreateDBStatistics(); base_env_ = env_; EXPECT_NE(base_env_, nullptr); @@ -727,6 +728,8 @@ class DBIOCorruptionTest Status ReopenDB() { return TryReopen(options_); } + Statistics* stats() { return options_.statistics.get(); } + protected: std::unique_ptr env_guard_; std::shared_ptr fs_; @@ -749,8 +752,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) { if (std::get<2>(GetParam())) { ASSERT_OK(s); ASSERT_EQ(val, "val1"); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); } else { ASSERT_TRUE(s.IsCorruption()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } } @@ -773,8 +780,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) { } if (std::get<2>(GetParam())) { ASSERT_OK(iter->status()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); } else { ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } delete iter; } @@ -799,9 +810,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) { if (std::get<2>(GetParam())) { ASSERT_EQ(values[0].ToString(), "val1"); ASSERT_EQ(values[1].ToString(), "val2"); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); } else { ASSERT_TRUE(statuses[0].IsCorruption()); ASSERT_TRUE(statuses[1].IsCorruption()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } } @@ -818,6 +833,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) { Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); if (std::get<2>(GetParam())) { ASSERT_OK(s); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); std::string val; ReadOptions ro; @@ -826,6 +844,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) { ASSERT_EQ(val, "val1"); } else { ASSERT_TRUE(s.IsCorruption()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } } @@ -838,6 +857,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) { Status s = Flush(); if (std::get<2>(GetParam())) { ASSERT_OK(s); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); std::string val; ReadOptions ro; @@ -846,6 +868,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) { ASSERT_EQ(val, "val1"); } else { ASSERT_NOK(s); + 
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } } @@ -862,8 +885,12 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) { if (std::get<2>(GetParam())) { ASSERT_OK(ReopenDB()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT), + 1); } else { ASSERT_EQ(ReopenDB(), Status::Corruption()); + ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0); } SyncPoint::GetInstance()->DisableProcessing(); } diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc index 3b85ef50eb..6eea6e5b4b 100644 --- a/db/db_kv_checksum_test.cc +++ b/db/db_kv_checksum_test.cc @@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest { DbMemtableKVChecksumTest() : DbKvChecksumTest() {} protected: + const size_t kValueLenOffset = 12; // Indices in the memtable entry that we will not corrupt. // For memtable entry format, see comments in MemTable::Add(). // We do not corrupt key length and value length fields in this test // case since it causes segfault and ASAN will complain. // For this test case, key and value are all of length 3, so // key length field is at index 0 and value length field is at index 12. - const std::set index_not_to_corrupt{0, 12}; + const std::set index_not_to_corrupt{0, kValueLenOffset}; void SkipNotToCorruptEntry() { if (index_not_to_corrupt.find(corrupt_byte_offset_) != @@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) { buf[corrupt_byte_offset_] += corrupt_byte_addend_; ++corrupt_byte_offset_; }); + // Corrupt value only so that MultiGet below can find the key. + corrupt_byte_offset_ = kValueLenOffset + 1; SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); options.memtable_protection_bytes_per_key = @@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) { options.merge_operator = MergeOperators::CreateStringAppendOperator(); } + std::string key = "key"; SkipNotToCorruptEntry(); while (MoreBytesToCorrupt()) { Reopen(options); ASSERT_OK(ExecuteWrite(nullptr)); std::string val; - ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption()); + ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption()); + std::vector vals = {val}; + std::vector statuses = db_->MultiGet( + ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr); + ASSERT_TRUE(statuses[0].IsCorruption()); Destroy(options); SkipNotToCorruptEntry(); } diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 5c8b6db2ba..3f7b029572 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) { } } +TEST_F(DBMemTableTest, IntegrityChecks) { + // We insert keys key000000, key000001 and key000002 into skiplist at fixed + // height 1 (smallest height). Then we corrupt the second key to aey000001 to + // make it smaller. With `paranoid_memory_checks` set to true, if the + // skip list sees key000000 and then aey000001, then it will report out of + // order keys with corruption status. With `paranoid_memory_checks` set + // to false, read/scan may return wrong results. 
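
A configuration sketch for the option this test exercises: `paranoid_memory_checks` is a mutable column-family option, and the status-checking scan below is how the resulting `Corruption` surfaces to callers (the `SetOptions` string key is assumed to follow the usual option-name convention):

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"

// Sketch: a full scan that distinguishes "end of data" from corruption.
void ParanoidScan(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  if (!it->status().ok()) {
    // With paranoid_memory_checks, an out-of-order memtable key is
    // reported here as Status::Corruption() instead of silently
    // producing wrong results.
  }
}

// At open time:
//   rocksdb::Options options;
//   options.paranoid_memory_checks = true;
// Or at runtime (assumed SetOptions key, matching the option's name):
//   db->SetOptions({{"paranoid_memory_checks", "true"}});
```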
+  for (bool allow_data_in_error : {false, true}) {
+    Options options = CurrentOptions();
+    options.allow_data_in_errors = allow_data_in_error;
+    options.paranoid_memory_checks = true;
+    DestroyAndReopen(options);
+    SyncPoint::GetInstance()->SetCallBack(
+        "InlineSkipList::RandomHeight::height", [](void* h) {
+          auto height_ptr = static_cast<int*>(h);
+          *height_ptr = 1;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_OK(Put(Key(0), "val0"));
+    ASSERT_OK(Put(Key(2), "val2"));
+    // p will point to the buffer for encoded key000001
+    char* p = nullptr;
+    SyncPoint::GetInstance()->SetCallBack(
+        "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
+          p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
+        });
+    ASSERT_OK(Put(Key(1), "val1"));
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    ASSERT_TRUE(p);
+    // Offset 0 is key size, key bytes start at offset 1.
+    // "key000001 -> aey000001"
+    p[1] = 'a';
+
+    ReadOptions rops;
+    std::string val;
+    Status s = db_->Get(rops, Key(1), &val);
+    ASSERT_TRUE(s.IsCorruption());
+    std::string key0 = Slice(Key(0)).ToString(true);
+    ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+    // Without `paranoid_memory_checks`, NotFound will be returned.
+    // This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
+    // If we remove the assertion, this passes.
+    // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
+
+    std::vector<std::string> vals;
+    std::vector<Status> statuses = db_->MultiGet(
+        rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
+    ASSERT_TRUE(statuses[0].IsCorruption());
+    ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+    ASSERT_OK(iter->status());
+    iter->Seek(Key(1));
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    iter->Seek(Key(0));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    // Iterating through the skip list at height 1 should catch out-of-order keys
+    iter->Next();
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+    ASSERT_FALSE(iter->Valid());
+
+    iter->SeekForPrev(Key(2));
+    ASSERT_TRUE(iter->status().IsCorruption());
+    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+              allow_data_in_error);
+
+    // Internally DB Iter will iterate backwards (call Prev()) after
+    // SeekToLast() to find the correct internal key with the last user key.
+    // Prev() will do integrity checks and catch corruption.
+ iter->SeekToLast(); + ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos, + allow_data_in_error); + ASSERT_FALSE(iter->Valid()); + } +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 57cde40734..5499d58f5f 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -507,6 +507,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { ASSERT_EQ(files_deleted, 0); ASSERT_EQ(files_scheduled_to_delete, 0); Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + ++files_scheduled_to_delete; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_deleted++; + } + }); ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_EQ(files_deleted, blob_files.size()); ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); @@ -649,6 +666,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { } Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + ++files_scheduled_to_delete; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_deleted++; + } + }); ASSERT_OK(DestroyDB(dbname_, options)); sfm->WaitForEmptyTrash(); ASSERT_EQ(files_deleted, 5); @@ -883,8 +917,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { if (v == 'c') { - // Maximize the change that the last log file will be preserved in trash - // before restarting the DB. + // Maximize the chance that the last log file will be preserved in trash + // before restarting the DB. (Enable slow deletion but at a very slow + // deletion rate) // We have to set this on the 2nd to last file for it to delay deletion // on the last file. 
(Quirk of DeleteScheduler::BackgroundEmptyTrash())
      options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
@@ -1902,6 +1937,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
   ASSERT_EQ(files_deleted, 1);
   Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
+        assert(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::OnDeleteFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          files_deleted++;
+        }
+      });
   ASSERT_OK(DestroyDB(dbname_, options));
   ASSERT_EQ(files_scheduled_to_delete, 4);
diff --git a/db/db_test2.cc b/db/db_test2.cc
index e6a3adf9b8..92211ad42d 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "db/db_test_util.h"
@@ -26,6 +27,7 @@
 #include "rocksdb/utilities/replayer.h"
 #include "rocksdb/wal_filter.h"
 #include "test_util/testutil.h"
+#include "util/defer.h"
 #include "util/random.h"
 #include "utilities/fault_injection_env.h"
@@ -6544,6 +6546,235 @@ TEST_P(RenameCurrentTest, Compaction) {
   ASSERT_EQ("d_value", Get("d"));
 }

+TEST_F(DBTest2, VariousFileTemperatures) {
+  constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;
+
+  struct MyTestFS : public FileTemperatureTestFS {
+    explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
+        : FileTemperatureTestFS(fs) {
+      Reset();
+    }
+
+    IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override {
+      IOStatus ios =
+          FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
+      if (ios.ok()) {
+        uint64_t number;
+        FileType type;
+        if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
+          if (type == kTableFile) {
+            // Not checked here
+          } else if (type == kWalFile) {
+            if (opts.temperature != expected_wal_temperature) {
+              std::cerr << "Attempt to open " << fname << " with temperature "
+                        << temperature_to_string[opts.temperature]
+                        << " rather than "
+                        << temperature_to_string[expected_wal_temperature]
+                        << std::endl;
+              assert(false);
+            }
+          } else if (type == kDescriptorFile) {
+            if (opts.temperature != expected_manifest_temperature) {
+              std::cerr << "Attempt to open " << fname << " with temperature "
+                        << temperature_to_string[opts.temperature]
+                        << " rather than "
+                        << temperature_to_string[expected_manifest_temperature]
+                        << std::endl;
+              assert(false);
+            }
+          } else if (opts.temperature != expected_other_metadata_temperature) {
+            std::cerr << "Attempt to open " << fname << " with temperature "
+                      << temperature_to_string[opts.temperature]
+                      << " rather than "
+                      << temperature_to_string
+                             [expected_other_metadata_temperature]
+                      << std::endl;
+            assert(false);
+          }
+          UpdateCount(type, 1);
+        }
+      }
+      return ios;
+    }
+
+    IOStatus RenameFile(const std::string& src, const std::string& dst,
+                        const IOOptions& options,
+                        IODebugContext* dbg) override {
+      IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
+      if (ios.ok()) {
+        uint64_t number;
+        FileType src_type;
+        FileType dst_type;
+        assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
+        assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
+
+        UpdateCount(src_type, -1);
+        UpdateCount(dst_type, 1);
+      }
+      return ios;
+    }
+
+    void UpdateCount(FileType type, int delta) {
size_t i = static_cast(type); + assert(i < kNumberFileTypes); + counts[i].FetchAddRelaxed(delta); + } + + std::map PopCounts() { + std::map ret; + for (size_t i = 0; i < kNumberFileTypes; ++i) { + int c = counts[i].ExchangeRelaxed(0); + if (c > 0) { + ret[static_cast(i)] = c; + } + } + return ret; + } + + FileOptions OptimizeForLogWrite( + const FileOptions& file_options, + const DBOptions& /*db_options*/) const override { + FileOptions opts = file_options; + if (optimize_wal_temperature != Temperature::kUnknown) { + opts.temperature = optimize_wal_temperature; + } + return opts; + } + + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + FileOptions opts = file_options; + if (optimize_manifest_temperature != Temperature::kUnknown) { + opts.temperature = optimize_manifest_temperature; + } + return opts; + } + + void Reset() { + optimize_manifest_temperature = Temperature::kUnknown; + optimize_wal_temperature = Temperature::kUnknown; + expected_manifest_temperature = Temperature::kUnknown; + expected_other_metadata_temperature = Temperature::kUnknown; + expected_wal_temperature = Temperature::kUnknown; + for (auto& c : counts) { + c.StoreRelaxed(0); + } + } + + Temperature optimize_manifest_temperature; + Temperature optimize_wal_temperature; + Temperature expected_manifest_temperature; + Temperature expected_other_metadata_temperature; + Temperature expected_wal_temperature; + std::array, kNumberFileTypes> counts; + }; + + // We don't have enough non-unknown temps to confidently distinguish that + // a specific setting caused a specific outcome, in a single run. This is a + // reasonable work-around without blowing up test time. Only returns + // non-unknown temperatures. + auto RandomTemp = [] { + static std::vector temps = { + Temperature::kHot, Temperature::kWarm, Temperature::kCold}; + return temps[Random::GetTLSInstance()->Uniform( + static_cast(temps.size()))]; + }; + + auto test_fs = std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); + for (bool use_optimize : {false, true}) { + std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl; + for (bool use_temp_options : {false, true}) { + std::cerr << "use_temp_options: " << std::to_string(use_temp_options) + << std::endl; + + Options options = CurrentOptions(); + // Currently require for last level temperature + options.compaction_style = kCompactionStyleUniversal; + options.env = env.get(); + test_fs->Reset(); + if (use_optimize) { + test_fs->optimize_manifest_temperature = RandomTemp(); + test_fs->expected_manifest_temperature = + test_fs->optimize_manifest_temperature; + test_fs->optimize_wal_temperature = RandomTemp(); + test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature; + } + if (use_temp_options) { + options.metadata_write_temperature = RandomTemp(); + test_fs->expected_manifest_temperature = + options.metadata_write_temperature; + test_fs->expected_other_metadata_temperature = + options.metadata_write_temperature; + options.wal_write_temperature = RandomTemp(); + test_fs->expected_wal_temperature = options.wal_write_temperature; + options.last_level_temperature = RandomTemp(); + options.default_write_temperature = RandomTemp(); + } + + DestroyAndReopen(options); + Defer closer([&] { Close(); }); + + using FTC = std::map; + // Files on DB startup + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, + {kDescriptorFile, 2}, + {kCurrentFile, 2}, + {kIdentityFile, 1}, + {kOptionsFile, 1}})); + + // 
Temperature count map + using TCM = std::map; + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({})); + + ASSERT_OK(Put("foo", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "2")); + ASSERT_OK(Put("bar", "2")); + ASSERT_OK(Flush()); + + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), + TCM({{options.default_write_temperature, 2}})); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); + + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), + TCM({{options.last_level_temperature, 1}})); + + ASSERT_OK(Put("foo", "3")); + ASSERT_OK(Put("bar", "3")); + ASSERT_OK(Flush()); + + // Just in memtable/WAL + ASSERT_OK(Put("dog", "3")); + + { + TCM expected; + expected[options.default_write_temperature] += 1; + expected[options.last_level_temperature] += 1; + ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected); + } + + // New files during operation + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}})); + + Reopen(options); + + // New files during re-open/recovery + ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, + {kTableFile, 1}, + {kDescriptorFile, 1}, + {kCurrentFile, 1}, + {kOptionsFile, 1}})); + + Destroy(options); + } + } +} + TEST_F(DBTest2, LastLevelTemperature) { class TestListener : public EventListener { public: diff --git a/db/db_test_util.cc b/db/db_test_util.cc index e3be672efb..d444bc5193 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -366,6 +366,11 @@ Options DBTestBase::GetOptions( table_options.block_cache = NewLRUCache(/* too small */ 1); } + // Test anticipated new default as much as reasonably possible (and remove + // this code when obsolete) + assert(!table_options.decouple_partitioned_filters); + table_options.decouple_partitioned_filters = true; + bool can_allow_mmap = IsMemoryMappedAccessSupported(); switch (option_config) { case kHashSkipList: diff --git a/db/db_test_util.h b/db/db_test_util.h index 47b1667eac..36a4615344 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper { return count; } + std::map CountCurrentSstFilesByTemp() { + MutexLock lock(&mu_); + std::map ret; + for (const auto& e : current_sst_file_temperatures_) { + ret[e.second]++; + } + return ret; + } + void OverrideSstFileTemperature(uint64_t number, Temperature temp) { MutexLock lock(&mu_); current_sst_file_temperatures_[number] = temp; @@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper { requested_sst_file_temperatures_; std::map current_sst_file_temperatures_; - std::string GetFileName(const std::string& fname) { + static std::string GetFileName(const std::string& fname) { auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); // workaround only for Windows that the file path could contain both Windows // FilePathSeparator and '/' diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index b6a716356d..c9052cc9fe 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) { options.num_levels = num_levels_; options.write_buffer_size = 105 << 10; // 105KB options.arena_block_size = 4 << 10; - options.target_file_size_base = 32 << 10; // 32KB // trigger compaction if there are >= 4 files options.level0_file_num_compaction_trigger = 4; KeepFilterFactory* filter = new KeepFilterFactory(true); diff --git 
a/db/db_wal_test.cc b/db/db_wal_test.cc index 6ffa342bf1..51c25746d3 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) { ASSERT_OK(dbfull()->SyncWAL()); } +TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) { + Options options = CurrentOptions(); + options.max_write_buffer_number = 5; + options.track_and_verify_wals_in_manifest = true; + options.max_bgerror_resume_count = 0; // manual resume + options.recycle_log_file_num = 3; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + // Disable truncating recycled WALs to new size in posix env + // (approximating a crash) + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(static_cast(arg)) = 0; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Re-open with desired options + DestroyAndReopen(options); + Defer closer([this]() { Close(); }); + + // Ensure WAL recycling wasn't sanitized away + ASSERT_EQ(db_->GetOptions().recycle_log_file_num, + options.recycle_log_file_num); + + // Prepare external files for later ingestion + std::string sst_files_dir = dbname_ + "/sst_files/"; + ASSERT_OK(DestroyDir(env_, sst_files_dir)); + ASSERT_OK(env_->CreateDir(sst_files_dir)); + std::string external_file1 = sst_files_dir + "file1.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(external_file1)); + ASSERT_OK(sst_file_writer.Put("external1", "ex1")); + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + } + std::string external_file2 = sst_files_dir + "file2.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(external_file2)); + ASSERT_OK(sst_file_writer.Put("external2", "ex2")); + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + } + + // Populate some WALs to be recycled such that there will be extra data + // from an old incarnation of the WAL on recovery + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500))); + ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500))); + ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(db_->ContinueBackgroundWork()); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500))); + ASSERT_OK(Flush()); + + // Verify expected log files (still there for recycling) + std::vector files; + int log_count = 0; + ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files)); + for (const auto& f : files) { + if (EndsWith(f.name, ".log")) { + EXPECT_GT(f.size_bytes, 500); + ++log_count; + } + } + EXPECT_EQ(log_count, 3); + + // (Re-used recipe) Generate two inactive WALs and one active WAL, with a + // gap in sequence numbers to interfere with recovery + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + // Need a gap in sequence numbers, so e.g. 
ingest external file + // with an open snapshot + { + ManagedSnapshot snapshot(db_); + ASSERT_OK( + db_->IngestExternalFile({external_file1}, IngestExternalFileOptions())); + } + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(db_->SyncWAL()); + // Need an SST file that is logically after that WAL, so that dropping WAL + // data is not a valid point in time. + { + ManagedSnapshot snapshot(db_); + ASSERT_OK( + db_->IngestExternalFile({external_file2}, IngestExternalFileOptions())); + } + + // Approximate a crash, with respect to recycled WAL data extending past + // the end of the current WAL data (see SyncPoint callback above) + Close(); + + // Verify recycled log files haven't been truncated + files.clear(); + log_count = 0; + ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files)); + for (const auto& f : files) { + if (EndsWith(f.name, ".log")) { + EXPECT_GT(f.size_bytes, 500); + ++log_count; + } + } + EXPECT_EQ(log_count, 3); + + // Verify no data loss after reopen. + Reopen(options); + EXPECT_EQ("val1", Get("key1")); + EXPECT_EQ("val2", Get("key2")); // Passes because of adjacent seqnos + EXPECT_EQ("ex1", Get("external1")); + EXPECT_EQ("val3", Get("key3")); // <- ONLY FAILURE! (Not a point in time) + EXPECT_EQ("ex2", Get("external2")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(DBWALTest, SyncWalPartialFailure) { class MyTestFileSystem : public FileSystemWrapper { public: @@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) { // * one inactive WAL, not synced, and // * one active WAL, not synced // with a single thread, to exercise as much logic as we reasonably can. - ASSERT_OK(static_cast_with_check(db_)->PauseBackgroundWork()); + ASSERT_OK(db_->PauseBackgroundWork()); ASSERT_OK(Put("key1", "val1")); ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); ASSERT_OK(db_->SyncWAL()); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 2d5b08832a..acb1c05c1d 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) { Close(); } +TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) { + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.avoid_flush_during_shutdown = true; + options.comparator = &test_cmp; + DestroyAndReopen(options); + + Options options1 = CurrentOptions(); + options1.env = env_; + options1.comparator = &test_cmp; + ColumnFamilyHandle* handle = nullptr; + Status s = db_->CreateColumnFamily(options1, "data", &handle); + ASSERT_OK(s); + + std::string ts = Timestamp(1, 0); + WriteBatch wb(0, 0, 0, kTimestampSize); + ASSERT_OK(wb.Put("a", "value")); + ASSERT_OK(wb.Put(handle, "a", "value")); + const auto ts_sz_func = [kTimestampSize](uint32_t /*cf_id*/) { + return kTimestampSize; + }; + ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + + int num_keys = 2; + std::vector keys; + std::vector expected_values; + for (int i = 0; i < num_keys; i++) { + keys.push_back("a"); + expected_values.push_back("value"); + } + std::vector handles; + handles.push_back(db_->DefaultColumnFamily()); + handles.push_back(handle); + + { + Slice read_ts_slice(ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + + std::vector values; + values.resize(num_keys); + 
std::vector<Status> statuses; + statuses.resize(num_keys); + std::vector<std::string> timestamps; + timestamps.resize(num_keys); + + db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(), + values.data(), timestamps.data(), statuses.data()); + + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(expected_values[i], values[i].ToString()); + ASSERT_EQ(ts, timestamps[i]); + } + } + + delete handle; + Close(); +} + TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { Options options = CurrentOptions(); options.env = env_; diff --git a/db/dbformat.h b/db/dbformat.h index bd0c8f03de..3f87157804 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -330,17 +330,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) { // output : inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, size_t ts_sz) { - Slice ret = internal_key; - ret.remove_suffix(kNumInternalBytes + ts_sz); - return ret; + assert(internal_key.size() >= kNumInternalBytes + ts_sz); + return Slice(internal_key.data(), + internal_key.size() - (kNumInternalBytes + ts_sz)); } // input [user key]: // output: inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { - Slice ret = user_key; - ret.remove_suffix(ts_sz); - return ret; + assert(user_key.size() >= ts_sz); + return Slice(user_key.data(), user_key.size() - ts_sz); } // input [user key]: diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 514616385c..fa9c5e153c 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "comparator" << table_properties.comparator_name << "user_defined_timestamps_persisted" << table_properties.user_defined_timestamps_persisted + << "key_largest_seqno" << table_properties.key_largest_seqno << "merge_operator" << table_properties.merge_operator_name << "prefix_extractor_name" << table_properties.prefix_extractor_name << "property_collectors" diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 6f733af169..750c9641a5 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -114,7 +114,6 @@ Status ExternalSstFileIngestionJob::Prepare( const std::string path_inside_db = TableFileName( cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); if (ingestion_options_.move_files) { - assert(!ingestion_options_.allow_db_generated_files); status = fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); if (status.ok()) { @@ -627,7 +626,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { DeleteInternalFiles(); consumed_seqno_count_ = 0; files_overlap_ = false; - } else if (status.ok() && ingestion_options_.move_files) { + } else if (status.ok() && ingestion_options_.move_files && + !ingestion_options_.allow_db_generated_files) { // The files were moved and added successfully, remove original file links for (IngestedFileInfo& f : files_to_ingest_) { Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); @@ -914,9 +914,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } else if (!iter->status().ok()) { return iter->status(); } - if (ingestion_options_.allow_db_generated_files) { - // Verify that all keys have seqno zero. - // TODO: store largest seqno in table property and validate it instead.
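The hunk in progress here replaces the removed per-key scan with a constant-time check of the new `key_largest_seqno` table property (the replacement logic follows just below). As an illustration only, not code from this PR, a standalone pre-ingestion check could be built on the public `SstFileReader`, assuming `key_largest_seqno` is exposed on the public `TableProperties` struct as the `event_helpers.cc` hunk above suggests:

```cpp
#include <cstdint>
#include <memory>
#include <string>

#include <rocksdb/options.h>
#include <rocksdb/sst_file_reader.h>

// Sketch: reject an external SST whose keys carry non-zero sequence numbers,
// using the table property instead of scanning every key. UINT64_MAX marks
// files written before the property existed; those still need the full scan.
rocksdb::Status CheckExternalFileSeqnos(const rocksdb::Options& options,
                                        const std::string& path) {
  rocksdb::SstFileReader reader(options);
  rocksdb::Status s = reader.Open(path);
  if (!s.ok()) {
    return s;
  }
  std::shared_ptr<const rocksdb::TableProperties> props =
      reader.GetTableProperties();
  const uint64_t largest_seqno = props->key_largest_seqno;
  if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
    return rocksdb::Status::Corruption(
        "External file has non zero largest sequence number " +
        std::to_string(largest_seqno));
  }
  // Unknown (UINT64_MAX): caller should fall back to the per-key scan.
  return rocksdb::Status::OK();
}
```

This mirrors the design choice in the hunk: the property makes the common case O(1), while the `UINT64_MAX` sentinel keeps files from older RocksDB versions ingestible through the slower verification path.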
+ SequenceNumber largest_seqno = + table_reader.get()->GetTableProperties()->key_largest_seqno; + // UINT64_MAX means unknown and the file was generated before the table + // property `key_largest_seqno` was introduced. + if (largest_seqno != UINT64_MAX && largest_seqno > 0) { + return Status::Corruption( + "External file has non zero largest sequence number " + + std::to_string(largest_seqno)); + } + if (ingestion_options_.allow_db_generated_files && + largest_seqno == UINT64_MAX) { + // Need to verify that all keys have seqno zero. for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Status pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index ae32343144..3a15c8ef10 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -674,10 +674,8 @@ class SstFileWriterCollector : public TablePropertiesCollector { Status Finish(UserCollectedProperties* properties) override { std::string count = std::to_string(count_); - *properties = UserCollectedProperties{ - {prefix_ + "_SstFileWriterCollector", "YES"}, - {prefix_ + "_Count", count}, - }; + properties->insert({prefix_ + "_SstFileWriterCollector", "YES"}); + properties->insert({prefix_ + "_Count", count}); return Status::OK(); } @@ -3727,13 +3725,14 @@ INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest, std::make_tuple(true, true), std::make_tuple(false, false))); -class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase, - public ::testing::WithParamInterface<bool> { +class IngestDBGeneratedFileTest + : public ExternalSSTFileTestBase, + public ::testing::WithParamInterface<std::tuple<bool, bool>> { public: IngestDBGeneratedFileTest() { ingest_opts.allow_db_generated_files = true; - ingest_opts.move_files = false; - ingest_opts.verify_checksums_before_ingest = GetParam(); + ingest_opts.move_files = std::get<0>(GetParam()); + ingest_opts.verify_checksums_before_ingest = std::get<1>(GetParam()); ingest_opts.snapshot_consistency = false; } @@ -3742,9 +3741,16 @@ class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase, }; INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest, - testing::Bool()); + testing::Combine(testing::Bool(), testing::Bool())); TEST_P(IngestDBGeneratedFileTest, FailureCase) { + if (encrypted_env_ && ingest_opts.move_files) { + // FIXME: should fail ingestion or support this combination. + ROCKSDB_GTEST_SKIP( + "Encrypted env and move_files do not work together, as we reopen the " + "file after linking it which appends an extra encryption prefix."); + return; + } // Ingesting overlapping data should always fail. do { SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); @@ -3778,6 +3784,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) { live_meta[0].relative_filename); // Ingesting a file whose boundary key has non-zero seqno. Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts); + // This error msg is from checking seqno of boundary keys. ASSERT_TRUE( s.ToString().find("External file has non zero sequence number") != std::string::npos); @@ -3824,10 +3831,9 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) { live_meta[0].directory + "/" + live_meta[0].relative_filename; s = db_->IngestExternalFile(to_ingest_files, ingest_opts); ASSERT_NOK(s); - ASSERT_TRUE( - s.ToString().find( - "External file has a key with non zero sequence number") != - std::string::npos); + // This error msg is from checking largest seqno in table property.
+ ASSERT_TRUE(s.ToString().find("non zero largest sequence number") != + std::string::npos); db_->ReleaseSnapshot(snapshot); } @@ -3897,14 +3903,6 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) { ASSERT_TRUE(s.ToString().find(err) != std::string::npos); ASSERT_NOK(s); - ingest_opts.move_files = true; - s = db_->IngestExternalFile(to_ingest_files, ingest_opts); - ingest_opts.move_files = false; - ASSERT_TRUE( - s.ToString().find("Options move_files and allow_db_generated_files are " - "not compatible") != std::string::npos); - ASSERT_NOK(s); - ingest_opts.snapshot_consistency = false; ASSERT_OK(db_->IngestExternalFile(to_ingest_files, ingest_opts)); db_->ReleaseSnapshot(snapshot); @@ -3924,14 +3922,16 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) { class IngestDBGeneratedFileTest2 : public ExternalSSTFileTestBase, - public ::testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> { + public ::testing::WithParamInterface< + std::tuple<bool, bool, bool, bool, bool>> { public: IngestDBGeneratedFileTest2() = default; }; INSTANTIATE_TEST_CASE_P(VaryingOptions, IngestDBGeneratedFileTest2, testing::Combine(testing::Bool(), testing::Bool(), - testing::Bool(), testing::Bool())); + testing::Bool(), testing::Bool(), + testing::Bool())); TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) { // Use a separate column family to sort some data, generate multiple SST // to be ingested does not overlap with existing data. IngestExternalFileOptions ingest_opts; ingest_opts.allow_db_generated_files = true; - ingest_opts.move_files = false; ingest_opts.snapshot_consistency = std::get<0>(GetParam()); ingest_opts.allow_global_seqno = std::get<1>(GetParam()); ingest_opts.allow_blocking_flush = std::get<2>(GetParam()); ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam()); + ingest_opts.move_files = std::get<4>(GetParam()); do { SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); diff --git a/db/flush_job.cc b/db/flush_job.cc index e874d81ffc..44fe86c786 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1156,6 +1156,11 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() { // Find the newest user-defined timestamps from all the flushed memtables. for (MemTable* m : mems_) { Slice table_newest_udt = m->GetNewestUDT(); + // Empty memtables can be legitimately created and flushed, for example + // by error recovery flush attempts. + if (table_newest_udt.empty()) { + continue; + } if (cutoff_udt_.empty() || ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) { if (!cutoff_udt_.empty()) { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 3ffb77d537..d407e4815f 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test { } void NewDB() { - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test { } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file.
- s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + Temperature::kUnknown, nullptr); ASSERT_OK(s); } diff --git a/db/listener_test.cc b/db/listener_test.cc index d298a86e7e..9f0f036b78 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -354,13 +354,13 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { } TEST_F(EventListenerTest, MultiCF) { - Options options; - options.env = CurrentOptions().env; - options.write_buffer_size = k110KB; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS for (auto atomic_flush : {false, true}) { + Options options; + options.env = CurrentOptions().env; + options.write_buffer_size = k110KB; +#ifdef ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = true; +#endif // ROCKSDB_USING_THREAD_STATUS options.atomic_flush = atomic_flush; options.create_if_missing = true; DestroyAndReopen(options); diff --git a/db/memtable.cc b/db/memtable.cc index 2b6c39d6e7..ef1184ded4 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( statistics(ioptions.stats), merge_operator(ioptions.merge_operator.get()), info_log(ioptions.logger), - allow_data_in_errors(ioptions.allow_data_in_errors), protection_bytes_per_key( - mutable_cf_options.memtable_protection_bytes_per_key) {} + mutable_cf_options.memtable_protection_bytes_per_key), + allow_data_in_errors(ioptions.allow_data_in_errors), + paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableOptions& ioptions, @@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator { : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), - valid_(false), seqno_to_time_mapping_(seqno_to_time_mapping), - arena_mode_(arena != nullptr), - value_pinned_( - !mem.GetImmutableMemTableOptions()->inplace_update_support), - protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), status_(Status::OK()), logger_(mem.moptions_.info_log), - ts_sz_(mem.ts_sz_) { + ts_sz_(mem.ts_sz_), + protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), + valid_(false), + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support), + arena_mode_(arena != nullptr), + paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), + allow_data_in_error(mem.moptions_.allow_data_in_errors) { if (use_range_del_table) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && @@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator { } else { delete iter_; } + status_.PermitUncheckedError(); } #ifndef NDEBUG @@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator { PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; #endif - bool Valid() const override { return valid_ && status_.ok(); } + bool Valid() const override { + // If inner iter_ is not valid, then this iter should also not be valid. 
+ assert(iter_->Valid() || !(valid_ && status_.ok())); + return valid_ && status_.ok(); + } + void Seek(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); + status_ = Status::OK(); if (bloom_) { // iterator should only use prefix bloom filter Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); @@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator { } } } - iter_->Seek(k, nullptr); + if (paranoid_memory_checks_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + } else { + iter_->Seek(k, nullptr); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } void SeekForPrev(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); + status_ = Status::OK(); if (bloom_) { Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { @@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator { } } } - iter_->Seek(k, nullptr); + if (paranoid_memory_checks_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + } else { + iter_->Seek(k, nullptr); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); if (!Valid() && status().ok()) { @@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator { } } void SeekToFirst() override { + status_ = Status::OK(); iter_->SeekToFirst(); valid_ = iter_->Valid(); VerifyEntryChecksum(); } void SeekToLast() override { + status_ = Status::OK(); iter_->SeekToLast(); valid_ = iter_->Valid(); VerifyEntryChecksum(); @@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator { void Next() override { PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); - iter_->Next(); - TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + if (paranoid_memory_checks_) { + status_ = iter_->NextAndValidate(allow_data_in_error); + } else { + iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } @@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator { void Prev() override { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); - iter_->Prev(); + if (paranoid_memory_checks_) { + status_ = iter_->PrevAndValidate(allow_data_in_error); + } else { + iter_->Prev(); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } @@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator { const SliceTransform* const prefix_extractor_; const MemTable::KeyComparator comparator_; MemTableRep::Iterator* iter_; - bool valid_; // The seqno to time mapping is owned by the SuperVersion. 
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_; - bool arena_mode_; - bool value_pinned_; - uint32_t protection_bytes_per_key_; Status status_; Logger* logger_; size_t ts_sz_; + uint32_t protection_bytes_per_key_; + bool valid_; + bool value_pinned_; + bool arena_mode_; + const bool paranoid_memory_checks_; + const bool allow_data_in_error; void VerifyEntryChecksum() { if (protection_bytes_per_key_ > 0 && Valid()) { @@ -933,6 +964,8 @@ static bool SaveValue(void* arg, const char* entry) { Saver* s = static_cast<Saver*>(arg); assert(s != nullptr); assert(!s->value || !s->columns); + assert(!*(s->found_final_value)); + assert(s->status->ok() || s->status->IsMergeInProgress()); MergeContext* merge_context = s->merge_context; SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq; @@ -966,6 +999,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MemTable::VerifyEntryChecksum( entry, s->protection_bytes_per_key, s->allow_data_in_errors); if (!s->status->ok()) { + *(s->found_final_value) = true; ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState()); // Memtable entry corrupted return false; @@ -1231,6 +1265,7 @@ static bool SaveValue(void* arg, const char* entry) { ". "); msg.append("seq: " + std::to_string(seq) + "."); } + *(s->found_final_value) = true; *(s->status) = Status::Corruption(msg.c_str()); return false; } @@ -1310,8 +1345,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, // No change to value, since we have not yet found a Put/Delete // Propagate corruption error - if (!found_final_value && merge_in_progress && !s->IsCorruption()) { - *s = Status::MergeInProgress(); + if (!found_final_value && merge_in_progress) { + if (s->ok()) { + *s = Status::MergeInProgress(); + } else { + assert(s->IsMergeInProgress()); + } } PERF_COUNTER_ADD(get_from_memtable_count, 1); return found_final_value; @@ -1347,7 +1386,19 @@ void MemTable::GetFromTable(const LookupKey& key, saver.do_merge = do_merge; saver.allow_data_in_errors = moptions_.allow_data_in_errors; saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; - table_->Get(key, &saver, SaveValue); + + if (!moptions_.paranoid_memory_checks) { + table_->Get(key, &saver, SaveValue); + } else { + Status check_s = table_->GetAndValidate(key, &saver, SaveValue, + moptions_.allow_data_in_errors); + if (check_s.IsCorruption()) { + *(saver.status) = check_s; + // Should stop searching the LSM. + *(saver.found_final_value) = true; + } + } + assert(s->ok() || s->IsMergeInProgress() || *found_final_value); *seq = saver.seq; } @@ -1421,10 +1472,19 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { - *(iter->s) = Status::MergeInProgress(); + if (iter->s->ok()) { + *(iter->s) = Status::MergeInProgress(); + } else { + assert(iter->s->IsMergeInProgress()); + } } - if (found_final_value) { + if (found_final_value || + (!iter->s->ok() && !iter->s->IsMergeInProgress())) { + // `found_final_value` should be set if an error/corruption occurs. + // The check on iter->s is just there in case GetFromTable() did not + // set `found_final_value` properly.
+ assert(found_final_value); if (iter->value) { iter->value->PinSelf(); range->AddValueSize(iter->value->size()); diff --git a/db/memtable.h b/db/memtable.h index 9b42d130f0..ca0652bc04 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -60,8 +60,9 @@ struct ImmutableMemTableOptions { Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; - bool allow_data_in_errors; uint32_t protection_bytes_per_key; + bool allow_data_in_errors; + bool paranoid_memory_checks; }; // Batched counters to updated when inserting keys in one write batch. @@ -249,12 +250,14 @@ class MemTable { // If do_merge = true the default behavior which is Get value for key is // executed. Expected behavior is described right below. // If memtable contains a value for key, store it in *value and return true. - // If memtable contains a deletion for key, store a NotFound() error - // in *status and return true. + // If memtable contains a deletion for key, store NotFound() in *status and + // return true. // If memtable contains Merge operation as the most recent entry for a key, // and the merge process does not stop (not reaching a value or delete), // prepend the current merge operand to *operands. // store MergeInProgress in s, and return false. + // If an unexpected error or corruption occurs, store Corruption() or other + // error in *status and return true. // Else, return false. // If any operation was found, its most recent sequence number // will be stored in *seq on success (regardless of whether true/false is @@ -264,6 +267,11 @@ class MemTable { // If do_merge = false then any Merge Operands encountered for key are simply // stored in merge_context.operands_list and never actually merged to get a // final value. The raw Merge Operands are eventually returned to the user. + // @param value If not null and memtable contains a value for key, `value` + // will be set to the result value. + // @param column If not null and memtable contains a value/WideColumn for key, + // `column` will be set to the result value/WideColumn. + // Note: only one of `value` and `column` can be non-nullptr. // @param immutable_memtable Whether this memtable is immutable. Used // internally by NewRangeTombstoneIterator(). See comment above // NewRangeTombstoneIterator() for more detail. 
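The memtable changes above thread the new `paranoid_memory_checks` option from `mutable_cf_options` through `ImmutableMemTableOptions` into the iterator (`SeekAndValidate`, `NextAndValidate`, `PrevAndValidate`) and point-lookup (`GetAndValidate`) paths. A minimal usage sketch, assuming only the option name introduced in this diff (everything else is long-standing public API):

```cpp
#include <memory>
#include <string>

#include <rocksdb/db.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // New mutable CF option from this diff: validate memtable key ordering
  // during lookups and scans, trading some CPU for corruption detection.
  options.paranoid_memory_checks = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/paranoid_demo", &db);
  if (!s.ok()) {
    return 1;
  }

  // Point lookups: in-memory corruption surfaces as Status::Corruption
  // instead of a wrong result or a crash.
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);

  // Scans: as with any RocksDB iterator, check status() once Valid() turns
  // false; paranoid-check corruption is reported through the same channel.
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  const bool scan_ok = it->status().ok();

  // Being mutable, the option can also be toggled at runtime:
  s = db->SetOptions({{"paranoid_memory_checks", "false"}});

  it.reset();
  delete db;
  return (scan_ok && s.ok()) ? 0 : 1;
}
```

Note the design kept visible in the hunks above: when the option is off, `Seek`/`Next`/`Prev` execute exactly the pre-existing fast path, so the validation cost is only paid when explicitly enabled.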
diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 1f259e688a..3675a280b9 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -181,7 +181,8 @@ bool MemTableListVersion::GetFromList( } if (done) { - assert(*seq != kMaxSequenceNumber || s->IsNotFound()); + assert(*seq != kMaxSequenceNumber || + (!s->ok() && !s->IsMergeInProgress())); return true; } if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index bb4e44761e..22a96d67ff 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -287,6 +287,7 @@ TEST_F(MemTableListTest, GetTest) { // Fetch the newly written keys merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, /*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -295,6 +296,7 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_EQ(value, "value1"); merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr, /*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -303,6 +305,7 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, /*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -311,6 +314,7 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_EQ(value, "value2.2"); merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr, /*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -350,6 +354,7 @@ TEST_F(MemTableListTest, GetTest) { // Fetch keys via MemTableList merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -357,6 +362,7 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key1", saved_seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, @@ -365,6 +371,7 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_EQ("value1", value); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -373,12 +380,14 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_EQ(value, "value2.3"); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -438,6 +447,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Fetch the newly written keys merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, /*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -446,6 +456,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, 
/*timestamp*/ nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions(), @@ -462,6 +473,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Fetch keys via MemTableList merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -469,6 +481,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -508,6 +521,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify keys are present in history merge_context.Clear(); + s = Status::OK(); found = list.current()->GetFromHistory( LookupKey("key1", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, @@ -515,6 +529,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = list.current()->GetFromHistory( LookupKey("key2", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, @@ -568,6 +583,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify keys are no longer in MemTableList merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -575,6 +591,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_FALSE(found); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -582,6 +599,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_FALSE(found); merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, @@ -590,6 +608,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify that the second memtable's keys are in the history merge_context.Clear(); + s = Status::OK(); found = list.current()->GetFromHistory( LookupKey("key1", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, @@ -597,6 +616,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); + s = Status::OK(); found = list.current()->GetFromHistory( LookupKey("key3", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, @@ -606,6 +626,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify that key2 from the first memtable is no longer in the history merge_context.Clear(); + s = Status::OK(); found = list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, &s, &merge_context, diff --git a/db/version_builder.cc b/db/version_builder.cc index 9a72307d37..ed8ab82142 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -29,6 +29,7 @@ #include "db/internal_stats.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "db/version_edit_handler.h" #include "db/version_set.h" #include "port/port.h" #include "table/table_reader.h" @@ -37,6 +38,25 @@ namespace ROCKSDB_NAMESPACE { class VersionBuilder::Rep { + class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* 
rhs) const { + assert(lhs); + assert(rhs); + + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } + + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + } + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); + } + }; + class NewestFirstByEpochNumber { private: inline static const NewestFirstBySeqNo seqno_cmp; @@ -249,9 +269,10 @@ class VersionBuilder::Rep { std::unordered_map<uint64_t, int> table_file_levels_; // Current compact cursors that should be changed after the last compaction std::unordered_map<int, InternalKey> updated_compact_cursors_; - NewestFirstByEpochNumber level_zero_cmp_by_epochno_; - NewestFirstBySeqNo level_zero_cmp_by_seqno_; - BySmallestKey level_nonzero_cmp_; + const std::shared_ptr<NewestFirstByEpochNumber> + level_zero_cmp_by_epochno_; + const std::shared_ptr<NewestFirstBySeqNo> level_zero_cmp_by_seqno_; + const std::shared_ptr<BySmallestKey> level_nonzero_cmp_; // Mutable metadata objects for all blob files affected by the series of // version edits. @@ -259,11 +280,56 @@ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; + ColumnFamilyData* cfd_; + VersionEditHandler* version_edit_handler_; + bool track_found_and_missing_files_; + // If false, only a complete Version with all the files constituting it found + // is considered valid. If true, in addition to a complete Version, if the + // Version is never edited in an atomic group, an incomplete Version with + // only a suffix of L0 files missing is also considered valid. + bool allow_incomplete_valid_version_; + + // These are only tracked if `track_found_and_missing_files_` is enabled. + + // The SST files that are found (blob files not included yet). + std::unordered_set<uint64_t> found_files_; + // Missing SST files for L0 + std::unordered_set<uint64_t> l0_missing_files_; + // Missing SST files for non L0 levels + std::unordered_set<uint64_t> non_l0_missing_files_; + // Intermediate SST files (blob files not included yet) + std::vector<std::string> intermediate_files_; + // The highest file number for all the missing blob files, useful to check + // if a complete Version is available. + uint64_t missing_blob_files_high_ = kInvalidBlobFileNumber; + // Missing blob files, useful to check if only the missing L0 files' + // associated blob files are missing. + std::unordered_set<uint64_t> missing_blob_files_; + // True if all the files constituting the Version can be found. Or, if + // `allow_incomplete_valid_version_` is true and the version history is + // never edited in an atomic group, this will be true if only a + // suffix of L0 SST files and their associated blob files are missing. + bool valid_version_available_; + // True if the version is ever edited in an atomic group. + bool edited_in_atomic_group_; + + // Flag to indicate whether the Version has been updated since the last + // validity check. If no `Apply` call is made between a `Rep`'s construction + // and a `ValidVersionAvailable` check, or between two `ValidVersionAvailable` + // calls, this flag will be false, indicating the cached validity value can + // be used directly without a recheck. + bool version_updated_since_last_check_; + + // End of fields that are only tracked when `track_found_and_missing_files_` + // is enabled.
+ public: Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, VersionSet* version_set, - std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) + std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr, + ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler, + bool track_found_and_missing_files, bool allow_incomplete_valid_version) : file_options_(file_options), ioptions_(ioptions), table_cache_(table_cache), @@ -271,11 +337,76 @@ version_set_(version_set), num_levels_(base_vstorage->num_levels()), has_invalid_levels_(false), - level_nonzero_cmp_(base_vstorage_->InternalComparator()), - file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { + level_zero_cmp_by_epochno_( + std::make_shared<NewestFirstByEpochNumber>()), + level_zero_cmp_by_seqno_(std::make_shared<NewestFirstBySeqNo>()), + level_nonzero_cmp_(std::make_shared<BySmallestKey>( + base_vstorage_->InternalComparator())), + file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr), + cfd_(cfd), + version_edit_handler_(version_edit_handler), + track_found_and_missing_files_(track_found_and_missing_files), + allow_incomplete_valid_version_(allow_incomplete_valid_version) { assert(ioptions_); levels_ = new LevelState[num_levels_]; + if (track_found_and_missing_files_) { + assert(cfd_); + assert(version_edit_handler_); + // `track_found_and_missing_files_` mode used by VersionEditHandlerPIT + // assumes the initial base version is valid. For best efforts recovery, + // the base will be empty. For manifest tailing usage like a secondary + // instance, incomplete versions are not allowed, so the base version in + // subsequent catch up attempts should be valid too. + valid_version_available_ = true; + edited_in_atomic_group_ = false; + version_updated_since_last_check_ = false; + } + } + + Rep(const Rep& other) + : file_options_(other.file_options_), + ioptions_(other.ioptions_), + table_cache_(other.table_cache_), + base_vstorage_(other.base_vstorage_), + version_set_(other.version_set_), + num_levels_(other.num_levels_), + invalid_level_sizes_(other.invalid_level_sizes_), + has_invalid_levels_(other.has_invalid_levels_), + table_file_levels_(other.table_file_levels_), + updated_compact_cursors_(other.updated_compact_cursors_), + level_zero_cmp_by_epochno_(other.level_zero_cmp_by_epochno_), + level_zero_cmp_by_seqno_(other.level_zero_cmp_by_seqno_), + level_nonzero_cmp_(other.level_nonzero_cmp_), + mutable_blob_file_metas_(other.mutable_blob_file_metas_), + file_metadata_cache_res_mgr_(other.file_metadata_cache_res_mgr_), + cfd_(other.cfd_), + version_edit_handler_(other.version_edit_handler_), + track_found_and_missing_files_(other.track_found_and_missing_files_), + allow_incomplete_valid_version_(other.allow_incomplete_valid_version_), + found_files_(other.found_files_), + l0_missing_files_(other.l0_missing_files_), + non_l0_missing_files_(other.non_l0_missing_files_), + intermediate_files_(other.intermediate_files_), + missing_blob_files_high_(other.missing_blob_files_high_), + missing_blob_files_(other.missing_blob_files_), + valid_version_available_(other.valid_version_available_), + edited_in_atomic_group_(other.edited_in_atomic_group_), + version_updated_since_last_check_( + other.version_updated_since_last_check_) { + assert(ioptions_); + levels_ = new LevelState[num_levels_]; + for (int level = 0; level < num_levels_; level++) { + levels_[level] = other.levels_[level]; + const auto& added = levels_[level].added_files; + for (auto& pair : added) { + RefFile(pair.second); + } + } + if
(track_found_and_missing_files_) { + assert(cfd_); + assert(version_edit_handler_); + } } ~Rep() { @@ -289,6 +420,12 @@ class VersionBuilder::Rep { delete[] levels_; } + void RefFile(FileMetaData* f) { + assert(f); + assert(f->refs > 0); + f->refs++; + } + void UnrefFile(FileMetaData* f) { f->refs--; if (f->refs <= 0) { @@ -397,7 +534,7 @@ class VersionBuilder::Rep { if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { - if (!level_zero_cmp_by_seqno_(lhs, rhs)) { + if (!level_zero_cmp_by_seqno_->operator()(lhs, rhs)) { std::ostringstream oss; oss << "L0 files are not sorted properly: files #" << lhs->fd.GetNumber() << " with seqnos (largest, smallest) " @@ -429,7 +566,7 @@ class VersionBuilder::Rep { } } - if (!level_zero_cmp_by_epochno_(lhs, rhs)) { + if (!level_zero_cmp_by_epochno_->operator()(lhs, rhs)) { std::ostringstream oss; oss << "L0 files are not sorted properly: files #" << lhs->fd.GetNumber() << " with epoch number " @@ -458,7 +595,7 @@ class VersionBuilder::Rep { assert(lhs); assert(rhs); - if (!level_nonzero_cmp_(lhs, rhs)) { + if (!level_nonzero_cmp_->operator()(lhs, rhs)) { std::ostringstream oss; oss << 'L' << level << " files are not sorted properly: files #" << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); @@ -634,7 +771,22 @@ class VersionBuilder::Rep { mutable_blob_file_metas_.emplace( blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); - return Status::OK(); + Status s; + if (track_found_and_missing_files_) { + assert(version_edit_handler_); + s = version_edit_handler_->VerifyBlobFile(cfd_, blob_file_number, + blob_file_addition); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_blob_files_high_ = + std::max(missing_blob_files_high_, blob_file_number); + missing_blob_files_.insert(blob_file_number); + s = Status::OK(); + } else if (!s.ok()) { + return s; + } + } + + return s; } Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { @@ -752,6 +904,29 @@ class VersionBuilder::Rep { table_file_levels_[file_number] = VersionStorageInfo::FileLocation::Invalid().GetLevel(); + if (track_found_and_missing_files_) { + assert(version_edit_handler_); + if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) { + l0_missing_files_.erase(file_number); + } else if (non_l0_missing_files_.find(file_number) != + non_l0_missing_files_.end()) { + non_l0_missing_files_.erase(file_number); + } else { + auto fiter = found_files_.find(file_number); + // Only mark new files added during this catchup attempt for deletion. + // These files were never installed in VersionStorageInfo. + // Already referenced files that are deleted by a VersionEdit will + // be added to the VersionStorageInfo's obsolete files when the old + // version is dereferenced. 
+ if (fiter != found_files_.end()) { + assert(!ioptions_->cf_paths.empty()); + intermediate_files_.emplace_back( + MakeTableFileName(ioptions_->cf_paths[0].path, file_number)); + found_files_.erase(fiter); + } + } + } + return Status::OK(); } @@ -824,7 +999,31 @@ class VersionBuilder::Rep { table_file_levels_[file_number] = level; - return Status::OK(); + Status s; + if (track_found_and_missing_files_) { + assert(version_edit_handler_); + assert(!ioptions_->cf_paths.empty()); + const std::string fpath = + MakeTableFileName(ioptions_->cf_paths[0].path, file_number); + s = version_edit_handler_->VerifyFile(cfd_, fpath, level, meta); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + if (0 == level) { + l0_missing_files_.insert(file_number); + } else { + non_l0_missing_files_.insert(file_number); + } + if (s.IsCorruption()) { + found_files_.insert(file_number); + } + s = Status::OK(); + } else if (!s.ok()) { + return s; + } else { + found_files_.insert(file_number); + } + } + + return s; } Status ApplyCompactCursors(int level, @@ -845,6 +1044,7 @@ class VersionBuilder::Rep { // Apply all of the edits in *edit to the current state. Status Apply(const VersionEdit* edit) { + bool version_updated = false; { const Status s = CheckConsistency(base_vstorage_); if (!s.ok()) { return s; } @@ -862,6 +1062,7 @@ class VersionBuilder::Rep { if (!s.ok()) { return s; } + version_updated = true; } // Increase the amount of garbage for blob files affected by GC @@ -870,6 +1071,7 @@ class VersionBuilder::Rep { if (!s.ok()) { return s; } + version_updated = true; } // Delete table files @@ -881,6 +1083,7 @@ class VersionBuilder::Rep { if (!s.ok()) { return s; } + version_updated = true; } // Add new table files @@ -892,6 +1095,7 @@ class VersionBuilder::Rep { if (!s.ok()) { return s; } + version_updated = true; } // Populate compact cursors for round-robin compaction, leave @@ -904,6 +1108,13 @@ class VersionBuilder::Rep { return s; } } + + if (track_found_and_missing_files_ && version_updated) { + version_updated_since_last_check_ = true; + if (!edited_in_atomic_group_ && edit->IsInAtomicGroup()) { + edited_in_atomic_group_ = true; + } + } return Status::OK(); } @@ -1046,14 +1257,35 @@ class VersionBuilder::Rep { mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes()); } + bool OnlyLinkedToMissingL0Files( + const std::unordered_set<uint64_t>& linked_ssts) const { + return std::all_of( + linked_ssts.begin(), linked_ssts.end(), [&](const uint64_t& element) { + return l0_missing_files_.find(element) != l0_missing_files_.end(); + }); + } + // Add the blob file specified by meta to *vstorage if it is determined to // contain valid data (blobs). template <typename Meta> - static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) { + void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta, + uint64_t blob_file_number) const { assert(vstorage); assert(meta); - if (meta->GetLinkedSsts().empty() && + const auto& linked_ssts = meta->GetLinkedSsts(); + if (track_found_and_missing_files_) { + if (missing_blob_files_.find(blob_file_number) != + missing_blob_files_.end()) { + return; + } + // Leave the empty case for the below blob garbage collection logic. + if (!linked_ssts.empty() && OnlyLinkedToMissingL0Files(linked_ssts)) { + return; + } + } + + if (linked_ssts.empty() && meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) { return; } @@ -1065,6 +1297,7 @@ class VersionBuilder::Rep { // applied, and save the result into *vstorage.
void SaveBlobFilesTo(VersionStorageInfo* vstorage) const { assert(vstorage); + assert(!track_found_and_missing_files_ || valid_version_available_); assert(base_vstorage_); vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() + @@ -1080,22 +1313,24 @@ } auto process_base = - [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) { + [this, vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) { assert(base_meta); - AddBlobFileIfNeeded(vstorage, base_meta); + AddBlobFileIfNeeded(vstorage, base_meta, + base_meta->GetBlobFileNumber()); return true; }; auto process_mutable = - [vstorage](const MutableBlobFileMetaData& mutable_meta) { - AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + [this, vstorage](const MutableBlobFileMetaData& mutable_meta) { + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta), + mutable_meta.GetBlobFileNumber()); return true; }; - auto process_both = [vstorage]( + auto process_both = [this, vstorage]( const std::shared_ptr<BlobFileMetaData>& base_meta, const MutableBlobFileMetaData& mutable_meta) { assert(base_meta); @@ -1108,12 +1343,14 @@ mutable_meta.GetGarbageBlobBytes()); assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts()); - AddBlobFileIfNeeded(vstorage, base_meta); + AddBlobFileIfNeeded(vstorage, base_meta, + base_meta->GetBlobFileNumber()); return true; } - AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta), + mutable_meta.GetBlobFileNumber()); return true; }; @@ -1125,6 +1362,10 @@ void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) const { const uint64_t file_number = f->fd.GetNumber(); + if (track_found_and_missing_files_ && level == 0 && + l0_missing_files_.find(file_number) != l0_missing_files_.end()) { + return; + } const auto& level_state = levels_[level]; @@ -1148,6 +1389,29 @@ } } + bool ContainsCompleteVersion() const { + assert(track_found_and_missing_files_); + return l0_missing_files_.empty() && non_l0_missing_files_.empty() && + (missing_blob_files_high_ == kInvalidBlobFileNumber || + missing_blob_files_high_ < GetMinOldestBlobFileNumber()); + } + + bool HasMissingFiles() const { + assert(track_found_and_missing_files_); + return !l0_missing_files_.empty() || !non_l0_missing_files_.empty() || + missing_blob_files_high_ != kInvalidBlobFileNumber; + } + + std::vector<std::string>& GetAndClearIntermediateFiles() { + assert(track_found_and_missing_files_); + return intermediate_files_; + } + + void ClearFoundFiles() { + assert(track_found_and_missing_files_); + found_files_.clear(); + } + template <typename Cmp> void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const { // Merge the set of added files with the set of pre-existing files. @@ -1156,6 +1420,16 @@ const auto& unordered_added_files = levels_[level].added_files; vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); + MergeUnorderdAddedFilesWithBase( + base_files, unordered_added_files, cmp, + [&](FileMetaData* file) { MaybeAddFile(vstorage, level, file); }); + } + + template <typename Cmp, typename AddFileFunc> + void MergeUnorderdAddedFilesWithBase( + const std::vector<FileMetaData*>& base_files, + const std::unordered_map<uint64_t, FileMetaData*>& unordered_added_files, + Cmp cmp, AddFileFunc add_file_func) const { // Sort added files for the level.
std::vector<FileMetaData*> added_files; added_files.reserve(unordered_added_files.size()); @@ -1171,9 +1445,9 @@ while (added_iter != added_end || base_iter != base_end) { if (base_iter == base_end || (added_iter != added_end && cmp(*added_iter, *base_iter))) { - MaybeAddFile(vstorage, level, *added_iter++); + add_file_func(*added_iter++); } else { - MaybeAddFile(vstorage, level, *base_iter++); + add_file_func(*base_iter++); } } } @@ -1215,13 +1489,13 @@ } if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { - SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_); + SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_seqno_); } else { - SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_); + SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_epochno_); } for (int level = 1; level < num_levels_; ++level) { - SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); + SaveSSTFilesTo(vstorage, level, *level_nonzero_cmp_); } } @@ -1232,8 +1506,111 @@ } } + bool ValidVersionAvailable() { + assert(track_found_and_missing_files_); + if (version_updated_since_last_check_) { + valid_version_available_ = ContainsCompleteVersion(); + if (!valid_version_available_ && !edited_in_atomic_group_ && + allow_incomplete_valid_version_) { + valid_version_available_ = OnlyMissingL0Suffix(); + } + version_updated_since_last_check_ = false; + } + return valid_version_available_; + } + + bool OnlyMissingL0Suffix() const { + if (!non_l0_missing_files_.empty()) { + return false; + } + assert(!(l0_missing_files_.empty() && missing_blob_files_.empty())); + + if (!l0_missing_files_.empty() && !MissingL0FilesAreL0Suffix()) { + return false; + } + if (!missing_blob_files_.empty() && + !RemainingSstFilesNotMissingBlobFiles()) { + return false; + } + return true; + } + + // Check that the missing L0 files are a suffix of the expected sorted L0 files.
+ bool MissingL0FilesAreL0Suffix() const { + assert(non_l0_missing_files_.empty()); + assert(!l0_missing_files_.empty()); + std::vector<FileMetaData*> expected_sorted_l0_files; + const auto& base_files = base_vstorage_->LevelFiles(0); + const auto& unordered_added_files = levels_[0].added_files; + expected_sorted_l0_files.reserve(base_files.size() + + unordered_added_files.size()); + EpochNumberRequirement epoch_number_requirement = + base_vstorage_->GetEpochNumberRequirement(); + + if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { + MergeUnorderdAddedFilesWithBase( + base_files, unordered_added_files, *level_zero_cmp_by_seqno_, + [&](FileMetaData* file) { + expected_sorted_l0_files.push_back(file); + }); + } else { + MergeUnorderdAddedFilesWithBase( + base_files, unordered_added_files, *level_zero_cmp_by_epochno_, + [&](FileMetaData* file) { + expected_sorted_l0_files.push_back(file); + }); + } + assert(expected_sorted_l0_files.size() >= l0_missing_files_.size()); + std::unordered_set<uint64_t> unaddressed_missing_files = l0_missing_files_; + for (auto iter = expected_sorted_l0_files.begin(); + iter != expected_sorted_l0_files.end(); iter++) { + uint64_t file_number = (*iter)->fd.GetNumber(); + if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) { + assert(unaddressed_missing_files.find(file_number) != + unaddressed_missing_files.end()); + unaddressed_missing_files.erase(file_number); + } else if (!unaddressed_missing_files.empty()) { + return false; + } else { + break; + } + } + return true; + } + + // Check that each missing blob file either is older than the minimum oldest + // blob file required by this Version or is only linked to + // the missing L0 files. + bool RemainingSstFilesNotMissingBlobFiles() const { + assert(non_l0_missing_files_.empty()); + assert(!missing_blob_files_.empty()); + bool no_l0_files_missing = l0_missing_files_.empty(); + uint64_t min_oldest_blob_file_num = GetMinOldestBlobFileNumber(); + for (const auto& missing_blob_file : missing_blob_files_) { + if (missing_blob_file < min_oldest_blob_file_num) { + continue; + } + auto iter = mutable_blob_file_metas_.find(missing_blob_file); + assert(iter != mutable_blob_file_metas_.end()); + const std::unordered_set<uint64_t>& linked_ssts = + iter->second.GetLinkedSsts(); + // TODO(yuzhangyu): In theory, if no L0 SST files are missing, and only + // blob files exclusively linked to a L0 suffix are missing, we can + // recover to a valid point in time too. We don't recover that type of + // incomplete Version yet. + if (!linked_ssts.empty() && no_l0_files_missing) { + return false; + } + if (!OnlyLinkedToMissingL0Files(linked_ssts)) { + return false; + } + } + return true; + }
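`MissingL0FilesAreL0Suffix` above is the core of the incomplete-version acceptance rule. Stripped of the builder machinery, it reduces to the following standalone sketch (an illustration, not RocksDB code), which relies on the expected L0 files arriving in newest-first order, as guaranteed by the comparators earlier in this diff:

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

// Given L0 file numbers sorted newest-first, the missing set is acceptable
// only when it forms a leading run of that ordering, i.e. only the newest
// files (a suffix of the user's write history) are gone.
bool MissingIsNewestPrefix(const std::vector<uint64_t>& l0_newest_first,
                           const std::unordered_set<uint64_t>& missing) {
  size_t run = 0;
  while (run < l0_newest_first.size() &&
         missing.count(l0_newest_first[run]) > 0) {
    ++run;
  }
  // Every missing file must fall inside the leading run; a hole in the
  // middle cannot be recovered to a consistent point in time.
  return run == missing.size();
}
```

For expected L0 files {9, 8, 7} (newest first), missing {9} or {9, 8} passes, while missing {8} alone fails: the surviving set {9, 7} would not correspond to any state the user ever observed.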
// Save the current state in *vstorage. Status SaveTo(VersionStorageInfo* vstorage) const { + assert(!track_found_and_missing_files_ || valid_version_available_); Status s; #ifndef NDEBUG @@ -1266,6 +1643,7 @@ size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, uint8_t block_protection_bytes_per_key) { assert(table_cache_ != nullptr); + assert(!track_found_and_missing_files_ || valid_version_available_); size_t table_cache_capacity = table_cache_->get_cache().get()->GetCapacity(); @@ -1305,6 +1683,11 @@ for (int level = 0; level < num_levels_; level++) { for (auto& file_meta_pair : levels_[level].added_files) { auto* file_meta = file_meta_pair.second; + uint64_t file_number = file_meta->fd.GetNumber(); + if (track_found_and_missing_files_ && level == 0 && + l0_missing_files_.find(file_number) != l0_missing_files_.end()) { + continue; + } // If the file has been opened before, just skip it. if (!file_meta->table_reader_handle) { files_meta.emplace_back(file_meta, level); @@ -1369,9 +1752,13 @@ VersionBuilder::VersionBuilder( const FileOptions& file_options, const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, VersionSet* version_set, - std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) + std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr, + ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler, + bool track_found_and_missing_files, bool allow_incomplete_valid_version) : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, - version_set, file_metadata_cache_res_mgr)) {} + version_set, file_metadata_cache_res_mgr, cfd, + version_edit_handler, track_found_and_missing_files, + allow_incomplete_valid_version)) {} VersionBuilder::~VersionBuilder() = default; @@ -1399,27 +1786,71 @@ Status VersionBuilder::LoadTableHandlers( read_options, block_protection_bytes_per_key); } -uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { - return rep_->GetMinOldestBlobFileNumber(); +void VersionBuilder::CreateOrReplaceSavePoint() { + assert(rep_); + savepoint_ = std::move(rep_); + rep_ = std::make_unique<Rep>(*savepoint_); } +bool VersionBuilder::ValidVersionAvailable() { + return rep_->ValidVersionAvailable(); +} + +bool VersionBuilder::HasMissingFiles() const { return rep_->HasMissingFiles(); } + +std::vector<std::string>& VersionBuilder::GetAndClearIntermediateFiles() { + return rep_->GetAndClearIntermediateFiles(); +} + +void VersionBuilder::ClearFoundFiles() { return rep_->ClearFoundFiles(); } + +Status VersionBuilder::SaveSavePointTo(VersionStorageInfo* vstorage) const { + if (!savepoint_ || !savepoint_->ValidVersionAvailable()) { + return Status::InvalidArgument(); + } + return savepoint_->SaveTo(vstorage); +} + +Status VersionBuilder::LoadSavePointTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr<const SliceTransform>& prefix_extractor, + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { + if (!savepoint_ || !savepoint_->ValidVersionAvailable()) { + return Status::InvalidArgument(); + } + return savepoint_->LoadTableHandlers( + internal_stats, max_threads, prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, + read_options, block_protection_bytes_per_key); +} + +void VersionBuilder::ClearSavePoint() { savepoint_.reset(nullptr); } + BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( - ColumnFamilyData* cfd) +
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler, + bool track_found_and_missing_files, bool allow_incomplete_valid_version) : version_builder_(new VersionBuilder( cfd->current()->version_set()->file_options(), cfd->ioptions(), cfd->table_cache(), cfd->current()->storage_info(), cfd->current()->version_set(), - cfd->GetFileMetadataCacheReservationManager())), + cfd->GetFileMetadataCacheReservationManager(), cfd, + version_edit_handler, track_found_and_missing_files, + allow_incomplete_valid_version)), version_(cfd->current()) { version_->Ref(); } BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( - ColumnFamilyData* cfd, Version* v) + ColumnFamilyData* cfd, Version* v, VersionEditHandler* version_edit_handler, + bool track_found_and_missing_files, bool allow_incomplete_valid_version) : version_builder_(new VersionBuilder( cfd->current()->version_set()->file_options(), cfd->ioptions(), cfd->table_cache(), v->storage_info(), v->version_set(), - cfd->GetFileMetadataCacheReservationManager())), + cfd->GetFileMetadataCacheReservationManager(), cfd, + version_edit_handler, track_found_and_missing_files, + allow_incomplete_valid_version)), version_(v) { assert(version_ != cfd->current()); } diff --git a/db/version_builder.h b/db/version_builder.h index fb2a304a84..7e2a0253cd 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -26,6 +26,7 @@ struct FileMetaData; class InternalStats; class Version; class VersionSet; +class VersionEditHandler; class ColumnFamilyData; class CacheReservationManager; @@ -38,22 +39,80 @@ class VersionBuilder { const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, VersionSet* version_set, std::shared_ptr<CacheReservationManager> - file_metadata_cache_res_mgr = nullptr); + file_metadata_cache_res_mgr = nullptr, + ColumnFamilyData* cfd = nullptr, + VersionEditHandler* version_edit_handler = nullptr, + bool track_found_and_missing_files = false, + bool allow_incomplete_valid_version = false); ~VersionBuilder(); bool CheckConsistencyForNumLevels(); + Status Apply(const VersionEdit* edit); + + // Save the current Version to the provided `vstorage`. Status SaveTo(VersionStorageInfo* vstorage) const; + + // Load all the table handlers for the current Version in the builder. Status LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr<const SliceTransform>& prefix_extractor, size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, uint8_t block_protection_bytes_per_key); - uint64_t GetMinOldestBlobFileNumber() const; + + //============APIs only used by VersionEditHandlerPointInTime ============// + + // Creates a save point for the Version that has been built so far. Subsequent + // VersionEdits applied to the builder will not affect the Version in this + // save point. VersionBuilder currently only supports creating one save point, + // so when `CreateOrReplaceSavePoint` is called again, the previous save point + // is cleared. `ClearSavePoint` can be called explicitly to clear + // the save point too. + void CreateOrReplaceSavePoint(); + + // Returns true if the builder can find all the files needed to build a + // `Version`, or, if `allow_incomplete_valid_version_` is true and the version + // history is never edited in an atomic group, if only a suffix of L0 SST + // files and their associated blob files are missing. + // From the users' perspective, missing a suffix of L0 files means missing the + // user's most recently written data.
So the remaining available files still + // present a valid point in time view, although for an earlier time. + // This validity check result will be cached and reused if the Version is not + // updated between two validity checks. + bool ValidVersionAvailable(); + + bool HasMissingFiles() const; + + // When applying a sequence of VersionEdit, intermediate files are the ones + // that are added and then deleted. The caller should clear this intermediate + // files tracking after calling this API, so that the tracking for subsequent + // VersionEdits can start over with a clean state. + std::vector<std::string>& GetAndClearIntermediateFiles(); + + // Clear all the found files in this Version. + void ClearFoundFiles(); + + // Save the Version in the save point to the provided `vstorage`. + // Non-OK status will be returned if there is not a valid save point. + Status SaveSavePointTo(VersionStorageInfo* vstorage) const; + + // Load all the table handlers for the Version in the save point. + // Non-OK status will be returned if there is not a valid save point. + Status LoadSavePointTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr<const SliceTransform>& prefix_extractor, + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key); + + void ClearSavePoint(); + + //======= End of APIs only used by VersionEditHandlerPointInTime ==========// private: class Rep; + std::unique_ptr<Rep> savepoint_; std::unique_ptr<Rep> rep_; }; @@ -62,8 +121,15 @@ // Both of the constructor and destructor need to be called inside DB Mutex. class BaseReferencedVersionBuilder { public: - explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd); - BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v); + explicit BaseReferencedVersionBuilder( + ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler = nullptr, + bool track_found_and_missing_files = false, + bool allow_incomplete_valid_version = false); + BaseReferencedVersionBuilder( + ColumnFamilyData* cfd, Version* v, + VersionEditHandler* version_edit_handler = nullptr, + bool track_found_and_missing_files = false, + bool allow_incomplete_valid_version = false); ~BaseReferencedVersionBuilder(); VersionBuilder* version_builder() const { return version_builder_.get(); } @@ -71,23 +137,4 @@ std::unique_ptr<VersionBuilder> version_builder_; Version* version_; }; - -class NewestFirstBySeqNo { - public: - bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { - assert(lhs); - assert(rhs); - - if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { - return lhs->fd.largest_seqno > rhs->fd.largest_seqno; - } - - if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { - return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; - } - - // Break ties by file number - return lhs->fd.GetNumber() > rhs->fd.GetNumber(); - } -}; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 3284768046..4784c90960 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -155,6 +155,7 @@ VersionEditHandler::VersionEditHandler( VersionSet* version_set, bool track_found_and_missing_files, bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer, const ReadOptions& read_options, bool skip_load_table_files, + bool allow_incomplete_valid_version, EpochNumberRequirement epoch_number_requirement) :
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index 3284768046..4784c90960 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -155,6 +155,7 @@ VersionEditHandler::VersionEditHandler(
     VersionSet* version_set, bool track_found_and_missing_files,
     bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
     const ReadOptions& read_options, bool skip_load_table_files,
+    bool allow_incomplete_valid_version,
     EpochNumberRequirement epoch_number_requirement)
     : VersionEditHandlerBase(read_options),
       read_only_(read_only),
@@ -165,6 +166,7 @@ VersionEditHandler::VersionEditHandler(
       io_tracer_(io_tracer),
       skip_load_table_files_(skip_load_table_files),
       initialized_(false),
+      allow_incomplete_valid_version_(allow_incomplete_valid_version),
       epoch_number_requirement_(epoch_number_requirement) {
   assert(version_set_ != nullptr);
 }
@@ -218,15 +220,15 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
 
 Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
                                              ColumnFamilyData** cfd) {
-  bool cf_in_not_found = false;
+  bool do_not_open_cf = false;
   bool cf_in_builders = false;
-  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+  CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
 
   assert(cfd != nullptr);
   *cfd = nullptr;
   const std::string& cf_name = edit.GetColumnFamilyName();
   Status s;
-  if (cf_in_builders || cf_in_not_found) {
+  if (cf_in_builders || do_not_open_cf) {
     s = Status::Corruption("MANIFEST adding the same column family twice: " +
                            cf_name);
   }
@@ -239,7 +241,7 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
         cf_name.compare(kPersistentStatsColumnFamilyName) == 0;
     if (cf_options == name_to_options_.end() &&
         !is_persistent_stats_column_family) {
-      column_families_not_found_.emplace(edit.GetColumnFamily(), cf_name);
+      do_not_open_column_families_.emplace(edit.GetColumnFamily(), cf_name);
     } else {
       if (is_persistent_stats_column_family) {
         ColumnFamilyOptions cfo;
@@ -256,9 +258,9 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
 
 Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
                                               ColumnFamilyData** cfd) {
-  bool cf_in_not_found = false;
+  bool do_not_open_cf = false;
   bool cf_in_builders = false;
-  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+  CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
 
   assert(cfd != nullptr);
   *cfd = nullptr;
@@ -266,8 +268,8 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
   Status s;
   if (cf_in_builders) {
     tmp_cfd = DestroyCfAndCleanup(edit);
-  } else if (cf_in_not_found) {
-    column_families_not_found_.erase(edit.GetColumnFamily());
+  } else if (do_not_open_cf) {
+    do_not_open_column_families_.erase(edit.GetColumnFamily());
   } else {
     s = Status::Corruption("MANIFEST - dropping non-existing column family");
   }
@@ -288,22 +290,20 @@ Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
 
 Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
                                             ColumnFamilyData** cfd) {
-  bool cf_in_not_found = false;
+  bool do_not_open_cf = false;
   bool cf_in_builders = false;
-  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+  CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
 
   assert(cfd != nullptr);
   *cfd = nullptr;
   Status s;
-  if (!cf_in_not_found) {
+  if (!do_not_open_cf) {
     if (!cf_in_builders) {
       s = Status::Corruption(
           "MANIFEST record referencing unknown column family");
     }
     ColumnFamilyData* tmp_cfd = nullptr;
     if (s.ok()) {
-      auto builder_iter = builders_.find(edit.GetColumnFamily());
-      assert(builder_iter != builders_.end());
       tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
           edit.GetColumnFamily());
       assert(tmp_cfd != nullptr);
@@ -318,56 +318,33 @@ Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
       if (!s.ok()) {
         return s;
       }
-      s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
-      if (s.ok()) {
-        s = builder_iter->second->version_builder()->Apply(&edit);
-      }
+      s = MaybeCreateVersionBeforeApplyEdit(edit, tmp_cfd,
+
/*force_create_version=*/false); } *cfd = tmp_cfd; } return s; } -// TODO maybe cache the computation result -bool VersionEditHandler::HasMissingFiles() const { - bool ret = false; - for (const auto& elem : cf_to_missing_files_) { - const auto& missing_files = elem.second; - if (!missing_files.empty()) { - ret = true; - break; - } - } - if (!ret) { - for (const auto& elem : cf_to_missing_blob_files_high_) { - if (elem.second != kInvalidBlobFileNumber) { - ret = true; - break; - } - } - } - return ret; -} - void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, - bool* cf_in_not_found, + bool* do_not_open_cf, bool* cf_in_builders) const { - assert(cf_in_not_found != nullptr); + assert(do_not_open_cf != nullptr); assert(cf_in_builders != nullptr); // Not found means that user didn't supply that column // family option AND we encountered column family add // record. Once we encounter column family drop record, // we will delete the column family from - // column_families_not_found. + // do_not_open_column_families_. uint32_t cf_id = edit.GetColumnFamily(); - bool in_not_found = column_families_not_found_.find(cf_id) != - column_families_not_found_.end(); + bool in_do_not_open = do_not_open_column_families_.find(cf_id) != + do_not_open_column_families_.end(); // in builders means that user supplied that column family // option AND that we encountered column family add record bool in_builders = builders_.find(cf_id) != builders_.end(); // They cannot both be true - assert(!(in_not_found && in_builders)); - *cf_in_not_found = in_not_found; + assert(!(in_do_not_open && in_builders)); + *do_not_open_cf = in_do_not_open; *cf_in_builders = in_builders; } @@ -396,9 +373,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, // There were some column families in the MANIFEST that weren't specified // in the argument. 
This is OK in read_only mode if (s->ok() && MustOpenAllColumnFamilies() && - !column_families_not_found_.empty()) { + !do_not_open_column_families_.empty()) { std::string msg; - for (const auto& cf : column_families_not_found_) { + for (const auto& cf : do_not_open_column_families_) { msg.append(", "); msg.append(cf.second); } @@ -453,7 +430,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, } assert(cfd->initialized()); VersionEdit edit; - *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true); + *s = MaybeCreateVersionBeforeApplyEdit(edit, cfd, + /*force_create_version=*/true); if (!s->ok()) { break; } @@ -498,13 +476,9 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit( assert(cfd != nullptr); cfd->set_initialized(); assert(builders_.find(cf_id) == builders_.end()); - builders_.emplace(cf_id, - VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd))); - if (track_found_and_missing_files_) { - cf_to_found_files_.emplace(cf_id, std::unordered_set()); - cf_to_missing_files_.emplace(cf_id, std::unordered_set()); - cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber); - } + builders_.emplace(cf_id, VersionBuilderUPtr(new BaseReferencedVersionBuilder( + cfd, this, track_found_and_missing_files_, + allow_incomplete_valid_version_))); return cfd; } @@ -514,21 +488,6 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( auto builder_iter = builders_.find(cf_id); assert(builder_iter != builders_.end()); builders_.erase(builder_iter); - if (track_found_and_missing_files_) { - auto found_files_iter = cf_to_found_files_.find(cf_id); - assert(found_files_iter != cf_to_found_files_.end()); - cf_to_found_files_.erase(found_files_iter); - - auto missing_files_iter = cf_to_missing_files_.find(cf_id); - assert(missing_files_iter != cf_to_missing_files_.end()); - cf_to_missing_files_.erase(missing_files_iter); - - auto missing_blob_files_high_iter = - cf_to_missing_blob_files_high_.find(cf_id); - assert(missing_blob_files_high_iter != - cf_to_missing_blob_files_high_.end()); - cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); - } ColumnFamilyData* ret = version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id); assert(ret != nullptr); @@ -538,15 +497,14 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( return ret; } -Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, - ColumnFamilyData* cfd, - bool force_create_version) { +Status VersionEditHandler::MaybeCreateVersionBeforeApplyEdit( + const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { assert(cfd->initialized()); Status s; + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* builder = builder_iter->second->version_builder(); if (force_create_version) { - auto builder_iter = builders_.find(cfd->GetID()); - assert(builder_iter != builders_.end()); - auto* builder = builder_iter->second->version_builder(); auto* v = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, version_set_->current_version_number_++, @@ -562,6 +520,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, delete v; } } + s = builder->Apply(&edit); return s; } @@ -731,12 +690,13 @@ Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles( VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, - 
const ReadOptions& read_options, + const ReadOptions& read_options, bool allow_incomplete_valid_version, EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_found_and_missing_files=*/true, /*no_error_if_files_missing=*/true, io_tracer, - read_options, epoch_number_requirement) {} + read_options, allow_incomplete_valid_version, + epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& cfid_and_version : atomic_update_versions_) { @@ -762,7 +722,8 @@ Status VersionEditHandlerPointInTime::OnAtomicGroupReplayBegin() { assert(!cfd->IsDropped()); assert(cfd->initialized()); VersionEdit edit; - Status s = MaybeCreateVersion(edit, cfd, true /* force_create_version */); + Status s = MaybeCreateVersionBeforeApplyEdit( + edit, cfd, true /* force_create_version */); if (!s.ok()) { return s; } @@ -824,17 +785,17 @@ void VersionEditHandlerPointInTime::CheckIterationResult( } assert(cfd->initialized()); auto v_iter = versions_.find(cfd->GetID()); + auto builder_iter = builders_.find(cfd->GetID()); if (v_iter != versions_.end()) { assert(v_iter->second != nullptr); + assert(builder_iter != builders_.end()); version_set_->AppendVersion(cfd, v_iter->second); versions_.erase(v_iter); // Let's clear found_files, since any files in that are part of the // installed Version. Any files that got obsoleted would have already // been moved to intermediate_files_ - auto found_files_iter = cf_to_found_files_.find(cfd->GetID()); - assert(found_files_iter != cf_to_found_files_.end()); - found_files_iter->second.clear(); + builder_iter->second->version_builder()->ClearFoundFiles(); } } } else { @@ -863,147 +824,50 @@ ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup( return cfd; } -Status VersionEditHandlerPointInTime::MaybeCreateVersion( +Status VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit( const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { - TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1"); - TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"); + TEST_SYNC_POINT( + "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:" + "Begin1"); + TEST_SYNC_POINT( + "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:" + "Begin2"); assert(cfd != nullptr); if (!force_create_version) { assert(edit.GetColumnFamily() == cfd->GetID()); } - auto found_files_iter = cf_to_found_files_.find(cfd->GetID()); - assert(found_files_iter != cf_to_found_files_.end()); - std::unordered_set& found_files = found_files_iter->second; - - auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); - assert(missing_files_iter != cf_to_missing_files_.end()); - std::unordered_set& missing_files = missing_files_iter->second; - - auto missing_blob_files_high_iter = - cf_to_missing_blob_files_high_.find(cfd->GetID()); - assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); - const uint64_t prev_missing_blob_file_high = - missing_blob_files_high_iter->second; - - VersionBuilder* builder = nullptr; - - if (prev_missing_blob_file_high != kInvalidBlobFileNumber) { - auto builder_iter = builders_.find(cfd->GetID()); - assert(builder_iter != builders_.end()); - builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - } - - // At this point, we have not yet applied the new version edits read from the - // MANIFEST. 
We check whether we have any missing table and blob files. - const bool prev_has_missing_files = - !missing_files.empty() || - (prev_missing_blob_file_high != kInvalidBlobFileNumber && - prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber()); - - for (const auto& file : edit.GetDeletedFiles()) { - uint64_t file_num = file.second; - auto fiter = missing_files.find(file_num); - if (fiter != missing_files.end()) { - missing_files.erase(fiter); - } else { - fiter = found_files.find(file_num); - // Only mark new files added during this catchup attempt for deletion. - // These files were never installed in VersionStorageInfo. - // Already referenced files that are deleted by a VersionEdit will - // be added to the VersionStorageInfo's obsolete files when the old - // version is dereferenced. - if (fiter != found_files.end()) { - intermediate_files_.emplace_back( - MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num)); - found_files.erase(fiter); - } - } - } - - assert(!cfd->ioptions()->cf_paths.empty()); - Status s; - for (const auto& elem : edit.GetNewFiles()) { - int level = elem.first; - const FileMetaData& meta = elem.second; - const FileDescriptor& fd = meta.fd; - uint64_t file_num = fd.GetNumber(); - const std::string fpath = - MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); - s = VerifyFile(cfd, fpath, level, meta); - if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { - missing_files.insert(file_num); - if (s.IsCorruption()) { - found_files.insert(file_num); - } - s = Status::OK(); - } else if (!s.ok()) { - break; - } else { - found_files.insert(file_num); - } - } - - uint64_t missing_blob_file_num = prev_missing_blob_file_high; - for (const auto& elem : edit.GetBlobFileAdditions()) { - uint64_t file_num = elem.GetBlobFileNumber(); - s = VerifyBlobFile(cfd, file_num, elem); - if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { - missing_blob_file_num = std::max(missing_blob_file_num, file_num); - s = Status::OK(); - } else if (!s.ok()) { - break; - } - } - - bool has_missing_blob_files = false; - if (missing_blob_file_num != kInvalidBlobFileNumber && - missing_blob_file_num >= prev_missing_blob_file_high) { - missing_blob_files_high_iter->second = missing_blob_file_num; - has_missing_blob_files = true; - } else if (missing_blob_file_num < prev_missing_blob_file_high) { - assert(false); - } - - // We still have not applied the new version edit, but have tried to add new - // table and blob files after verifying their presence and consistency. - // Therefore, we know whether we will see new missing table and blob files - // later after actually applying the version edit. We perform the check here - // and record the result. - const bool has_missing_files = - !missing_files.empty() || has_missing_blob_files; bool missing_info = !version_edit_params_.HasLogNumber() || !version_edit_params_.HasNextFile() || !version_edit_params_.HasLastSequence(); - // Create version before apply edit. The version will represent the state - // before applying the version edit. 
+ Status s; + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + VersionBuilder* builder = builder_iter->second->version_builder(); + const bool valid_pit_before_edit = builder->ValidVersionAvailable(); + builder->CreateOrReplaceSavePoint(); + s = builder->Apply(&edit); + const bool valid_pit_after_edit = builder->ValidVersionAvailable(); + // A new version will be created if: // 1) no error has occurred so far, and // 2) log_number_, next_file_number_ and last_sequence_ are known, and // 3) not in an AtomicGroup // 4) any of the following: - // a) no missing file before, but will have missing file(s) after applying - // this version edit. - // b) no missing file after applying the version edit, and the caller - // explicitly request that a new version be created. + // a) a valid Version is available before applying the edit + // and a valid Version is not available after the edit. + // b) a valid Version is available after the edit and the + // caller explicitly request that a new version be created. if (s.ok() && !missing_info && !in_atomic_group_ && - ((has_missing_files && !prev_has_missing_files) || - (!has_missing_files && force_create_version))) { - if (!builder) { - auto builder_iter = builders_.find(cfd->GetID()); - assert(builder_iter != builders_.end()); - builder = builder_iter->second->version_builder(); - assert(builder); - } - + ((!valid_pit_after_edit && valid_pit_before_edit) || + (valid_pit_after_edit && force_create_version))) { const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cf_opts_ptr, io_tracer_, version_set_->current_version_number_++, epoch_number_requirement_); - s = builder->LoadTableHandlers( + s = builder->LoadSavePointTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), @@ -1015,7 +879,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( } return s; } - s = builder->SaveTo(version->storage_info()); + s = builder->SaveSavePointTo(version->storage_info()); if (s.ok()) { if (AtomicUpdateVersionsContains(cfd->GetID())) { AtomicUpdateVersionsPut(version); @@ -1038,6 +902,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( delete version; } } + + builder->ClearSavePoint(); return s; } @@ -1072,6 +938,15 @@ Status VersionEditHandlerPointInTime::LoadTables( return Status::OK(); } +bool VersionEditHandlerPointInTime::HasMissingFiles() const { + for (const auto& builder : builders_) { + if (builder.second->version_builder()->HasMissingFiles()) { + return true; + } + } + return false; +} + bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() { return atomic_update_versions_missing_ == 0; } @@ -1145,8 +1020,9 @@ Status ManifestTailer::Initialize() { Version* base_version = dummy_version->Next(); assert(base_version); base_version->Ref(); - VersionBuilderUPtr new_builder( - new BaseReferencedVersionBuilder(default_cfd, base_version)); + VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder( + default_cfd, base_version, this, track_found_and_missing_files_, + allow_incomplete_valid_version_)); builder_iter->second = std::move(new_builder); initialized_ = true; @@ -1189,8 +1065,8 @@ Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, Version* base_version = dummy_version->Next(); assert(base_version); base_version->Ref(); - VersionBuilderUPtr new_builder( - new 
BaseReferencedVersionBuilder(tmp_cfd, base_version));
+    VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
+        tmp_cfd, base_version, this, track_found_and_missing_files_));
     builder_iter->second = std::move(new_builder);
 
 #ifndef NDEBUG
@@ -1213,6 +1089,18 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
   }
 }
 
+std::vector<std::string> ManifestTailer::GetAndClearIntermediateFiles() {
+  std::vector<std::string> res;
+  for (const auto& builder : builders_) {
+    auto& files =
+        builder.second->version_builder()->GetAndClearIntermediateFiles();
+    res.insert(res.end(), std::make_move_iterator(files.begin()),
+               std::make_move_iterator(files.end()));
+    files.erase(files.begin(), files.end());
+  }
+  return res;
+}
+
 Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
                                   const std::string& fpath, int level,
                                   const FileMetaData& fmeta) {
diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h
index e19ead31df..f3637ae730 100644
--- a/db/version_edit_handler.h
+++ b/db/version_edit_handler.h
@@ -100,7 +100,9 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
 // A class used for scanning MANIFEST file.
 // VersionEditHandler reads a MANIFEST file, parses the version edits, and
 // builds the version set's in-memory state, e.g. the version storage info for
-// the versions of column families.
+// the versions of column families. It replays all the version edits in one
+// MANIFEST file to build the end version.
+//
 // To use this class and its subclasses,
 // 1. Create an object of VersionEditHandler or its subclasses.
 //    VersionEditHandler handler(read_only, column_families, version_set,
@@ -119,13 +121,14 @@ class VersionEditHandler : public VersionEditHandlerBase {
                      VersionSet* version_set, bool track_found_and_missing_files,
                      bool no_error_if_files_missing,
                      const std::shared_ptr<IOTracer>& io_tracer,
-                     const ReadOptions& read_options,
+                     const ReadOptions& read_options, bool allow_incomplete_valid_version,
                      EpochNumberRequirement epoch_number_requirement =
                          EpochNumberRequirement::kMustPresent)
       : VersionEditHandler(read_only, column_families, version_set,
                            track_found_and_missing_files,
                            no_error_if_files_missing, io_tracer, read_options,
                            /*skip_load_table_files=*/false,
+                           allow_incomplete_valid_version,
                            epoch_number_requirement) {}
 
   ~VersionEditHandler() override {}
@@ -134,14 +137,24 @@ class VersionEditHandler : public VersionEditHandlerBase {
     return version_edit_params_;
   }
 
-  bool HasMissingFiles() const;
-
   void GetDbId(std::string* db_id) const {
     if (db_id && version_edit_params_.HasDbId()) {
       *db_id = version_edit_params_.GetDbId();
     }
   }
 
+  virtual Status VerifyFile(ColumnFamilyData* /*cfd*/,
+                            const std::string& /*fpath*/, int /*level*/,
+                            const FileMetaData& /*fmeta*/) {
+    return Status::OK();
+  }
+
+  virtual Status VerifyBlobFile(ColumnFamilyData* /*cfd*/,
+                                uint64_t /*blob_file_num*/,
+                                const BlobFileAddition& /*blob_addition*/) {
+    return Status::OK();
+  }
+
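The two virtual hooks above establish a soft-failure contract: when a subclass reports a file as unreadable, replay continues and the file is tracked as missing instead of failing recovery outright. A sketch of the intended contract, condensed from the point-in-time file checks elsewhere in this patch; `MarkFileMissing` is a hypothetical stand-in for the missing-file bookkeeping that now lives in VersionBuilder:

// Sketch only: not-found and corruption are downgraded to "missing".
Status CheckOneNewFile(VersionEditHandler& handler, ColumnFamilyData* cfd,
                       const std::string& fpath, int level,
                       const FileMetaData& meta, uint64_t file_num) {
  Status s = handler.VerifyFile(cfd, fpath, level, meta);
  if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
    MarkFileMissing(file_num);  // hypothetical bookkeeping helper
    s = Status::OK();           // keep replaying subsequent edits
  }
  // Any other non-OK status (e.g. a real I/O error) fails the replay.
  return s;
}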
  protected:
   explicit VersionEditHandler(
       bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
@@ -149,6 +162,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
       bool no_error_if_files_missing,
       const std::shared_ptr<IOTracer>& io_tracer,
       const ReadOptions& read_options, bool skip_load_table_files,
+      bool allow_incomplete_valid_version,
       EpochNumberRequirement epoch_number_requirement =
           EpochNumberRequirement::kMustPresent);
 
@@ -166,7 +180,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
 
   Status Initialize() override;
 
-  void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+  void CheckColumnFamilyId(const VersionEdit& edit, bool* do_not_open_cf,
                            bool* cf_in_builders) const;
 
   void CheckIterationResult(const log::Reader& reader, Status* s) override;
@@ -176,9 +190,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
 
   virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
 
-  virtual Status MaybeCreateVersion(const VersionEdit& edit,
-                                    ColumnFamilyData* cfd,
-                                    bool force_create_version);
+  virtual Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
+                                                   ColumnFamilyData* cfd,
+                                                   bool force_create_version);
 
   virtual Status LoadTables(ColumnFamilyData* cfd,
                             bool prefetch_index_and_filter_in_cache,
@@ -191,21 +205,23 @@ class VersionEditHandler : public VersionEditHandlerBase {
   VersionSet* version_set_;
   std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
   std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
-  // Keeps track of column families in manifest that were not found in
-  // column families parameters. if those column families are not dropped
-  // by subsequent manifest records, Recover() will return failure status.
-  std::unordered_map<uint32_t, std::string> column_families_not_found_;
-  VersionEditParams version_edit_params_;
   const bool track_found_and_missing_files_;
-  std::unordered_map<uint32_t, std::unordered_set<uint64_t>> cf_to_found_files_;
-  std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
-      cf_to_missing_files_;
-  std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+  // Keeps track of column families in the MANIFEST that were not found in the
+  // column families parameters; that is, the user asked that these column
+  // families not be opened. In non-read-only mode, if those column families
+  // are not dropped by subsequent MANIFEST records, Recover() will return a
+  // failure status.
+  std::unordered_map<uint32_t, std::string> do_not_open_column_families_;
+  VersionEditParams version_edit_params_;
   bool no_error_if_files_missing_;
   std::shared_ptr<IOTracer> io_tracer_;
   bool skip_load_table_files_;
   bool initialized_;
   std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+  // If false, only a complete Version for which all the files constituting it
+  // can be found is considered a valid Version. If true, besides a complete
+  // Version, an incomplete Version with only a suffix of L0 files missing is
+  // also considered valid, provided the Version is never edited in an atomic
+  // group.
+  const bool allow_incomplete_valid_version_;
   EpochNumberRequirement epoch_number_requirement_;
   std::unordered_set<uint32_t> cfds_to_mark_no_udt_;
@@ -226,8 +242,18 @@ class VersionEditHandler : public VersionEditHandlerBase {
 };
 
 // A class similar to its base class, i.e. VersionEditHandler.
-// VersionEditHandlerPointInTime restores the versions to the most recent point
-// in time such that at this point, the version does not have missing files.
+// Unlike VersionEditHandler, which only aims to build the end version, this
+// class supports building the most recent point in time version. A point in
+// time version is a version for which no files are missing, or, if
+// `allow_incomplete_valid_version` is true, only a suffix of L0 files (and
+// their associated blob files) are missing.
+//
+// Building a point in time version when the end version is not available can
+// be useful for best efforts recovery (options.best_efforts_recovery), which
+// uses this class and sets `allow_incomplete_valid_version` to true.
+// It's also useful for secondary/follower instances, for which the end
+// version could be transiently unavailable. These two cases use the subclass
+// `ManifestTailer` and set `allow_incomplete_valid_version` to false.
 //
 // Not thread-safe, external synchronization is necessary if an object of
 // VersionEditHandlerPointInTime is shared by multiple threads.
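For orientation, this is how the point in time mode is reached from the public API. A minimal usage sketch, assuming an existing database at an illustrative path; `best_efforts_recovery` is the released option name, and its routing through `VersionSet::TryRecover` is documented later in this patch:

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  // Recover to the most recent valid point in time view instead of failing
  // when the end version cannot be rebuilt from the MANIFEST.
  options.best_efforts_recovery = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);
  if (s.ok()) {
    delete db;
  }
  return s.ok() ? 0 : 1;
}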
@@ -236,28 +262,32 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
   VersionEditHandlerPointInTime(
       bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
       VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
-      const ReadOptions& read_options,
+      const ReadOptions& read_options, bool allow_incomplete_valid_version,
       EpochNumberRequirement epoch_number_requirement =
           EpochNumberRequirement::kMustPresent);
   ~VersionEditHandlerPointInTime() override;
 
+  bool HasMissingFiles() const;
+
+  virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& fmeta) override;
+  virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+                                const BlobFileAddition& blob_addition) override;
+
  protected:
   Status OnAtomicGroupReplayBegin() override;
   Status OnAtomicGroupReplayEnd() override;
 
   void CheckIterationResult(const log::Reader& reader, Status* s) override;
   ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
-  // `MaybeCreateVersion(..., false)` creates a version upon a negative edge
-  // trigger (transition from valid to invalid).
+  // `MaybeCreateVersionBeforeApplyEdit(..., false)` creates a version upon a
+  // negative edge trigger (transition from valid to invalid).
   //
-  // `MaybeCreateVersion(..., true)` creates a version on a positive level
-  // trigger (state is valid).
-  Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
-                            bool force_create_version) override;
-  virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
-                            int level, const FileMetaData& fmeta);
-  virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
-                                const BlobFileAddition& blob_addition);
+  // `MaybeCreateVersionBeforeApplyEdit(..., true)` creates a version on a
+  // positive level trigger (state is valid).
+  Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
+                                           ColumnFamilyData* cfd,
+                                           bool force_create_version) override;
 
   Status LoadTables(ColumnFamilyData* cfd,
                     bool prefetch_index_and_filter_in_cache,
@@ -275,8 +305,6 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
 
   bool in_atomic_group_ = false;
 
-  std::vector<std::string> intermediate_files_;
-
  private:
   bool AtomicUpdateVersionsCompleted();
   bool AtomicUpdateVersionsContains(uint32_t cfid);
@@ -292,6 +320,12 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
   void AtomicUpdateVersionsApply();
 };
 
+// A class similar to `VersionEditHandlerPointInTime` that parses the MANIFEST
+// and builds a point in time version.
+// `ManifestTailer` supports reading one MANIFEST file in multiple tailing
+// attempts, and supports switching to a different MANIFEST after
+// `PrepareToReadNewManifest` is called. This class is used by secondary and
+// follower instances.
 class ManifestTailer : public VersionEditHandlerPointInTime {
  public:
   explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
@@ -302,9 +336,13 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
               EpochNumberRequirement::kMustPresent)
       : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
                                       version_set, io_tracer, read_options,
+                                      /*allow_incomplete_valid_version=*/false,
                                       epoch_number_requirement),
         mode_(Mode::kRecovery) {}
 
+  Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+                    const FileMetaData& fmeta) override;
+
   void PrepareToReadNewManifest() {
     initialized_ = false;
     ClearReadBuffer();
@@ -314,9 +352,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
     return cfds_changed_;
   }
 
-  std::vector<std::string>& GetIntermediateFiles() {
-    return intermediate_files_;
-  }
+  std::vector<std::string> GetAndClearIntermediateFiles();
 
  protected:
   Status Initialize() override;
@@ -329,9 +365,6 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
 
   void CheckIterationResult(const log::Reader& reader, Status* s) override;
 
-  Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
-                    const FileMetaData& fmeta) override;
-
   enum Mode : uint8_t {
     kRecovery = 0,
     kCatchUp = 1,
@@ -352,7 +385,9 @@ class DumpManifestHandler : public VersionEditHandler {
             /*read_only=*/true, column_families, version_set,
             /*track_found_and_missing_files=*/false,
             /*no_error_if_files_missing=*/false, io_tracer, read_options,
-            /*skip_load_table_files=*/true),
+            /*skip_load_table_files=*/true,
+            /*allow_incomplete_valid_version=*/false,
+            /*epoch_number_requirement=*/EpochNumberRequirement::kMustPresent),
         verbose_(verbose),
         hex_(hex),
         json_(json),
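A hedged sketch of the catch-up cycle a secondary or follower instance might drive through `ManifestTailer`, using only the APIs above (reader construction and retry policy are elided; the wrapper function is illustrative, not part of this patch):

// Sketch only: one tailing attempt against the current MANIFEST.
Status CatchUpOnce(ManifestTailer& tailer, log::Reader& manifest_reader,
                   std::vector<std::string>* files_to_delete) {
  Status log_read_status;
  tailer.Iterate(manifest_reader, &log_read_status);  // replay new records
  Status s = tailer.status();
  if (s.ok()) {
    // Files that were added and then obsoleted between attempts were never
    // installed in a Version; they are safe to delete now.
    *files_to_delete = tailer.GetAndClearIntermediateFiles();
  }
  return s;
}
// On a MANIFEST roll, call tailer.PrepareToReadNewManifest() and retry.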
diff --git a/db/version_set.cc b/db/version_set.cc
index 1be0468a5f..e81165a3d2 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5511,6 +5511,10 @@ Status VersionSet::ProcessManifestWrites(
   std::unique_ptr<log::Writer> new_desc_log_ptr;
   {
     FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+    // DB option (in file_options_) takes precedence when not kUnknown
+    if (file_options_.temperature != Temperature::kUnknown) {
+      opt_file_opts.temperature = file_options_.temperature;
+    }
     mu->Unlock();
     TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
     TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
@@ -5637,9 +5641,9 @@ Status VersionSet::ProcessManifestWrites(
       assert(manifest_io_status.ok());
     }
     if (s.ok() && new_descriptor_log) {
-      io_s = SetCurrentFile(write_options, fs_.get(), dbname_,
-                            pending_manifest_file_number_,
-                            dir_contains_current_file);
+      io_s = SetCurrentFile(
+          write_options, fs_.get(), dbname_, pending_manifest_file_number_,
+          file_options_.temperature, dir_contains_current_file);
       if (!io_s.ok()) {
         s = io_s;
         // Quarantine old manifest file in case new manifest file's CURRENT file
@@ -6080,7 +6084,8 @@ Status VersionSet::Recover(
   VersionEditHandler handler(
       read_only, column_families, const_cast<VersionSet*>(this),
       /*track_found_and_missing_files=*/false, no_error_if_files_missing,
-      io_tracer_, read_options, EpochNumberRequirement::kMightMissing);
+      io_tracer_, read_options, /*allow_incomplete_valid_version=*/false,
+      EpochNumberRequirement::kMightMissing);
   handler.Iterate(reader, &log_read_status);
   s = handler.status();
   if (s.ok()) {
@@ -6256,7 +6261,8 @@ Status VersionSet::TryRecoverFromOneManifest(
                      /*checksum=*/true, /*log_num=*/0);
   VersionEditHandlerPointInTime handler_pit(
       read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
-      read_options, EpochNumberRequirement::kMightMissing);
+      read_options, /*allow_incomplete_valid_version=*/true,
+      EpochNumberRequirement::kMightMissing);
 
   handler_pit.Iterate(reader, &s);
@@ -7477,7 +7483,7 @@ Status ReactiveVersionSet::ReadAndApply(
     *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
   }
   if (files_to_delete) {
-    *files_to_delete = std::move(manifest_tailer_->GetIntermediateFiles());
+    *files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles();
   }
 
   return s;
diff --git a/db/version_set.h b/db/version_set.h
index acbd70a7fe..9e80b3a4c0 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1277,6 +1277,15 @@ class VersionSet {
                  bool no_error_if_files_missing = false, bool is_retry = false,
                  Status* log_status = nullptr);
 
+  // Do a best-efforts recovery (Options.best_efforts_recovery=true) from all
+  // available MANIFEST files. Similar to `Recover`, with these differences:
+  // 1) Not only the latest MANIFEST can be used: if it is unavailable, or if
+  //    no successful recovery can be achieved with it, this function also
+  //    tries previous MANIFEST files, in reverse chronological order, until
+  //    a recovery succeeds.
+  // 2) This function doesn't just aim to recover to the latest version; if
+  //    that is not available, the most recent point in time version will be
+  //    saved in memory. Check the doc for `VersionEditHandlerPointInTime`
+  //    for more details.
   Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
                     bool read_only,
                     const std::vector<std::string>& files_in_dbname,
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index d4b748db7b..4f3665fba6 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 
+#include "db/blob/blob_log_writer.h"
 #include "db/db_impl/db_impl.h"
 #include "db/db_test_util.h"
 #include "db/log_writer.h"
@@ -1345,18 +1346,27 @@ class VersionSetTestBase {
     std::string key;  // the only key
     int level = 0;
     uint64_t epoch_number;
+    bool file_missing = false;
+    uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
 
     SstInfo(uint64_t file_num, const std::string& cf_name,
             const std::string& _key,
-            uint64_t _epoch_number = kUnknownEpochNumber)
-        : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {}
+            uint64_t _epoch_number = kUnknownEpochNumber,
+            bool _file_missing = false,
+            uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
+        : SstInfo(file_num, cf_name, _key, 0, _epoch_number, _file_missing,
+                  _oldest_blob_file_number) {}
     SstInfo(uint64_t file_num, const std::string& cf_name,
             const std::string& _key, int lvl,
-            uint64_t _epoch_number = kUnknownEpochNumber)
+            uint64_t _epoch_number = kUnknownEpochNumber,
+            bool _file_missing = false,
+            uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
         : file_number(file_num),
           column_family(cf_name),
           key(_key),
           level(lvl),
-          epoch_number(_epoch_number) {}
+          epoch_number(_epoch_number),
+          file_missing(_file_missing),
+          oldest_blob_file_number(_oldest_blob_file_number) {}
   };
 
   // Create dummy sst, return their metadata.
Note that only file name and size @@ -1395,22 +1405,32 @@ class VersionSetTestBase { ASSERT_NE(0, file_size); file_metas->emplace_back( file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, - Temperature::kUnknown, 0, 0, 0, info.epoch_number, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0, 0, /* user_defined_timestamps_persisted */ true); + Temperature::kUnknown, info.oldest_blob_file_number, 0, 0, + info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); + if (info.file_missing) { + ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr)); + } } } + void CreateCurrentFile() { + // Make "CURRENT" file point to the new manifest file. + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + Temperature::kUnknown, + /* dir_contains_current_file */ nullptr)); + } + // Create DB with 3 column families. void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); - // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); EXPECT_OK(versions_->Recover(column_families_, false)); EXPECT_EQ(column_families_.size(), @@ -2586,7 +2606,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -2598,7 +2618,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -2612,7 +2632,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -2628,7 +2648,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); + CreateCurrentFile(); } void SetupTestSyncPoints() { @@ -3394,8 +3414,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); EXPECT_EQ(column_families.size(), @@ -3417,7 +3436,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { cfd_to_drop->Ref(); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); mutex_.Lock(); - s = versions_->LogAndApply( + Status s = versions_->LogAndApply( cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options, write_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); @@ -3527,9 
+3546,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, TEST_F(EmptyDefaultCfNewManifest, Recover) { PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::vector column_families; @@ -3538,7 +3555,7 @@ TEST_F(EmptyDefaultCfNewManifest, Recover) { cf_options_); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest( + Status s = versions_->TryRecoverFromOneManifest( manifest_path, column_families, false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_FALSE(has_missing_table_file); @@ -3559,7 +3576,8 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_, + Temperature::kUnknown)); DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -3592,9 +3610,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3609,9 +3625,9 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, - read_only, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families, read_only, &db_id, + &has_missing_table_file); auto iter = std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); if (iter == cf_names.end()) { @@ -3637,9 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3685,9 +3699,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3744,9 +3756,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3802,9 +3812,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, - /* dir_contains_current_file */ nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3869,8 +3877,9 @@ INSTANTIATE_TEST_CASE_P( class VersionSetTestMissingFiles : public VersionSetTestBase, public testing::Test { public: - 
VersionSetTestMissingFiles() - : VersionSetTestBase("version_set_test_missing_files"), + explicit VersionSetTestMissingFiles( + const std::string& test_name = "version_set_test_missing_files") + : VersionSetTestBase(test_name), internal_comparator_( std::make_shared(options_.comparator)) {} @@ -3947,7 +3956,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, // This method updates last_sequence_. void WriteFileAdditionAndDeletionToManifest( uint32_t cf, const std::vector>& added_files, - const std::vector>& deleted_files) { + const std::vector>& deleted_files, + const std::vector& blob_files = {}) { VersionEdit edit; edit.SetColumnFamily(cf); for (const auto& elem : added_files) { @@ -3958,6 +3968,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, int level = elem.first; edit.DeleteFile(level, elem.second); } + for (const auto& elem : blob_files) { + edit.AddBlobFile(elem); + } edit.SetLastSequence(last_seqno_); ++last_seqno_; assert(log_writer_.get() != nullptr); @@ -4006,15 +4019,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_TRUE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4064,15 +4076,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, added_files, std::vector>()); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_TRUE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4118,15 +4129,14 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); - ASSERT_OK(s); + CreateCurrentFile(); std::string manifest_path; VerifyManifest(&manifest_path); std::string db_id; bool has_missing_table_file = false; - s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, - /*read_only=*/false, &db_id, - &has_missing_table_file); + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); ASSERT_OK(s); ASSERT_FALSE(has_missing_table_file); for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { @@ -4171,6 +4181,250 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { } } +class 
BestEffortsRecoverIncompleteVersionTest + : public VersionSetTestMissingFiles { + public: + BestEffortsRecoverIncompleteVersionTest() + : VersionSetTestMissingFiles("best_efforts_recover_incomplete_version") {} + + struct BlobInfo { + uint64_t file_number; + bool file_missing; + std::string key; + std::string blob; + BlobInfo(uint64_t _file_number, bool _file_missing, std::string _key, + std::string _blob) + : file_number(_file_number), + file_missing(_file_missing), + key(_key), + blob(_blob) {} + }; + + void CreateDummyBlobFiles(const std::vector& infos, + std::vector* blob_metas) { + for (const auto& info : infos) { + if (!info.file_missing) { + WriteDummyBlobFile(info.file_number, info.key, info.blob); + } + blob_metas->emplace_back( + info.file_number, 1 /*total_blob_count*/, + info.key.size() + info.blob.size() /*total_blob_bytes*/, + "" /*checksum_method*/, "" /*check_sum_value*/); + } + } + // Creates a test blob file that is valid so it can pass the + // `VersionEditHandlerPointInTime::VerifyBlobFile` check. + void WriteDummyBlobFile(uint64_t blob_file_number, const Slice& key, + const Slice& blob) { + ImmutableOptions options; + std::string blob_file_path = BlobFileName(dbname_, blob_file_number); + + std::unique_ptr file; + ASSERT_OK( + fs_->NewWritableFile(blob_file_path, FileOptions(), &file, nullptr)); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), options.clock)); + + BlobLogWriter blob_log_writer(std::move(file_writer), options.clock, + /*statistics*/ nullptr, blob_file_number, + /*use_fsync*/ true, + /*do_flush*/ false); + + constexpr ExpirationRange expiration_range; + BlobLogHeader header(/*column_family_id*/ 0, kNoCompression, + /*has_ttl*/ false, expiration_range); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); + std::string compressed_blob; + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset, + &blob_offset)); + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + std::string checksum_method; + std::string checksum_value; + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); + } + + void RecoverFromManifestWithMissingFiles( + const std::vector>& added_files, + const std::vector& blob_files) { + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>(), + blob_files); + log_writer_.reset(); + CreateCurrentFile(); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + Status s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families_, + /*read_only=*/false, &db_id, &has_missing_table_file); + ASSERT_OK(s); + ASSERT_TRUE(has_missing_table_file); + } +}; + +TEST_F(BestEffortsRecoverIncompleteVersionTest, NonL0MissingFiles) { + std::vector sst_files = { + SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */, + 100 /* epoch_number */, true /* file_missing */), + SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */, + 101 /* epoch_number */, false /* file_missing */), + SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */, + 102 /* epoch_number */, false /* file_missing */), + }; + std::vector file_metas; + CreateDummyTableFiles(sst_files, &file_metas); + + std::vector> added_files; + for (size_t i = 0; i < sst_files.size(); i++) { + const 
auto& info = sst_files[i]; + const auto& meta = file_metas[i]; + added_files.emplace_back(info.level, meta); + } + RecoverFromManifestWithMissingFiles(added_files, + std::vector()); + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingNonSuffixL0Files) { + std::vector sst_files = { + SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */, + 100 /* epoch_number */, false /* file_missing */), + SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */, + 101 /* epoch_number */, true /* file_missing */), + SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */, + 102 /* epoch_number */, false /* file_missing */), + }; + std::vector file_metas; + CreateDummyTableFiles(sst_files, &file_metas); + + std::vector> added_files; + for (size_t i = 0; i < sst_files.size(); i++) { + const auto& info = sst_files[i]; + const auto& meta = file_metas[i]; + added_files.emplace_back(info.level, meta); + } + RecoverFromManifestWithMissingFiles(added_files, + std::vector()); + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingBlobFiles) { + std::vector sst_files = { + SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */, false /* file_missing */, + 102 /*oldest_blob_file_number*/), + SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */, + 101 /* epoch_number */, false /* file_missing */, + 103 /*oldest_blob_file_number*/), + }; + std::vector file_metas; + CreateDummyTableFiles(sst_files, &file_metas); + + std::vector blob_files = { + BlobInfo(102, true /*file_missing*/, "a", "blob1"), + BlobInfo(103, true /*file_missing*/, "a", "blob2"), + }; + std::vector blob_meta; + CreateDummyBlobFiles(blob_files, &blob_meta); + + std::vector> added_files; + for (size_t i = 0; i < sst_files.size(); i++) { + const auto& info = sst_files[i]; + const auto& meta = file_metas[i]; + added_files.emplace_back(info.level, meta); + } + RecoverFromManifestWithMissingFiles(added_files, blob_meta); + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingL0SuffixOnly) { + std::vector sst_files = { + SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */, + 100 /* epoch_number */, false /* file_missing */), + SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */, + 101 /* epoch_number */, false /* file_missing */), + SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */, + 102 /* epoch_number */, true /* file_missing */), + }; + std::vector file_metas; + CreateDummyTableFiles(sst_files, &file_metas); + + std::vector> added_files; + for (size_t i = 0; i < sst_files.size(); i++) { + const auto& info = sst_files[i]; + const auto& meta = file_metas[i]; + added_files.emplace_back(info.level, meta); + } + RecoverFromManifestWithMissingFiles(added_files, + std::vector()); + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(2, all_table_files.size()); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + ASSERT_EQ(1, 
vstorage->LevelFiles(0).size()); + ASSERT_EQ(1, vstorage->LevelFiles(1).size()); +} + +TEST_F(BestEffortsRecoverIncompleteVersionTest, + MissingL0SuffixAndTheirBlobFiles) { + std::vector sst_files = { + SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */, + 100 /* epoch_number */, false /* file_missing */), + SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */, + 101 /* epoch_number */, false /* file_missing */, + 103 /*oldest_blob_file_number*/), + SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */, + 102 /* epoch_number */, true /* file_missing */, + 104 /*oldest_blob_file_number*/), + }; + std::vector file_metas; + CreateDummyTableFiles(sst_files, &file_metas); + + std::vector blob_files = { + BlobInfo(103, false /*file_missing*/, "a", "blob1"), + BlobInfo(104, true /*file_missing*/, "a", "blob2"), + }; + std::vector blob_meta; + CreateDummyBlobFiles(blob_files, &blob_meta); + + std::vector> added_files; + for (size_t i = 0; i < sst_files.size(); i++) { + const auto& info = sst_files[i]; + const auto& meta = file_metas[i]; + added_files.emplace_back(info.level, meta); + } + RecoverFromManifestWithMissingFiles(added_files, blob_meta); + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(2, all_table_files.size()); + ASSERT_EQ(1, all_blob_files.size()); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + ASSERT_EQ(1, vstorage->LevelFiles(0).size()); + ASSERT_EQ(1, vstorage->LevelFiles(1).size()); + ASSERT_EQ(1, vstorage->GetBlobFiles().size()); +} + class ChargeFileMetadataTest : public DBTestBase { public: ChargeFileMetadataTest() diff --git a/db/write_batch.cc b/db/write_batch.cc index 7294d9845f..3820dccd05 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -929,15 +929,19 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, } if (0 == ts_sz) { - return WriteBatchInternal::Put(this, cf_id, key, value); + s = WriteBatchInternal::Put(this, cf_id, key, value); + } else { + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + SliceParts(&value, 1)); } - - needs_in_place_update_ts_ = true; - has_key_with_ts_ = true; - std::string dummy_ts(ts_sz, '\0'); - std::array key_with_ts{{key, dummy_ts}}; - return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), - SliceParts(&value, 1)); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts_sz); + } + return s; } Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key, @@ -962,7 +966,7 @@ Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts, const Slice& value) { - const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + Status s = CheckColumnFamilyTimestampSize(column_family, ts); if (!s.ok()) { return s; } @@ -970,8 +974,12 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, assert(column_family); uint32_t cf_id = column_family->GetID(); std::array key_with_ts{{key, ts}}; - return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), - SliceParts(&value, 1)); + s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + 
SliceParts(&value, 1)); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts.size()); + } + return s; } Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, @@ -1039,7 +1047,11 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, } if (ts_sz == 0) { - return WriteBatchInternal::Put(this, cf_id, key, value); + s = WriteBatchInternal::Put(this, cf_id, key, value); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts_sz); + } + return s; } return Status::InvalidArgument( @@ -1246,20 +1258,24 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { } if (0 == ts_sz) { - return WriteBatchInternal::Delete(this, cf_id, key); + s = WriteBatchInternal::Delete(this, cf_id, key); + } else { + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + s = WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); } - - needs_in_place_update_ts_ = true; - has_key_with_ts_ = true; - std::string dummy_ts(ts_sz, '\0'); - std::array key_with_ts{{key, dummy_ts}}; - return WriteBatchInternal::Delete(this, cf_id, - SliceParts(key_with_ts.data(), 2)); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts_sz); + } + return s; } Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts) { - const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + Status s = CheckColumnFamilyTimestampSize(column_family, ts); if (!s.ok()) { return s; } @@ -1267,8 +1283,12 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key, has_key_with_ts_ = true; uint32_t cf_id = column_family->GetID(); std::array key_with_ts{{key, ts}}; - return WriteBatchInternal::Delete(this, cf_id, - SliceParts(key_with_ts.data(), 2)); + s = WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts.size()); + } + return s; } Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, @@ -1313,7 +1333,11 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, } if (0 == ts_sz) { - return WriteBatchInternal::Delete(this, cf_id, key); + s = WriteBatchInternal::Delete(this, cf_id, key); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts_sz); + } + return s; } return Status::InvalidArgument( @@ -1361,20 +1385,24 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, } if (0 == ts_sz) { - return WriteBatchInternal::SingleDelete(this, cf_id, key); + s = WriteBatchInternal::SingleDelete(this, cf_id, key); + } else { + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + s = WriteBatchInternal::SingleDelete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); } - - needs_in_place_update_ts_ = true; - has_key_with_ts_ = true; - std::string dummy_ts(ts_sz, '\0'); - std::array key_with_ts{{key, dummy_ts}}; - return WriteBatchInternal::SingleDelete(this, cf_id, - SliceParts(key_with_ts.data(), 2)); + if (s.ok()) { + MaybeTrackTimestampSize(cf_id, ts_sz); + } + return s; } Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts) { - const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + Status s = CheckColumnFamilyTimestampSize(column_family, ts); if (!s.ok()) { return s; } @@ -1382,8 +1410,12 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, 
   assert(column_family);
   uint32_t cf_id = column_family->GetID();
   std::array<Slice, 2> key_with_ts{{key, ts}};
-  return WriteBatchInternal::SingleDelete(this, cf_id,
-                                          SliceParts(key_with_ts.data(), 2));
+  s = WriteBatchInternal::SingleDelete(this, cf_id,
+                                       SliceParts(key_with_ts.data(), 2));
+  if (s.ok()) {
+    MaybeTrackTimestampSize(cf_id, ts.size());
+  }
+  return s;
 }
 
 Status WriteBatchInternal::SingleDelete(WriteBatch* b,
@@ -1430,7 +1462,11 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
   }
 
   if (0 == ts_sz) {
-    return WriteBatchInternal::SingleDelete(this, cf_id, key);
+    s = WriteBatchInternal::SingleDelete(this, cf_id, key);
+    if (s.ok()) {
+      MaybeTrackTimestampSize(cf_id, ts_sz);
+    }
+    return s;
   }
 
   return Status::InvalidArgument(
@@ -1480,23 +1516,27 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
   }
 
   if (0 == ts_sz) {
-    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+    s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+  } else {
+    needs_in_place_update_ts_ = true;
+    has_key_with_ts_ = true;
+    std::string dummy_ts(ts_sz, '\0');
+    std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
+    std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
+    s = WriteBatchInternal::DeleteRange(this, cf_id,
+                                        SliceParts(begin_key_with_ts.data(), 2),
+                                        SliceParts(end_key_with_ts.data(), 2));
   }
-
-  needs_in_place_update_ts_ = true;
-  has_key_with_ts_ = true;
-  std::string dummy_ts(ts_sz, '\0');
-  std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
-  std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
-  return WriteBatchInternal::DeleteRange(
-      this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
-      SliceParts(end_key_with_ts.data(), 2));
+  if (s.ok()) {
+    MaybeTrackTimestampSize(cf_id, ts_sz);
+  }
+  return s;
 }
 
 Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
                                const Slice& begin_key, const Slice& end_key,
                                const Slice& ts) {
-  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  Status s = CheckColumnFamilyTimestampSize(column_family, ts);
   if (!s.ok()) {
     return s;
   }
@@ -1505,9 +1545,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
   uint32_t cf_id = column_family->GetID();
   std::array<Slice, 2> key_with_ts{{begin_key, ts}};
   std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
-  return WriteBatchInternal::DeleteRange(this, cf_id,
-                                         SliceParts(key_with_ts.data(), 2),
-                                         SliceParts(end_key_with_ts.data(), 2));
+  s = WriteBatchInternal::DeleteRange(this, cf_id,
+                                      SliceParts(key_with_ts.data(), 2),
+                                      SliceParts(end_key_with_ts.data(), 2));
+  if (s.ok()) {
+    MaybeTrackTimestampSize(cf_id, ts.size());
+  }
+  return s;
 }
 
 Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
@@ -1554,7 +1598,11 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
   }
 
   if (0 == ts_sz) {
-    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+    s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+    if (s.ok()) {
+      MaybeTrackTimestampSize(cf_id, ts_sz);
+    }
+    return s;
   }
 
   return Status::InvalidArgument(
@@ -1608,21 +1656,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
   }
 
   if (0 == ts_sz) {
-    return WriteBatchInternal::Merge(this, cf_id, key, value);
+    s = WriteBatchInternal::Merge(this, cf_id, key, value);
+  } else {
+    needs_in_place_update_ts_ = true;
+    has_key_with_ts_ = true;
+    std::string dummy_ts(ts_sz, '\0');
+    std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+    s = WriteBatchInternal::Merge(
        this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
   }
-
-  needs_in_place_update_ts_ = true;
-  has_key_with_ts_ = true;
-  std::string dummy_ts(ts_sz, '\0');
-  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
-
-  return WriteBatchInternal::Merge(
-      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+  if (s.ok()) {
+    MaybeTrackTimestampSize(cf_id, ts_sz);
+  }
+  return s;
 }
 
 Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
                          const Slice& ts, const Slice& value) {
-  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  Status s = CheckColumnFamilyTimestampSize(column_family, ts);
   if (!s.ok()) {
     return s;
   }
@@ -1630,8 +1682,12 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
   assert(column_family);
   uint32_t cf_id = column_family->GetID();
   std::array<Slice, 2> key_with_ts{{key, ts}};
-  return WriteBatchInternal::Merge(
-      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+  s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
+                                SliceParts(&value, 1));
+  if (s.ok()) {
+    MaybeTrackTimestampSize(cf_id, ts.size());
+  }
+  return s;
 }
 
 Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
@@ -1680,7 +1736,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
   }
 
   if (0 == ts_sz) {
-    return WriteBatchInternal::Merge(this, cf_id, key, value);
+    s = WriteBatchInternal::Merge(this, cf_id, key, value);
+    if (s.ok()) {
+      MaybeTrackTimestampSize(cf_id, ts_sz);
+    }
+    return s;
   }
 
   return Status::InvalidArgument(
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 5380fc2d58..67f82808fe 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -125,7 +125,6 @@ DECLARE_int32(level0_stop_writes_trigger);
 DECLARE_int32(block_size);
 DECLARE_int32(format_version);
 DECLARE_int32(index_block_restart_interval);
-DECLARE_bool(disable_auto_compactions);
 DECLARE_int32(max_background_compactions);
 DECLARE_int32(num_bottom_pri_threads);
 DECLARE_int32(compaction_thread_pool_adjust_interval);
@@ -151,6 +150,7 @@ DECLARE_bool(charge_filter_construction);
 DECLARE_bool(charge_table_reader);
 DECLARE_bool(charge_file_metadata);
 DECLARE_bool(charge_blob_cache);
+DECLARE_bool(decouple_partitioned_filters);
 DECLARE_int32(top_level_index_pinning);
 DECLARE_int32(partition_pinning);
 DECLARE_int32(unpartitioned_pinning);
@@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
 DECLARE_string(last_level_temperature);
 DECLARE_string(default_write_temperature);
 DECLARE_string(default_temperature);
+DECLARE_bool(paranoid_memory_checks);
 
 // Options for transaction dbs.
 // Use TransactionDB (a.k.a. Pessimistic Transaction DB)
@@ -318,7 +319,6 @@ DECLARE_int32(prepopulate_blob_cache);
 DECLARE_int32(approximate_size_one_in);
 DECLARE_bool(best_efforts_recovery);
 DECLARE_bool(skip_verifydb);
-DECLARE_bool(enable_compaction_filter);
 DECLARE_bool(paranoid_file_checks);
 DECLARE_bool(fail_if_options_file_error);
 DECLARE_uint64(batch_protection_bytes_per_key);
diff --git a/db_stress_tool/db_stress_compaction_filter.h b/db_stress_tool/db_stress_compaction_filter.h
index 408bb48f3e..c67b9f2073 100644
--- a/db_stress_tool/db_stress_compaction_filter.h
+++ b/db_stress_tool/db_stress_compaction_filter.h
@@ -49,7 +49,7 @@ class DbStressCompactionFilter : public CompactionFilter {
       return Decision::kKeep;
     }
     // Reaching here means we acquired the lock.
-
+    key_mutex->AssertHeld();
     bool key_exists = state_->Exists(cf_id_, key_num);
     const bool allow_overwrite = state_->AllowsOverwrite(key_num);
diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index 0da91d742a..d5fb3e6436 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -167,7 +167,10 @@ bool RunStressTestImpl(SharedState* shared) {
                                {FileType::kWalFile});
     }
   }
-  now = clock->NowMicros();
+  if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
+    Status s = stress->EnableAutoCompaction();
+    assert(s.ok());
+  }
   fprintf(stdout, "%s Starting database operations\n",
           clock->TimeToString(now / 1000000).c_str());
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 3fb2afa7ca..bb2d9d453e 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -380,6 +380,11 @@ DEFINE_bool(charge_blob_cache, false,
             "CacheEntryRoleOptions::charged of "
             "kBlobCache");
 
+DEFINE_bool(
+    decouple_partitioned_filters,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().decouple_partitioned_filters,
+    "Decouple filter partitioning from index partitioning.");
+
 DEFINE_int32(
     top_level_index_pinning,
     static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
@@ -1443,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
               "obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
              "normal max");
 
+DEFINE_bool(paranoid_memory_checks,
+            ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
+            "Sets CF option paranoid_memory_checks.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 7129c22946..cdd9f71708 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -45,6 +45,8 @@ DECLARE_int32(open_write_fault_one_in);
 DECLARE_int32(open_read_fault_one_in);
 DECLARE_int32(inject_error_severity);
+DECLARE_bool(disable_auto_compactions);
+DECLARE_bool(enable_compaction_filter);
 
 namespace ROCKSDB_NAMESPACE {
 class StressTest;
@@ -262,14 +264,10 @@ class SharedState {
   // This is useful for crash-recovery testing when the process may crash
   // before updating the corresponding expected value
   //
-  // It can fail and `*prepared` will be set to false if the previous write or
-  // delete is still in pending state (e.g, still in recovery for retryable IO
-  // errors). If succeeds,`*prepared` will be set to true
-  //
   // Requires external locking covering `key` in `cf` to prevent
   // concurrent write or delete to the same `key`.
-  PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
-    return expected_state_manager_->PreparePut(cf, key, prepared);
+  PendingExpectedValue PreparePut(int cf, int64_t key) {
+    return expected_state_manager_->PreparePut(cf, key);
   }
 
   // Does not require external locking.
@@ -281,31 +279,24 @@ class SharedState {
   // This is useful for crash-recovery testing when the process may crash
   // before updating the corresponding expected value
   //
-  // It can fail and `*prepared` will be set to false if the previous write or
-  // delete is still in pending state (e.g, still in recovery for retryable IO
-  // errors). If succeeds,`*prepared` will be set to true
-  //
   // Requires external locking covering `key` in `cf` to prevent concurrent
   // write or delete to the same `key`.
-  PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
-    return expected_state_manager_->PrepareDelete(cf, key, prepared);
+  PendingExpectedValue PrepareDelete(int cf, int64_t key) {
+    return expected_state_manager_->PrepareDelete(cf, key);
   }
 
   // Requires external locking covering `key` in `cf` to prevent concurrent
   // write or delete to the same `key`.
-  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
-                                           bool* prepared) {
-    return expected_state_manager_->PrepareSingleDelete(cf, key, prepared);
+  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
+    return expected_state_manager_->PrepareSingleDelete(cf, key);
   }
 
   // Requires external locking covering keys in `[begin_key, end_key)` in `cf`
   // to prevent concurrent write or delete to the same `key`.
   std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, int64_t begin_key,
-                                                       int64_t end_key,
-                                                       bool* prepared) {
-    return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key,
-                                                       prepared);
+                                                       int64_t end_key) {
+    return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key);
   }
 
   bool AllowsOverwrite(int64_t key) const {
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 9bbc9b24b8..b8ab0cc4f5 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -632,10 +632,8 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
     for (auto cfh : column_families_) {
       for (int64_t k = 0; k != number_of_keys; ++k) {
         const std::string key = Key(k);
-        bool prepare = false;
         PendingExpectedValue pending_expected_value =
-            shared->PreparePut(cf_idx, k, &prepare);
-        assert(prepare);
+            shared->PreparePut(cf_idx, k);
         const uint32_t value_base = pending_expected_value.GetFinalValueBase();
         const size_t sz = GenerateValue(value_base, value, sizeof(value));
 
@@ -3676,7 +3674,7 @@ void StressTest::Reopen(ThreadState* thread) {
   // crash-recovery verification does. Therefore it always expects no data loss
   // and we should ensure no data loss in testing.
  // TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below
-  if (!FLAGS_disable_wal && !FLAGS_avoid_flush_during_shutdown) {
+  if (!FLAGS_disable_wal && FLAGS_avoid_flush_during_shutdown) {
    Status s;
    if (FLAGS_manual_wal_flush_one_in > 0) {
      s = db_->FlushWAL(/*sync=*/true);
@@ -3834,6 +3832,10 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) {
       FLAGS_persist_user_defined_timestamps;
 }
 
+bool ShouldDisableAutoCompactionsBeforeVerifyDb() {
+  return !FLAGS_disable_auto_compactions && FLAGS_enable_compaction_filter;
+}
+
 bool InitializeOptionsFromFile(Options& options) {
   DBOptions db_options;
   ConfigOptions config_options;
@@ -3861,6 +3863,8 @@ void InitializeOptionsFromFlags(
     const std::shared_ptr<const FilterPolicy>& filter_policy,
     Options& options) {
   BlockBasedTableOptions block_based_options;
+  block_based_options.decouple_partitioned_filters =
+      FLAGS_decouple_partitioned_filters;
   block_based_options.block_cache = cache;
   block_based_options.cache_index_and_filter_blocks =
       FLAGS_cache_index_and_filter_blocks;
@@ -3947,7 +3951,11 @@ void InitializeOptionsFromFlags(
         new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache));
   }
   options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
-  options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+  if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
+    options.disable_auto_compactions = true;
+  } else {
+    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+  }
   options.max_background_compactions = FLAGS_max_background_compactions;
   options.max_background_flushes = FLAGS_max_background_flushes;
   options.compaction_style =
@@ -4047,6 +4055,7 @@ void InitializeOptionsFromFlags(
   options.memtable_protection_bytes_per_key =
       FLAGS_memtable_protection_bytes_per_key;
   options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
+  options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
 
   // Integrated BlobDB
   options.enable_blob_files = FLAGS_enable_blob_files;
@@ -4262,6 +4271,7 @@ void InitializeOptionsGeneral(
     options.disable_auto_compactions = true;
   }
 
+  options.table_properties_collector_factories.clear();
   options.table_properties_collector_factories.emplace_back(
       std::make_shared<DbStressTablePropertiesCollectorFactory>());
 
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index 31a1d2d622..cf6f174b8d 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -48,7 +48,11 @@ class StressTest {
     return FLAGS_sync_fault_injection || FLAGS_disable_wal ||
            FLAGS_manual_wal_flush_one_in > 0;
   }
-
+  Status EnableAutoCompaction() {
+    assert(options_.disable_auto_compactions);
+    Status s = db_->EnableAutoCompaction(column_families_);
+    return s;
+  }
   void CleanUp();
 
  protected:
@@ -64,6 +68,42 @@ class StressTest {
     }
   }
 
+  void UpdateIfInitialWriteFails(Env* db_stress_env, const Status& write_s,
+                                 Status* initial_write_s,
+                                 bool* initial_wal_write_may_succeed,
+                                 uint64_t* wait_for_recover_start_time) {
+    assert(db_stress_env && initial_write_s && initial_wal_write_may_succeed &&
+           wait_for_recover_start_time);
+    // Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
+    // first write fails
+    if (!write_s.ok() && (*initial_write_s).ok()) {
+      *initial_write_s = write_s;
+      *initial_wal_write_may_succeed =
+          !FaultInjectionTestFS::IsFailedToWriteToWALError(*initial_write_s);
+      *wait_for_recover_start_time = db_stress_env->NowMicros();
+    }
+  }
+
+  void PrintWriteRecoveryWaitTimeIfNeeded(Env* db_stress_env,
+                                          const Status& initial_write_s,
+                                          bool initial_wal_write_may_succeed,
+                                          uint64_t wait_for_recover_start_time,
+                                          const std::string& thread_name) {
+    assert(db_stress_env);
+    bool waited_for_recovery = !initial_write_s.ok() &&
+                               IsErrorInjectedAndRetryable(initial_write_s) &&
+                               initial_wal_write_may_succeed;
+    if (waited_for_recovery) {
+      uint64_t elapsed_sec =
+          (db_stress_env->NowMicros() - wait_for_recover_start_time) / 1000000;
+      if (elapsed_sec > 10) {
+        fprintf(stdout,
+                "%s thread slept to wait for write recovery for "
+                "%" PRIu64 " seconds\n",
+                thread_name.c_str(), elapsed_sec);
+      }
+    }
+  }
 
   void GetDeleteRangeKeyLocks(
       ThreadState* thread, int rand_column_family, int64_t rand_key,
      std::vector<std::unique_ptr<MutexLock>>* range_locks) {
@@ -411,5 +451,6 @@ void InitializeOptionsGeneral(
 // user-defined timestamp.
 void CheckAndSetOptionsForUserTimestamp(Options& options);
 
+bool ShouldDisableAutoCompactionsBeforeVerifyDb();
 }  // namespace ROCKSDB_NAMESPACE
 #endif  // GFLAGS
diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc
index b5db519672..22b31b23d1 100644
--- a/db_stress_tool/expected_state.cc
+++ b/db_stress_tool/expected_state.cc
@@ -32,41 +32,29 @@ void ExpectedState::Precommit(int cf, int64_t key, const ExpectedValue& value) {
   std::atomic_thread_fence(std::memory_order_release);
 }
 
-PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key,
-                                               bool* prepared) {
-  assert(prepared);
+PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key) {
   ExpectedValue expected_value = Load(cf, key);
 
   // Calculate the original expected value
   const ExpectedValue orig_expected_value = expected_value;
 
   // Calculate the pending expected value
-  bool res = expected_value.Put(true /* pending */);
-  if (!res) {
-    PendingExpectedValue ret = PendingExpectedValue(
-        &Value(cf, key), orig_expected_value, orig_expected_value);
-    *prepared = false;
-    return ret;
-  }
+  expected_value.Put(true /* pending */);
   const ExpectedValue pending_expected_value = expected_value;
 
   // Calculate the final expected value
-  res = expected_value.Put(false /* pending */);
-  assert(res);
+  expected_value.Put(false /* pending */);
   const ExpectedValue final_expected_value = expected_value;
 
   // Precommit
   Precommit(cf, key, pending_expected_value);
-  *prepared = true;
   return PendingExpectedValue(&Value(cf, key), orig_expected_value,
                               final_expected_value);
 }
 
 ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); }
 
-PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
-                                                  bool* prepared) {
-  assert(prepared);
+PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key) {
   ExpectedValue expected_value = Load(cf, key);
 
   // Calculate the original expected value
@@ -77,47 +65,32 @@ PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
   if (!res) {
     PendingExpectedValue ret = PendingExpectedValue(
         &Value(cf, key), orig_expected_value, orig_expected_value);
-    *prepared = false;
     return ret;
   }
   const ExpectedValue pending_expected_value = expected_value;
 
   // Calculate the final expected value
-  res = expected_value.Delete(false /* pending */);
-  assert(res);
+  expected_value.Delete(false /* pending */);
   const ExpectedValue final_expected_value = expected_value;
 
   // Precommit
   Precommit(cf, key, pending_expected_value);
-  *prepared = true;
   return PendingExpectedValue(&Value(cf, key), orig_expected_value,
                               final_expected_value);
 }
 
-PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key,
-                                                        bool* prepared) {
-  return PrepareDelete(cf, key, prepared);
+PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key) {
+  return PrepareDelete(cf, key);
 }
 
 std::vector<PendingExpectedValue> ExpectedState::PrepareDeleteRange(
-    int cf, int64_t begin_key, int64_t end_key, bool* prepared) {
+    int cf, int64_t begin_key, int64_t end_key) {
   std::vector<PendingExpectedValue> pending_expected_values;
-  bool has_prepared_failed = false;
   for (int64_t key = begin_key; key < end_key; ++key) {
-    bool each_prepared = false;
-    PendingExpectedValue pending_expected_value =
-        PrepareDelete(cf, key, &each_prepared);
-    if (each_prepared) {
-      pending_expected_values.push_back(pending_expected_value);
-    } else {
-      has_prepared_failed = true;
-      pending_expected_value.PermitUnclosedPendingState();
-      break;
-    }
+    pending_expected_values.push_back(PrepareDelete(cf, key));
   }
-  *prepared = !has_prepared_failed;
   return pending_expected_values;
 }
 
@@ -759,8 +732,31 @@ Status FileExpectedStateManager::Restore(DB* db) {
     s = Env::Default()->DeleteFile(state_file_path);
   }
   if (s.ok()) {
-    saved_seqno_ = kMaxSequenceNumber;
-    s = Env::Default()->DeleteFile(trace_file_path);
+    std::vector<std::string> expected_state_dir_children;
+    s = Env::Default()->GetChildren(expected_state_dir_path_,
+                                    &expected_state_dir_children);
+    if (s.ok()) {
+      for (size_t i = 0; i < expected_state_dir_children.size(); ++i) {
+        const auto& filename = expected_state_dir_children[i];
+        if (filename.size() >= kTraceFilenameSuffix.size() &&
+            filename.rfind(kTraceFilenameSuffix) ==
+                filename.size() - kTraceFilenameSuffix.size()) {
+          SequenceNumber found_seqno = ParseUint64(filename.substr(
+              0, filename.size() - kTraceFilenameSuffix.size()));
+          // Delete older trace files, but keep the one we just replayed for
+          // debugging purposes
+          if (found_seqno < saved_seqno_) {
+            s = Env::Default()->DeleteFile(GetPathForFilename(filename));
+          }
+        }
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (s.ok()) {
+      saved_seqno_ = kMaxSequenceNumber;
+    }
   }
   return s;
 }
diff --git a/db_stress_tool/expected_state.h b/db_stress_tool/expected_state.h
index 2d75622888..bab546fa42 100644
--- a/db_stress_tool/expected_state.h
+++ b/db_stress_tool/expected_state.h
@@ -44,7 +44,7 @@ class ExpectedState {
   //
   // Requires external locking covering `key` in `cf` to prevent concurrent
   // write or delete to the same `key`.
-  PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared);
+  PendingExpectedValue PreparePut(int cf, int64_t key);
 
   // Does not require external locking.
   ExpectedValue Get(int cf, int64_t key);
@@ -55,18 +55,17 @@ class ExpectedState {
   //
   // Requires external locking covering `key` in `cf` to prevent concurrent
   // write or delete to the same `key`.
-  PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared);
+  PendingExpectedValue PrepareDelete(int cf, int64_t key);
 
   // Requires external locking covering `key` in `cf` to prevent concurrent
   // write or delete to the same `key`.
-  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, bool* prepared);
+  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key);
 
   // Requires external locking covering keys in `[begin_key, end_key)` in `cf`
   // to prevent concurrent write or delete to the same `key`.
  std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, int64_t begin_key,
-                                                       int64_t end_key,
-                                                       bool* prepared);
+                                                       int64_t end_key);
 
   // Update the expected value for start of an incomplete write or delete
   // operation on the key associated with this expected value
@@ -197,30 +196,28 @@ class ExpectedStateManager {
   void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
 
   // See ExpectedState::PreparePut()
-  PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
-    return latest_->PreparePut(cf, key, prepared);
+  PendingExpectedValue PreparePut(int cf, int64_t key) {
+    return latest_->PreparePut(cf, key);
   }
 
   // See ExpectedState::Get()
   ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); }
 
   // See ExpectedState::PrepareDelete()
-  PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
-    return latest_->PrepareDelete(cf, key, prepared);
+  PendingExpectedValue PrepareDelete(int cf, int64_t key) {
+    return latest_->PrepareDelete(cf, key);
   }
 
   // See ExpectedState::PrepareSingleDelete()
-  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
-                                           bool* prepared) {
-    return latest_->PrepareSingleDelete(cf, key, prepared);
+  PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
+    return latest_->PrepareSingleDelete(cf, key);
   }
 
   // See ExpectedState::PrepareDeleteRange()
   std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, int64_t begin_key,
-                                                       int64_t end_key,
-                                                       bool* prepared) {
-    return latest_->PrepareDeleteRange(cf, begin_key, end_key, prepared);
+                                                       int64_t end_key) {
+    return latest_->PrepareDeleteRange(cf, begin_key, end_key);
   }
 
   // See ExpectedState::Exists()
diff --git a/db_stress_tool/expected_value.cc b/db_stress_tool/expected_value.cc
index 8ac95f2341..7cbcebabf2 100644
--- a/db_stress_tool/expected_value.cc
+++ b/db_stress_tool/expected_value.cc
@@ -10,11 +10,7 @@
 #include <atomic>
 
 namespace ROCKSDB_NAMESPACE {
-bool ExpectedValue::Put(bool pending) {
-  if (pending && (PendingWrite() || PendingDelete())) {
-    return false;
-  }
-
+void ExpectedValue::Put(bool pending) {
   if (pending) {
     SetPendingWrite();
   } else {
@@ -22,15 +18,10 @@
     ClearDeleted();
     ClearPendingWrite();
   }
-  return true;
 }
 
 bool ExpectedValue::Delete(bool pending) {
-  if (pending && (PendingWrite() || PendingDelete())) {
-    return false;
-  }
-
-  if (!Exists()) {
+  if (pending && !Exists()) {
     return false;
   }
   if (pending) {
diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h
index 36047947dd..428c389cb6 100644
--- a/db_stress_tool/expected_value.h
+++ b/db_stress_tool/expected_value.h
@@ -37,11 +37,14 @@ class ExpectedValue {
   explicit ExpectedValue(uint32_t expected_value)
       : expected_value_(expected_value) {}
 
-  bool Exists() const { return PendingWrite() || !IsDeleted(); }
+  bool Exists() const {
+    assert(!PendingWrite() && !PendingDelete());
+    return !IsDeleted();
+  }
 
   uint32_t Read() const { return expected_value_; }
 
-  bool Put(bool pending);
+  void Put(bool pending);
 
   bool Delete(bool pending);
 
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 262d303640..1e628d7d2f 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -1619,28 +1619,21 @@ class NonBatchedOpsStressTest : public StressTest {
     // write
     bool initial_wal_write_may_succeed = true;
 
-    bool prepared = false;
     PendingExpectedValue pending_expected_value =
-        shared->PreparePut(rand_column_family, rand_key, &prepared);
-    if (!prepared) {
-      pending_expected_value.PermitUnclosedPendingState();
-      return s;
-    }
+        shared->PreparePut(rand_column_family, rand_key);
 
     const uint32_t value_base = pending_expected_value.GetFinalValueBase();
     const size_t sz = GenerateValue(value_base, value, sizeof(value));
     const Slice v(value, sz);
 
+    uint64_t wait_for_recover_start_time = 0;
     do {
       // In order to commit the expected state for the initial write failed with
       // injected retryable error and successful WAL write, retry the write
       // until it succeeds after the recovery finishes
       if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
           initial_wal_write_may_succeed) {
-        lock.reset();
         std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
-        lock.reset(new MutexLock(
-            shared->GetMutexForKey(rand_column_family, rand_key)));
       }
       if (FLAGS_use_put_entity_one_in > 0 &&
           (value_base % FLAGS_use_put_entity_one_in) == 0) {
@@ -1691,13 +1684,10 @@ class NonBatchedOpsStressTest : public StressTest {
           });
         }
       }
-      // Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
-      // first write fails
-      if (!s.ok() && initial_write_s.ok()) {
-        initial_write_s = s;
-        initial_wal_write_may_succeed =
-            !FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
-      }
+      UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
+                                &initial_wal_write_may_succeed,
+                                &wait_for_recover_start_time);
+
     } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
              initial_wal_write_may_succeed);
 
@@ -1719,6 +1709,9 @@ class NonBatchedOpsStressTest : public StressTest {
         thread->shared->SafeTerminate();
       }
     } else {
+      PrintWriteRecoveryWaitTimeIfNeeded(
+          db_stress_env, initial_write_s, initial_wal_write_may_succeed,
+          wait_for_recover_start_time, "TestPut");
       pending_expected_value.Commit();
      thread->stats.AddBytesForWrites(1, sz);
      PrintKeyValue(rand_column_family, static_cast<uint64_t>(rand_key), value,
@@ -1756,25 +1749,18 @@ class NonBatchedOpsStressTest : public StressTest {
     // Use delete if the key may be overwritten and a single deletion
     // otherwise.
    if (shared->AllowsOverwrite(rand_key)) {
-      bool prepared = false;
       PendingExpectedValue pending_expected_value =
-          shared->PrepareDelete(rand_column_family, rand_key, &prepared);
-      if (!prepared) {
-        pending_expected_value.PermitUnclosedPendingState();
-        return s;
-      }
+          shared->PrepareDelete(rand_column_family, rand_key);
 
+      uint64_t wait_for_recover_start_time = 0;
       do {
         // In order to commit the expected state for the initial write failed
         // with injected retryable error and successful WAL write, retry the
         // write until it succeeds after the recovery finishes
         if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
             initial_wal_write_may_succeed) {
-          lock.reset();
           std::this_thread::sleep_for(
               std::chrono::microseconds(1 * 1000 * 1000));
-          lock.reset(new MutexLock(
-              shared->GetMutexForKey(rand_column_family, rand_key)));
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
@@ -1787,13 +1773,9 @@ class NonBatchedOpsStressTest : public StressTest {
             return txn.Delete(cfh, key);
           });
         }
-        // Only update `initial_write_s`, `initial_wal_write_may_succeed` when
-        // the first write fails
-        if (!s.ok() && initial_write_s.ok()) {
-          initial_write_s = s;
-          initial_wal_write_may_succeed =
-              !FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
-        }
+        UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
+                                  &initial_wal_write_may_succeed,
+                                  &wait_for_recover_start_time);
       } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
                initial_wal_write_may_succeed);
 
@@ -1816,29 +1798,25 @@ class NonBatchedOpsStressTest : public StressTest {
           thread->shared->SafeTerminate();
         }
       } else {
+        PrintWriteRecoveryWaitTimeIfNeeded(
+            db_stress_env, initial_write_s, initial_wal_write_may_succeed,
+            wait_for_recover_start_time, "TestDelete");
         pending_expected_value.Commit();
         thread->stats.AddDeletes(1);
       }
     } else {
-      bool prepared = false;
       PendingExpectedValue pending_expected_value =
-          shared->PrepareSingleDelete(rand_column_family, rand_key, &prepared);
-      if (!prepared) {
-        pending_expected_value.PermitUnclosedPendingState();
-        return s;
-      }
+          shared->PrepareSingleDelete(rand_column_family, rand_key);
 
+      uint64_t wait_for_recover_start_time = 0;
       do {
         // In order to commit the expected state for the initial write failed
         // with injected retryable error and successful WAL write, retry the
         // write until it succeeds after the recovery finishes
         if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
             initial_wal_write_may_succeed) {
-          lock.reset();
           std::this_thread::sleep_for(
               std::chrono::microseconds(1 * 1000 * 1000));
-          lock.reset(new MutexLock(
-              shared->GetMutexForKey(rand_column_family, rand_key)));
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
@@ -1851,13 +1829,9 @@ class NonBatchedOpsStressTest : public StressTest {
             return txn.SingleDelete(cfh, key);
           });
         }
-        // Only update `initial_write_s`, `initial_wal_write_may_succeed` when
-        // the first write fails
-        if (!s.ok() && initial_write_s.ok()) {
-          initial_write_s = s;
-          initial_wal_write_may_succeed =
-              !FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
-        }
+        UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
+                                  &initial_wal_write_may_succeed,
+                                  &wait_for_recover_start_time);
       } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
               initial_wal_write_may_succeed);
 
@@ -1880,6 +1854,9 @@ class NonBatchedOpsStressTest : public StressTest {
           thread->shared->SafeTerminate();
         }
       } else {
+        PrintWriteRecoveryWaitTimeIfNeeded(
+            db_stress_env, initial_write_s, initial_wal_write_may_succeed,
+            wait_for_recover_start_time, "TestDelete");
        pending_expected_value.Commit();
         thread->stats.AddSingleDeletes(1);
       }
@@ -1914,18 +1891,9 @@ class NonBatchedOpsStressTest : public StressTest {
     // write
     bool initial_wal_write_may_succeed = true;
 
-    bool prepared = false;
     std::vector<PendingExpectedValue> pending_expected_values =
         shared->PrepareDeleteRange(rand_column_family, rand_key,
-                                   rand_key + FLAGS_range_deletion_width,
-                                   &prepared);
-    if (!prepared) {
-      for (PendingExpectedValue& pending_expected_value :
-           pending_expected_values) {
-        pending_expected_value.PermitUnclosedPendingState();
-      }
-      return s;
-    }
+                                   rand_key + FLAGS_range_deletion_width);
 
     const int covered = static_cast<int>(pending_expected_values.size());
     std::string keystr = Key(rand_key);
@@ -1935,6 +1903,7 @@ class NonBatchedOpsStressTest : public StressTest {
     Slice end_key = end_keystr;
     std::string write_ts_str;
     Slice write_ts;
+    uint64_t wait_for_recover_start_time = 0;
 
     do {
       // In order to commit the expected state for the initial write failed with
       // injected retryable error and successful WAL write, retry the write
       // until it succeeds after the recovery finishes
       if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
           initial_wal_write_may_succeed) {
-        range_locks.clear();
         std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
-        GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key,
-                               &range_locks);
       }
       if (FLAGS_user_timestamp_size) {
         write_ts_str = GetNowNanos();
@@ -1954,13 +1920,9 @@ class NonBatchedOpsStressTest : public StressTest {
       } else {
         s = db_->DeleteRange(write_opts, cfh, key, end_key);
       }
-      // Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
-      // first write fails
-      if (!s.ok() && initial_write_s.ok()) {
-        initial_write_s = s;
-        initial_wal_write_may_succeed =
-            !FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
-      }
+      UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
+                                &initial_wal_write_may_succeed,
+                                &wait_for_recover_start_time);
     } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
              initial_wal_write_may_succeed);
 
@@ -1985,6 +1947,9 @@ class NonBatchedOpsStressTest : public StressTest {
         thread->shared->SafeTerminate();
       }
     } else {
+      PrintWriteRecoveryWaitTimeIfNeeded(
+          db_stress_env, initial_write_s, initial_wal_write_may_succeed,
+          wait_for_recover_start_time, "TestDeleteRange");
       for (PendingExpectedValue& pending_expected_value :
            pending_expected_values) {
         pending_expected_value.Commit();
@@ -2057,16 +2022,8 @@ class NonBatchedOpsStressTest : public StressTest {
       }
       keys.push_back(key);
 
-      bool prepared = false;
       PendingExpectedValue pending_expected_value =
-          shared->PreparePut(column_family, key, &prepared);
-      if (!prepared) {
-        pending_expected_value.PermitUnclosedPendingState();
-        for (PendingExpectedValue& pev : pending_expected_values) {
-          pev.PermitUnclosedPendingState();
-        }
-        return;
-      }
+          shared->PreparePut(column_family, key);
 
       const uint32_t value_base = pending_expected_value.GetFinalValueBase();
       values.push_back(value_base);
@@ -2630,6 +2587,8 @@ class NonBatchedOpsStressTest : public StressTest {
         // Value doesn't exist in db, update state to reflect that
         shared->SyncDelete(cf, key);
         return true;
+      } else {
+        assert(false);
       }
     }
     char expected_value_data[kValueMaxLen];
@@ -2728,7 +2687,11 @@ class NonBatchedOpsStressTest : public StressTest {
     SharedState* const shared = thread->shared;
     assert(shared);
 
-    if (!shared->AllowsOverwrite(key) && shared->Exists(column_family, key)) {
+    const ExpectedValue expected_value =
+        thread->shared->Get(column_family, key);
+    bool may_exist =
+        !ExpectedValueHelper::MustHaveNotExisted(expected_value,
+                                                 expected_value);
+    if (!shared->AllowsOverwrite(key) && may_exist) {
       // Just do read your write checks for keys that allow overwrites.
       return;
     }
diff --git a/env/file_system.cc b/env/file_system.cc
index 27c7207f0f..1f02f7a7ee 100644
--- a/env/file_system.cc
+++ b/env/file_system.cc
@@ -181,10 +181,10 @@ FileOptions FileSystem::OptimizeForBlobFileRead(
 
 IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
                            const std::string& fname, bool should_sync,
-                           const IOOptions& io_options) {
+                           const IOOptions& io_options,
+                           const FileOptions& file_options) {
   std::unique_ptr<FSWritableFile> file;
-  EnvOptions soptions;
-  IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
+  IOStatus s = fs->NewWritableFile(fname, file_options, &file, nullptr);
   if (!s.ok()) {
     return s;
   }
diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc
index 203a326530..edb5e879d1 100644
--- a/file/delete_scheduler.cc
+++ b/file/delete_scheduler.cc
@@ -31,6 +31,7 @@ DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
       total_trash_size_(0),
       rate_bytes_per_sec_(rate_bytes_per_sec),
       pending_files_(0),
+      next_trash_bucket_(0),
       bytes_max_delete_chunk_(bytes_max_delete_chunk),
       closing_(false),
       cv_(&mu_),
@@ -66,10 +67,8 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
       total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) {
     // Rate limiting is disabled or trash size makes up more than
     // max_trash_db_ratio_ (default 25%) of the total DB size
-    TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
-    Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+    Status s = DeleteFileImmediately(file_path, /*accounted=*/true);
    if (s.ok()) {
-      s = sst_file_manager_->OnDeleteFile(file_path);
       ROCKS_LOG_INFO(info_log_,
                      "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
                      ", total_trash_size %" PRIu64 ", total_size %" PRIi64
                      ", max_trash_db_ratio %lf",
                      file_path.c_str(), rate_bytes_per_sec_.load(),
                      total_trash_size_.load(), total_size,
                      max_trash_db_ratio_.load());
-      InstrumentedMutexLock l(&mu_);
-      RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
     }
     return s;
   }
+  return AddFileToDeletionQueue(file_path, dir_to_sync, /*bucket=*/std::nullopt,
+                                /*accounted=*/true);
+}
 
+Status DeleteScheduler::DeleteUnaccountedFile(const std::string& file_path,
+                                              const std::string& dir_to_sync,
+                                              const bool force_bg,
+                                              std::optional<int32_t> bucket) {
+  uint64_t num_hard_links = 1;
+  fs_->NumFileLinks(file_path, IOOptions(), &num_hard_links, nullptr)
+      .PermitUncheckedError();
+
+  // We can tolerate rare races where we might immediately delete both links
+  // to a file.
+  if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && num_hard_links > 1)) {
+    Status s = DeleteFileImmediately(file_path, /*accounted=*/false);
+    if (s.ok()) {
+      ROCKS_LOG_INFO(info_log_,
+                     "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64,
+                     file_path.c_str(), rate_bytes_per_sec_.load());
+    }
+    return s;
+  }
+  return AddFileToDeletionQueue(file_path, dir_to_sync, bucket,
+                                /*accounted=*/false);
+}
+
+Status DeleteScheduler::DeleteFileImmediately(const std::string& file_path,
+                                              bool accounted) {
+  TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
+  TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteFile::cb",
+                           const_cast<std::string*>(&file_path));
+  Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+  if (s.ok()) {
+    s = OnDeleteFile(file_path, accounted);
+    InstrumentedMutexLock l(&mu_);
+    RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
+  }
+  return s;
+}
+
+Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
+                                               const std::string& dir_to_sync,
+                                               std::optional<int32_t> bucket,
+                                               bool accounted) {
   // Move file to trash
   std::string trash_file;
-  Status s = MarkAsTrash(file_path, &trash_file);
+  Status s = MarkAsTrash(file_path, accounted, &trash_file);
   ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
                  s.ToString().c_str());
 
@@ -94,7 +135,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
                    file_path.c_str(), s.ToString().c_str());
    s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
    if (s.ok()) {
-      s = sst_file_manager_->OnDeleteFile(file_path);
+      s = OnDeleteFile(file_path, accounted);
      ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
                     trash_file.c_str());
      InstrumentedMutexLock l(&mu_);
@@ -104,11 +145,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
   }
 
   // Update the total trash size
-  uint64_t trash_file_size = 0;
-  IOStatus io_s =
-      fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
-  if (io_s.ok()) {
-    total_trash_size_.fetch_add(trash_file_size);
+  if (accounted) {
+    uint64_t trash_file_size = 0;
+    IOStatus io_s =
+        fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
+    if (io_s.ok()) {
+      total_trash_size_.fetch_add(trash_file_size);
+    }
   }
   //**TODO: What should we do if we failed to
   // get the file size?
@@ -117,8 +160,15 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
   {
     InstrumentedMutexLock l(&mu_);
     RecordTick(stats_.get(), FILES_MARKED_TRASH);
-    queue_.emplace(trash_file, dir_to_sync);
+    queue_.emplace(trash_file, dir_to_sync, accounted, bucket);
     pending_files_++;
+    if (bucket.has_value()) {
+      auto iter = pending_files_in_buckets_.find(bucket.value());
+      assert(iter != pending_files_in_buckets_.end());
+      if (iter != pending_files_in_buckets_.end()) {
+        iter->second++;
+      }
+    }
     if (pending_files_ == 1) {
       cv_.SignalAll();
     }
@@ -177,7 +227,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
 }
 
 Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
-                                    std::string* trash_file) {
+                                    bool accounted, std::string* trash_file) {
   // Sanity check of the path
   size_t idx = file_path.rfind('/');
   if (idx == std::string::npos || idx == file_path.size() - 1) {
@@ -211,7 +261,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
     }
     cnt++;
   }
-  if (s.ok()) {
+  if (s.ok() && accounted) {
     s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
   }
   return s;
@@ -235,6 +285,8 @@ void DeleteScheduler::BackgroundEmptyTrash() {
       uint64_t total_deleted_bytes = 0;
       int64_t current_delete_rate = rate_bytes_per_sec_.load();
       while (!queue_.empty() && !closing_) {
+        // Satisfy static analysis.
+        std::optional<int32_t> bucket = std::nullopt;
         if (current_delete_rate != rate_bytes_per_sec_.load()) {
           // User changed the delete rate
           current_delete_rate = rate_bytes_per_sec_.load();
@@ -247,14 +299,17 @@ void DeleteScheduler::BackgroundEmptyTrash() {
         // Get new file to delete
         const FileAndDir& fad = queue_.front();
         std::string path_in_trash = fad.fname;
+        std::string dir_to_sync = fad.dir;
+        bool accounted = fad.accounted;
+        bucket = fad.bucket;
 
         // We don't need to hold the lock while deleting the file
         mu_.Unlock();
         uint64_t deleted_bytes = 0;
         bool is_complete = true;
         // Delete file from trash and update total_penalty value
-        Status s =
-            DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+        Status s = DeleteTrashFile(path_in_trash, dir_to_sync, accounted,
+                                   &deleted_bytes, &is_complete);
         total_deleted_bytes += deleted_bytes;
         mu_.Lock();
         if (is_complete) {
@@ -288,12 +343,20 @@ void DeleteScheduler::BackgroundEmptyTrash() {
         TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
                                  &total_penalty);
 
+        int32_t pending_files_in_bucket = std::numeric_limits<int32_t>::max();
         if (is_complete) {
           pending_files_--;
+          if (bucket.has_value()) {
+            auto iter = pending_files_in_buckets_.find(bucket.value());
+            assert(iter != pending_files_in_buckets_.end());
+            if (iter != pending_files_in_buckets_.end()) {
+              pending_files_in_bucket = --iter->second;
+            }
+          }
         }
-        if (pending_files_ == 0) {
-          // Unblock WaitForEmptyTrash since there are no more files waiting
-          // to be deleted
+        if (pending_files_ == 0 || pending_files_in_bucket == 0) {
+          // Unblock WaitForEmptyTrash or WaitForEmptyTrashBucket since there are
+          // no more files waiting to be deleted
           cv_.SignalAll();
         }
       }
@@ -302,12 +365,14 @@ void DeleteScheduler::BackgroundEmptyTrash() {
 
 Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
                                         const std::string& dir_to_sync,
-                                        uint64_t* deleted_bytes,
+                                        bool accounted, uint64_t* deleted_bytes,
                                         bool* is_complete) {
   uint64_t file_size;
   Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
   *is_complete = true;
   TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteTrashFile::cb", + const_cast(&path_in_trash)); if (s.ok()) { bool need_full_delete = true; if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) { @@ -374,7 +439,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, } if (s.ok()) { *deleted_bytes = file_size; - s = sst_file_manager_->OnDeleteFile(path_in_trash); + s = OnDeleteFile(path_in_trash, accounted); } } } @@ -384,12 +449,24 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, path_in_trash.c_str(), s.ToString().c_str()); *deleted_bytes = 0; } else { - total_trash_size_.fetch_sub(*deleted_bytes); + if (accounted) { + total_trash_size_.fetch_sub(*deleted_bytes); + } } return s; } +Status DeleteScheduler::OnDeleteFile(const std::string& file_path, + bool accounted) { + if (accounted) { + return sst_file_manager_->OnDeleteFile(file_path); + } + TEST_SYNC_POINT_CALLBACK("DeleteScheduler::OnDeleteFile", + const_cast(&file_path)); + return Status::OK(); +} + void DeleteScheduler::WaitForEmptyTrash() { InstrumentedMutexLock l(&mu_); while (pending_files_ > 0 && !closing_) { @@ -397,6 +474,30 @@ void DeleteScheduler::WaitForEmptyTrash() { } } +std::optional DeleteScheduler::NewTrashBucket() { + if (rate_bytes_per_sec_.load() <= 0) { + return std::nullopt; + } + InstrumentedMutexLock l(&mu_); + int32_t bucket_number = next_trash_bucket_++; + pending_files_in_buckets_.emplace(bucket_number, 0); + return bucket_number; +} + +void DeleteScheduler::WaitForEmptyTrashBucket(int32_t bucket) { + InstrumentedMutexLock l(&mu_); + if (bucket >= next_trash_bucket_) { + return; + } + auto iter = pending_files_in_buckets_.find(bucket); + while (iter != pending_files_in_buckets_.end() && iter->second > 0 && + !closing_) { + cv_.Wait(); + iter = pending_files_in_buckets_.find(bucket); + } + pending_files_in_buckets_.erase(bucket); +} + void DeleteScheduler::MaybeCreateBackgroundThread() { if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) { bg_thread_.reset( diff --git a/file/delete_scheduler.h b/file/delete_scheduler.h index da3735aed8..6aa340cf85 100644 --- a/file/delete_scheduler.h +++ b/file/delete_scheduler.h @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -48,16 +49,45 @@ class DeleteScheduler { MaybeCreateBackgroundThread(); } - // Mark file as trash directory and schedule its deletion. If force_bg is - // set, it forces the file to always be deleted in the background thread, - // except when rate limiting is disabled + // Delete an accounted file that is tracked by `SstFileManager` and should be + // tracked by this `DeleteScheduler` when it's deleted. + // The file is deleted immediately if slow deletion is disabled. If force_bg + // is not set and trash to db size ratio exceeded the configured threshold, + // it is immediately deleted too. In all other cases, the file will be moved + // to a trash directory and scheduled for deletion by a background thread. Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, const bool force_bg = false); - // Wait for all files being deleteing in the background to finish or for + // Delete an unaccounted file that is not tracked by `SstFileManager` and + // should not be tracked by this `DeleteScheduler` when it's deleted. + // The file is deleted immediately if slow deletion is disabled. If force_bg + // is not set and the file have more than 1 hard link, it is immediately + // deleted too. 
+  // deleted too. In all other cases, the file will be moved to a trash
+  // directory and scheduled for deletion by a background thread.
+  // This API also supports assigning a file to a specified bucket created by
+  // `NewTrashBucket` when deleting files in the background. So the caller can
+  // wait for a specific bucket to be empty by calling the
+  // `WaitForEmptyTrashBucket` API.
+  Status DeleteUnaccountedFile(const std::string& file_path,
+                               const std::string& dir_to_sync,
+                               const bool force_bg = false,
+                               std::optional<int32_t> bucket = std::nullopt);
+
+  // Wait for all files being deleted in the background to finish or for
   // destructor to be called.
   void WaitForEmptyTrash();
 
+  // Creates a new trash bucket. A bucket is only created and returned when slow
+  // deletion is enabled.
+  // For each bucket that is created, the user should also call
+  // `WaitForEmptyTrashBucket` after scheduling file deletions to make sure the
+  // trash files are all cleared.
+  std::optional<int32_t> NewTrashBucket();
+
+  // Wait for all the files in the specified bucket to be deleted in the
+  // background or for the destructor to be called.
+  void WaitForEmptyTrashBucket(int32_t bucket);
+
   // Return a map containing errors that happened in BackgroundEmptyTrash
   // file_path => error status
   std::map<std::string, Status> GetBackgroundErrors();
@@ -87,12 +117,21 @@ class DeleteScheduler {
   }
 
  private:
-  Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+  Status DeleteFileImmediately(const std::string& file_path, bool accounted);
+
+  Status AddFileToDeletionQueue(const std::string& file_path,
+                                const std::string& dir_to_sync,
+                                std::optional<int32_t> bucket, bool accounted);
+
+  Status MarkAsTrash(const std::string& file_path, bool accounted,
+                     std::string* path_in_trash);
 
   Status DeleteTrashFile(const std::string& path_in_trash,
-                         const std::string& dir_to_sync,
+                         const std::string& dir_to_sync, bool accounted,
                          uint64_t* deleted_bytes, bool* is_complete);
 
+  Status OnDeleteFile(const std::string& file_path, bool accounted);
+
   void BackgroundEmptyTrash();
 
   void MaybeCreateBackgroundThread();
@@ -104,19 +143,28 @@ class DeleteScheduler {
   std::atomic<uint64_t> total_trash_size_;
   // Maximum number of bytes that should be deleted per second
   std::atomic<int64_t> rate_bytes_per_sec_;
-  // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
+  // Mutex to protect queue_, pending_files_, next_trash_bucket_,
+  // pending_files_in_buckets_, bg_errors_, closing_, stats_
   InstrumentedMutex mu_;
 
   struct FileAndDir {
-    FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+    FileAndDir(const std::string& _fname, const std::string& _dir,
+               bool _accounted, std::optional<int32_t> _bucket)
+        : fname(_fname), dir(_dir), accounted(_accounted), bucket(_bucket) {}
     std::string fname;
     std::string dir;  // empty will be skipped.
+    bool accounted;
+    std::optional<int32_t> bucket;
   };
 
   // Queue of trash files that need to be deleted
   std::queue<FileAndDir> queue_;
   // Number of trash files that are waiting to be deleted
   int32_t pending_files_;
+  // Next trash bucket that can be created
+  int32_t next_trash_bucket_;
+  // A mapping from trash bucket to number of pending files in the bucket
+  std::map<int32_t, int32_t> pending_files_in_buckets_;
   uint64_t bytes_max_delete_chunk_;
   // Errors that happened in BackgroundEmptyTrash (file_path => error)
   std::map<std::string, Status> bg_errors_;
@@ -127,6 +175,7 @@ class DeleteScheduler {
   // Condition variable signaled in these conditions
   //    - pending_files_ value change from 0 => 1
   //    - pending_files_ value change from 1 => 0
+  //    - a value in pending_files_in_buckets change from 1 => 0
   //    - closing_ value is set to true
   InstrumentedCondVar cv_;
   // Background thread running BackgroundEmptyTrash
@@ -138,6 +187,10 @@ class DeleteScheduler {
   // If the trash size constitutes for more than this fraction of the total DB
   // size we will start deleting new files passed to DeleteScheduler
   // immediately
+  // Unaccounted files passed for deletion will not change total_trash_size_ or
+  // affect the DeleteScheduler::total_trash_size_ over
+  // SstFileManager::total_size_ ratio. Their slow deletion is not subject to
+  // this configured threshold either.
   std::atomic<double> max_trash_db_ratio_;
   static const uint64_t kMicrosInSecond = 1000 * 1000LL;
   std::shared_ptr<Statistics> stats_;
diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc
index 25d9f1acd8..6f0cff20ce 100644
--- a/file/delete_scheduler_test.cc
+++ b/file/delete_scheduler_test.cc
@@ -78,7 +78,7 @@ class DeleteSchedulerTest : public testing::Test {
   }
 
   std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
-                           size_t dummy_files_dirs_idx = 0) {
+                           size_t dummy_files_dirs_idx = 0, bool track = true) {
     std::string file_path =
        dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
     std::unique_ptr<WritableFile> f;
     std::string data(size, 'A');
     EXPECT_OK(f->Append(data));
     EXPECT_OK(f->Close());
-    EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
+    if (track) {
+      EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
+    }
     return file_path;
   }
 
@@ -353,6 +355,8 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
   ASSERT_EQ(num_files,
             stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
 
+  ASSERT_FALSE(delete_scheduler_->NewTrashBucket().has_value());
+
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
@@ -718,6 +722,141 @@ TEST_F(DeleteSchedulerTest, IsTrashCheck) {
   ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
 }
 
+TEST_F(DeleteSchedulerTest, DeleteAccountedAndUnaccountedFiles) {
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
+  NewDeleteScheduler();
+
+  // Create 100 files, every file is 1 KB
+  int num_files = 100;        // 100 files
+  uint64_t file_size = 1024;  // 1 KB as a file size
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file" + std::to_string(i) + ".data";
+    generated_files.push_back(NewDummyFile(file_name, file_size,
+                                           /*dummy_files_dirs_idx*/ 0,
+                                           /*track=*/false));
+  }
+
+  for (int i = 0; i < num_files; i++) {
+    if (i % 2) {
+      ASSERT_OK(sst_file_mgr_->OnAddFile(generated_files[i], file_size));
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+    } else {
+      ASSERT_OK(
+          delete_scheduler_->DeleteUnaccountedFile(generated_files[i], ""));
+    }
+  }
+
+  delete_scheduler_->WaitForEmptyTrash();
+  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
+  ASSERT_EQ(0, sst_file_mgr_->GetTotalSize());
+}
+
+TEST_F(DeleteSchedulerTest, ConcurrentlyDeleteUnaccountedFilesInBuckets) {
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
+  NewDeleteScheduler();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Create 1000 files, every file is 1 KB
+  int num_files = 1000;
+  uint64_t file_size = 1024;  // 1 KB as a file size
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file" + std::to_string(i) + ".data";
+    generated_files.push_back(NewDummyFile(file_name, file_size,
+                                           /*dummy_files_dirs_idx*/ 0,
+                                           /*track=*/false));
+  }
+  // Concurrently delete files in different buckets and check all the buckets
+  // are empty.
+  int thread_cnt = 10;
+  int files_per_thread = 100;
+  std::atomic<int> thread_num(0);
+  std::vector<port::Thread> threads;
+  std::function<void()> delete_thread = [&]() {
+    std::optional<int32_t> bucket = delete_scheduler_->NewTrashBucket();
+    ASSERT_TRUE(bucket.has_value());
+    int idx = thread_num.fetch_add(1);
+    int range_start = idx * files_per_thread;
+    int range_end = range_start + files_per_thread;
+    for (int j = range_start; j < range_end; j++) {
+      ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(
+          generated_files[j], "", /*force_bg=*/false, bucket));
+    }
+    delete_scheduler_->WaitForEmptyTrashBucket(bucket.value());
+  };
+
+  for (int i = 0; i < thread_cnt; i++) {
+    threads.emplace_back(delete_thread);
+  }
+
+  for (size_t i = 0; i < threads.size(); i++) {
+    threads[i].join();
+  }
+
+  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
+  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+  ASSERT_EQ(1000, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(0, fg_delete_file);
+  ASSERT_EQ(1000, bg_delete_file);
+
+  // OK to re-check an already empty bucket
+  delete_scheduler_->WaitForEmptyTrashBucket(9);
+  // An invalid bucket returns immediately too.
+  delete_scheduler_->WaitForEmptyTrashBucket(100);
+  std::optional<int32_t> next_bucket = delete_scheduler_->NewTrashBucket();
+  ASSERT_TRUE(next_bucket.has_value());
+  ASSERT_EQ(10, next_bucket.value());
+  delete_scheduler_->WaitForEmptyTrashBucket(10);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest,
+       ImmediatelyDeleteUnaccountedFilesWithRemainingLinks) {
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  std::string file1 = NewDummyFile("data_1", 500 * 1024,
+                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);
+  std::string file2 = NewDummyFile("data_2", 100 * 1024,
+                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);
+
+  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Would be deleted in 4 batches if there were no hard links
+  ASSERT_OK(
+      delete_scheduler_->DeleteUnaccountedFile(file1, "", /*force_bg=*/false));
+  ASSERT_OK(
+      delete_scheduler_->DeleteUnaccountedFile(file2, "", /*force_bg=*/false));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
+  ASSERT_EQ(0, bg_delete_file);
+  ASSERT_EQ(2, fg_delete_file);
+  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(2, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/file/file_util.cc b/file/file_util.cc
index 1e1a5d0be0..105e886902 100644
--- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -125,8 +125,8 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination,
 Status DeleteDBFile(const ImmutableDBOptions* db_options,
                     const std::string& fname, const std::string& dir_to_sync,
                     const bool force_bg, const bool force_fg) {
-  SstFileManagerImpl* sfm =
-      static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
+  SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
+      db_options->sst_file_manager.get());
   if (sfm && !force_fg) {
     return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
   } else {
@@ -134,6 +134,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
   }
 }
 
+Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
+                               const std::string& fname,
+                               const std::string& dir_to_sync,
+                               const bool force_bg, const bool force_fg,
+                               std::optional<int32_t> bucket) {
+  SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
+      db_options->sst_file_manager.get());
+  if (sfm && !force_fg) {
+    return sfm->ScheduleUnaccountedFileDeletion(fname, dir_to_sync, force_bg,
+                                                bucket);
+  } else {
+    return db_options->env->DeleteFile(fname);
+  }
+}
+
 // requested_checksum_func_name brings the function name of the checksum
 // generator in checksum_factory. Empty string is permitted, in which case the
 // name of the generator created by the factory is unchecked. When
diff --git a/file/file_util.h b/file/file_util.h index af6106cf12..8a72fea27a 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -55,6 +55,16 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync, const bool force_bg, const bool force_fg); +// Delete an unaccounted DB file that is not tracked by SstFileManager and will +// not be tracked by its DeleteScheduler when getting deleted. +// If a legitimate bucket is provided and this file is scheduled for slow +// deletion, it will be assigned to the specified trash bucket. +Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, + const std::string& dir_to_sync, + const bool force_bg, const bool force_fg, + std::optional<int32_t> bucket); + // TODO(hx235): pass the whole DBOptions instead of its individual fields IOStatus GenerateOneFileChecksum( FileSystem* fs, const std::string& file_path, diff --git a/file/filename.cc b/file/filename.cc index b34a0e113e..45cbf9d76a 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, const std::string& dbname, uint64_t descriptor_number, + Temperature temp, FSDirectory* dir_contains_current_file) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); @@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, std::string tmp = TempFileName(dbname, descriptor_number); IOOptions opts; IOStatus s = PrepareIOFromWriteOptions(write_options, opts); + FileOptions file_opts; + file_opts.temperature = temp; if (s.ok()) { - s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts); + s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts, + file_opts); } TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { @@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, } Status SetIdentityFile(const WriteOptions& write_options, Env* env, - const std::string& dbname, const std::string& db_id) { + const std::string& dbname, Temperature temp, + const std::string& db_id) { std::string id; if (db_id.empty()) { id = env->GenerateUniqueId(); @@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env, Status s; IOOptions opts; s = PrepareIOFromWriteOptions(write_options, opts); + FileOptions file_opts; + file_opts.temperature = temp; if (s.ok()) { - s = WriteStringToFile(env, id, tmp, true, &opts); + s = WriteStringToFile(env->GetFileSystem().get(), id, tmp, + /*should_sync=*/true, opts, file_opts); } if (s.ok()) { s = env->RenameFile(tmp, identify_file_name); diff --git a/file/filename.h b/file/filename.h index 56bbd78d55..5a52c745ac 100644 --- a/file/filename.h +++ b/file/filename.h @@ -161,11 +161,12 @@ bool ParseFileName(const std::string& filename, uint64_t* number, // when IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, const std::string& dbname, uint64_t descriptor_number, + Temperature temp, FSDirectory* dir_contains_current_file); // Make the IDENTITY file for the db Status SetIdentityFile(const WriteOptions& write_options, Env* env, - const std::string& dbname, + const std::string& dbname, Temperature temp, const std::string& db_id = {}); // Sync manifest file `file`.
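The Temperature parameter threaded through SetCurrentFile() and SetIdentityFile() above is fed from the new DBOptions fields introduced later in this diff. A configuration sketch, assuming a plugged-in FileSystem that actually acts on temperature hints (the default FileSystem treats them as a no-op):

    // Sketch: route metadata files (CURRENT, IDENTITY, MANIFEST, OPTIONS)
    // and WAL files to different storage tiers via temperature hints.
    Options options;
    options.create_if_missing = true;
    options.metadata_write_temperature = Temperature::kCold;
    options.wal_write_temperature = Temperature::kHot;
    DB* db = nullptr;
    Status s = DB::Open(options, "/path/to/db", &db);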
diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 459ea36cdb..68c74424a2 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -421,10 +421,28 @@ Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path, return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg); } +Status SstFileManagerImpl::ScheduleUnaccountedFileDeletion( + const std::string& file_path, const std::string& dir_to_sync, + const bool force_bg, std::optional<int32_t> bucket) { + TEST_SYNC_POINT_CALLBACK( + "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", + const_cast<std::string*>(&file_path)); + return delete_scheduler_.DeleteUnaccountedFile(file_path, dir_to_sync, + force_bg, bucket); +} + void SstFileManagerImpl::WaitForEmptyTrash() { delete_scheduler_.WaitForEmptyTrash(); } +std::optional<int32_t> SstFileManagerImpl::NewTrashBucket() { + return delete_scheduler_.NewTrashBucket(); +} + +void SstFileManagerImpl::WaitForEmptyTrashBucket(int32_t bucket) { + delete_scheduler_.WaitForEmptyTrashBucket(bucket); +} + void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, uint64_t file_size) { auto tracked_file = tracked_files_.find(file_path); diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index a45663f8f1..47a2b5935a 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -5,7 +5,7 @@ #pragma once - +#include <optional> #include <string> #include "db/compaction/compaction.h" @@ -118,17 +118,40 @@ class SstFileManagerImpl : public SstFileManager { // not guaranteed bool CancelErrorRecovery(ErrorHandler* db); - // Mark file as trash and schedule it's deletion. If force_bg is set, it + // Mark a file as trash and schedule its deletion. If force_bg is set, it // forces the file to be deleted in the background regardless of DB size, - // except when rate limited delete is disabled + // except when rate limited delete is disabled. virtual Status ScheduleFileDeletion(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg = false); - // Wait for all files being deleteing in the background to finish or for + // Delete an unaccounted file. The file is deleted immediately if slow + // deletion is disabled. A file with more than one hard link will be deleted + // immediately unless force_bg is set. In other cases, files will be scheduled + // for slow deletion, and assigned to the specified bucket if a legitimate one + // is provided. A legitimate bucket is one that is created with the + // `NewTrashBucket` API, and for which `WaitForEmptyTrashBucket` hasn't been + // called yet. + virtual Status ScheduleUnaccountedFileDeletion( + const std::string& file_path, const std::string& dir_to_sync, + const bool force_bg = false, + std::optional<int32_t> bucket = std::nullopt); + + // Wait for all files being deleted in the background to finish or for // destructor to be called. virtual void WaitForEmptyTrash(); + // Creates a new trash bucket. A legitimate bucket is only created and + // returned when slow deletion is enabled. + // For each bucket that is created and used, the user should also call + // `WaitForEmptyTrashBucket` after scheduling file deletions to make sure all + // the trash files are cleared. + std::optional<int32_t> NewTrashBucket(); + + // Wait for all the files in the specified bucket to be deleted in the + // background or for destructor to be called.
+ virtual void WaitForEmptyTrashBucket(int32_t bucket); + DeleteScheduler* delete_scheduler() { return &delete_scheduler_; } // Stop the error recovery background thread. This should be called only diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index cbe1eb52fc..309d0c510a 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -61,18 +61,6 @@ enum CompactionPri : char { kRoundRobin = 0x4, }; -// Temperature of a file. Used to pass to FileSystem for a different -// placement and/or coding. -// Reserve some numbers in the middle, in case we need to insert new tier -// there. -enum class Temperature : uint8_t { - kUnknown = 0, - kHot = 0x04, - kWarm = 0x08, - kCold = 0x0C, - kLastTemperature, -}; - struct FileTemperatureAge { Temperature temperature = Temperature::kUnknown; uint64_t age = 0; @@ -813,7 +801,7 @@ struct AdvancedColumnFamilyOptions { // If this option is set, when creating the last level files, pass this // temperature to FileSystem used. Should be no-op for default FileSystem // and users need to plug in their own FileSystem to take advantage of it. - // When using FIFO compaction, this option is ignored. + // Currently only compatible with universal compaction. // // Dynamically changeable through the SetOptions() API Temperature last_level_temperature = Temperature::kUnknown; @@ -1090,6 +1078,13 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API. uint32_t bottommost_file_compaction_delay = 0; + // Enables additional integrity checks during reads/scans. + // Specifically, for skiplist-based memtables, we verify that keys visited + // are in order. This is helpful to detect corrupted memtable keys during + // reads. Enabling this feature incurs a performance overhead due to an + // additional key comparison during memtable lookup. + bool paranoid_memory_checks = false; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index a259696b29..83f2dd6053 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -30,6 +30,7 @@ #include "rocksdb/port_defs.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" +#include "rocksdb/types.h" #ifdef _WIN32 // Windows API macro interference @@ -159,6 +160,9 @@ class Env : public Customizable { // Size of file in bytes uint64_t size_bytes; + + // EXPERIMENTAL - only provided by some implementations + Temperature temperature = Temperature::kUnknown; }; Env(); diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 8d21c91946..042b38305c 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -195,7 +195,9 @@ struct FileOptions : EnvOptions { FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + : EnvOptions(opts), + temperature(opts.metadata_write_temperature), + handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} @@ -1952,7 +1954,8 @@ class FSDirectoryWrapper : public FSDirectory { // A utility routine: write "data" to the named file. 
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync = false, - const IOOptions& io_options = IOOptions()); + const IOOptions& io_options = IOOptions(), + const FileOptions& file_options = FileOptions()); // A utility routine: read contents of named file into *data IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 039b826de7..e40e483c39 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -47,7 +47,8 @@ class FilterBitsReader; // structs because this is expected to be a temporary, stack-allocated object. struct FilterBuildingContext { // This constructor is for internal use only and subject to change. - FilterBuildingContext(const BlockBasedTableOptions& table_options); + // Keeps a reference to table_options. + explicit FilterBuildingContext(const BlockBasedTableOptions& table_options); // Options for the table being built const BlockBasedTableOptions& table_options; diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index d109a542fe..fd63f127f4 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -194,6 +194,15 @@ class MemTableRep { virtual void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)); + // Same as Get() but performs data integrity validation. + virtual Status GetAndValidate(const LookupKey& /* k */, + void* /* callback_args */, + bool (* /* callback_func */)(void* arg, + const char* entry), + bool /*allow_data_in_error*/) { + return Status::NotSupported("GetAndValidate() not implemented."); + } + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, const Slice& /*end_key*/) { return 0; @@ -235,13 +244,38 @@ class MemTableRep { // REQUIRES: Valid() virtual void Next() = 0; + // Advances to the next position and performs integrity validations on the + // skip list. Iterator becomes invalid and Corruption is returned if a + // corruption is found. + // REQUIRES: Valid() + virtual Status NextAndValidate(bool /* allow_data_in_errors */) { + return Status::NotSupported("NextAndValidate() not implemented."); + } + // Advances to the previous position. // REQUIRES: Valid() virtual void Prev() = 0; + // Advances to the previous position and performs integrity validations on + // the skip list. Iterator becomes invalid and Corruption is returned if a + // corruption is found. + // REQUIRES: Valid() + virtual Status PrevAndValidate(bool /* allow_data_in_errors */) { + return Status::NotSupported("PrevAndValidate() not implemented."); + } + // Advance to the first entry with a key >= target virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + // Seek and perform integrity validations on the skip list. + // Iterator becomes invalid and Corruption is returned if a + // corruption is found. 
+ virtual Status SeekAndValidate(const Slice& /* internal_key */, + const char* /* memtable_key */, + bool /* allow_data_in_errors */) { + return Status::NotSupported("SeekAndValidate() not implemented."); + } + // Retreat to the last entry with a key <= target virtual void SeekForPrev(const Slice& internal_key, const char* memtable_key) = 0; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0018d9f800..e3eb0368d0 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -512,6 +512,10 @@ class CompactionService : public Customizable { return CompactionServiceJobStatus::kUseLocal; } + // Optional callback function upon installation. + virtual void OnInstallation(const std::string& /*scheduled_job_id*/, + CompactionServiceJobStatus /*status*/) {} + // Deprecated. Please implement Schedule() and Wait() API to handle remote // compaction @@ -1434,7 +1438,17 @@ struct DBOptions { // For example, if an SST or blob file referenced by the MANIFEST is missing, // BER might be able to find a set of files corresponding to an old "point in // time" version of the column family, possibly from an older MANIFEST - // file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are + // file. + // Besides a complete "point in time" version, an incomplete version with + // only a suffix of L0 files missing can also be recovered to, provided the + // versioning history doesn't include an atomic flush. From the user's + // perspective, missing a suffix of L0 files means missing the most + // recently written data. So the remaining available files still present a + // valid point-in-time view, just of some earlier time. This is not done + // when the history includes an atomic flush, because atomic flush + // guarantees a consistent view across column families, which cannot be + // guaranteed when recovering an incomplete version. + // Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are // either ignored or replaced with BER, or quietly fixed regardless of BER // setting. BER does require at least one valid MANIFEST to recover to a // non-trivial DB state, unlike `ldb repair`. @@ -1566,6 +1580,16 @@ struct DBOptions { // Default 100ms uint64_t follower_catchup_retry_wait_ms = 100; + // When DB files other than SST, blob and WAL files are created, use this + // filesystem temperature. (See also `wal_write_temperature` and various + // `*_temperature` CF options.) When not `kUnknown`, this overrides any + // temperature set by OptimizeForManifestWrite functions. + Temperature metadata_write_temperature = Temperature::kUnknown; + + // Use this filesystem temperature when creating WAL files. When not + // `kUnknown`, this overrides any temperature set by OptimizeForLogWrite + // functions. + Temperature wal_write_temperature = Temperature::kUnknown; // End EXPERIMENTAL }; @@ -2107,6 +2131,8 @@ struct CompactRangeOptions { // IngestExternalFileOptions is used by IngestExternalFile() struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. + // Note that original file links will be removed after successful ingestion, + // unless `allow_db_generated_files` is true. bool move_files = false; // If set to true, ingestion falls back to copy when move fails.
bool failed_move_fall_back_to_copy = true; @@ -2180,22 +2206,19 @@ struct IngestExternalFileOptions { // XXX: "bottommost" is obsolete/confusing terminology to refer to last level bool fail_if_not_bottommost_level = false; // EXPERIMENTAL - // If set to true, ingestion will - // - allow the files to not be generated by SstFileWriter, and - // - ignore cf_id mismatch between cf_id in the files and the CF they are - // being ingested into. - // - // REQUIRES: - // - files to be ingested do not overlap with existing keys. - // - write_global_seqno = false - // - move_files = false - // - // Warning: This ONLY works for SST files where all keys have sequence number - // zero and with no duplicated user keys (this should be guaranteed if the - // file is generated by a DB with zero as the largest sequence number). - // We scan the entire SST files to validate sequence numbers. - // Warning: If a DB contains ingested files generated by another DB/CF, - // RepairDB() may not correctly recover these files. It may lose these files. + // Enables ingestion of files not generated by SstFileWriter. When true: + // - Allows files to be ingested when their cf_id doesn't match the CF they + // are being ingested into. + // - Preserves original file links after successful ingestion when + // `move_files = true`. + // REQUIREMENTS: + // - Ingested files must not overlap with existing keys. + // - `write_global_seqno` must be false. + // - All keys in ingested files should have sequence number 0. We fail + // ingestion if any sequence number is non-zero. + // WARNING: If a DB contains ingested files generated by another DB/CF, + // RepairDB() may not recover these files correctly, potentially leading to + // data loss. bool allow_db_generated_files = false; }; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 47bf8445fc..00b95e8d1f 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -529,6 +529,11 @@ enum Tickers : uint32_t { // Footer corruption detected when opening an SST file for reading SST_FOOTER_CORRUPTION_COUNT, + // Counters for file read retries with the verify_and_reconstruct_read + // file system option after detecting a checksum mismatch + FILE_READ_CORRUPTION_RETRY_COUNT, + FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT, + TICKER_ENUM_MAX }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index a801a3349a..c7fe503ff0 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -291,15 +291,11 @@ struct BlockBasedTableOptions { // Same as block_restart_interval but used for the index block. int index_block_restart_interval = 1; - // Block size for partitioned metadata. Currently applied to indexes when - // kTwoLevelIndexSearch is used and to filters when partition_filters is used. - // Note: Since in the current implementation the filters and index partitions - // are aligned, an index/filter block is created when either index or filter - // block size reaches the specified limit. - // Note: this limit is currently applied to only index blocks; a filter - // partition is cut right after an index block is cut - // TODO(myabandeh): remove the note above when filter partitions are cut - // separately + // Target block size for partitioned metadata. Currently applied to indexes + // when kTwoLevelIndexSearch is used and to filters when partition_filters is + // used. When decouple_partitioned_filters=false (original behavior), there is + // much more deviation from this target size.
See the comment on + // decouple_partitioned_filters. uint64_t metadata_block_size = 4096; // `cache_usage_options` allows users to specify the default @@ -398,6 +394,23 @@ struct BlockBasedTableOptions { // block cache even when cache_index_and_filter_blocks=false. bool partition_filters = false; + // When both partitioned indexes and partitioned filters are enabled, + // this enables independent partitioning boundaries between the two. Most + // notably, this enables these metadata blocks to hit their target size much + // more accurately, as there is often a disparity between index sizes and + // filter sizes. This should reduce fragmentation and metadata overheads in + // the block cache, as well as treat blocks more fairly for cache eviction + // purposes. + // + // There are no SST format compatibility issues with this option. (All + // versions of RocksDB able to read partitioned filters are able to read + // decoupled partitioned filters.) + // + // decouple_partitioned_filters = false is the original behavior, because of + // limitations in the initial implementation, and the new behavior + // decouple_partitioned_filters = true is expected to become the new default. + bool decouple_partitioned_filters = false; + // Option to generate Bloom/Ribbon filters that minimize memory // internal fragmentation. // @@ -679,6 +692,11 @@ struct BlockBasedTablePropertyNames { static const std::string kWholeKeyFiltering; // value is "1" for true and "0" for false. static const std::string kPrefixFiltering; + // Set to "1" when partitioned filters are decoupled from partitioned indexes. + // This metadata is recorded in case a read-time optimization for coupled + // filter+index partitioning is ever developed; that optimization/assumption + // would be disabled when this is set. + static const std::string kDecoupledPartitionedFilters; }; // Create default block based table factory. diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 225cd7788a..f247d339aa 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -74,6 +74,7 @@ struct TablePropertiesNames { static const std::string kSequenceNumberTimeMapping; static const std::string kTailStartOffset; static const std::string kUserDefinedTimestampsPersisted; + static const std::string kKeyLargestSeqno; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -125,6 +126,8 @@ class TablePropertiesCollector { // Finish() will be called when a table has already been built and is ready // for writing the properties block. // It will be called only once by RocksDB internal. + // When the returned Status is not OK, the collected properties will not be + // written to the file's property block. // // @params properties User will add their collected statistics to // `properties`. @@ -132,6 +135,7 @@ class TablePropertiesCollector { // Return the human-readable properties, where the key is property name and // the value is the human-readable form of value. + // Returned properties are used for logging. // It will only be called after Finish() has been called by RocksDB internal. virtual UserCollectedProperties GetReadableProperties() const = 0; @@ -290,6 +294,12 @@ struct TableProperties { // it's explicitly written to meta properties block. uint64_t user_defined_timestamps_persisted = 1; + // The largest sequence number of keys in this file. + // UINT64_MAX means unknown. 
+ // Only written to properties block if known (should be known unless the + // table is empty). + uint64_t key_largest_seqno = UINT64_MAX; + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index dceacbbefc..368736cbd0 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -110,4 +110,16 @@ enum class WriteStallCondition { kNormal, }; +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, + kLastTemperature, +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 62aa786270..7022666400 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -323,6 +323,22 @@ struct TransactionOptions { // description. If a negative value is specified, then the default value from // TransactionDBOptions is used. int64_t write_batch_flush_threshold = -1; + + // DO NOT USE. + // This is only a temporary option dedicated to MyRocks that will soon be + // removed. + // In normal use cases, meta info like a column family's timestamp size is + // tracked at the transaction layer, so it's unnecessary and even + // detrimental to track such info inside the internal WriteBatch, because + // doing so enables anti-patterns like bypassing the Transaction write APIs + // and writing directly to the internal `WriteBatch` retrieved like this: + // https://github.com/facebook/mysql-5.6/blob/fb-mysql-8.0.32/storage/rocksdb/ha_rocksdb.cc#L4949-L4950 + // Setting this option to true keeps the aforementioned use case working + // until it's refactored out. + // When this flag is enabled, we also intentionally only track the timestamp + // size in the APIs that MyRocks currently uses, including Put, Merge, + // Delete, DeleteRange, and SingleDelete. + bool write_batch_track_timestamp_size = false; }; // The per-write optimizations that do not involve transactions. TransactionDB diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index f1bc9f987e..fbbd9765ce 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -12,7 +12,7 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. #define ROCKSDB_MAJOR 9 -#define ROCKSDB_MINOR 6 +#define ROCKSDB_MINOR 7 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index dfdd2834bf..df7048af36 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include <unordered_map> #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -437,6 +437,30 @@ class WriteBatch : public WriteBatchBase { Status UpdateTimestamps(const Slice& ts, std::function<size_t(uint32_t /*cf*/)> ts_sz_func); + // TODO: remove these internal APIs after MyRocks refactor to not directly + // write to a `WriteBatch` retrieved from `Transaction` via + // `Transaction::GetWriteBatch`.
+ + void SetTrackTimestampSize(bool track_timestamp_size) { + track_timestamp_size_ = track_timestamp_size; + } + + inline void MaybeTrackTimestampSize(uint32_t column_family_id, size_t ts_sz) { + if (!track_timestamp_size_) { + return; + } + auto iter = cf_id_to_ts_sz_.find(column_family_id); + if (iter == cf_id_to_ts_sz_.end()) { + cf_id_to_ts_sz_.emplace(column_family_id, ts_sz); + } + } + + // Return a mapping from column family id to timestamp size of all the column + // families involved in this WriteBatch. + const std::unordered_map<uint32_t, size_t>& GetColumnFamilyToTimestampSize() { + return cf_id_to_ts_sz_; + } + // Verify the per-key-value checksums of this write batch. // Corruption status will be returned if the verification fails. // If this write batch does not have per-key-value checksum, @@ -511,6 +535,10 @@ size_t default_cf_ts_sz_ = 0; + bool track_timestamp_size_ = false; + + std::unordered_map<uint32_t, size_t> cf_id_to_ts_sz_; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ }; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 2e66bb2719..bbb37138d4 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5317,6 +5317,10 @@ class TickerTypeJni { return -0x53; case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT: return -0x55; + case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT: + return -0x56; + case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT: + return -0x57; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // -0x54 is the max value at this time. Since these values are exposed // directly to Java clients, we'll keep the value the same till the next @@ -5774,6 +5778,11 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS; case -0x55: return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT; + case -0x56: + return ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT; + case -0x57: + return ROCKSDB_NAMESPACE::Tickers:: + FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT; case -0x54: // -0x54 is the max value at this time. Since these values are exposed // directly to Java clients, we'll keep the value the same till the next diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 90f0b6ba2e..3b488660e8 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -878,6 +878,10 @@ public enum TickerType { SST_FOOTER_CORRUPTION_COUNT((byte) -0x55), + FILE_READ_CORRUPTION_RETRY_COUNT((byte) -0x56), + + FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57), + TICKER_ENUM_MAX((byte) -0x54); private final byte value;
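The WriteBatch hooks above exist only to serve TransactionOptions::write_batch_track_timestamp_size. A sketch of the intended flow (it assumes, per the option comment earlier, that the transaction layer calls SetTrackTimestampSize(true) on its internal batch when the option is set; the UDT-enabled column family setup is elided):

    // Sketch: the MyRocks-style anti-pattern this option keeps working.
    TransactionOptions txn_opts;
    txn_opts.write_batch_track_timestamp_size = true;
    Transaction* txn = txn_db->BeginTransaction(WriteOptions(), txn_opts);
    // Bypass the Transaction write APIs and write to the internal batch.
    WriteBatch* wb = txn->GetWriteBatch()->GetWriteBatch();
    wb->Put(cf_with_udt, "key", "value");
    // Timestamp sizes were recorded per column family id as keys were added.
    const std::unordered_map<uint32_t, size_t>& ts_sz_map =
        wb->GetColumnFamilyToTimestampSize();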
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 8e2d548b43..06ef0397a2 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -52,6 +52,7 @@ #include "port/likely.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/random.h" @@ -169,13 +170,20 @@ class InlineSkipList { // REQUIRES: Valid() void Next(); + [[nodiscard]] Status NextAndValidate(bool allow_data_in_errors); + // Advances to the previous position. // REQUIRES: Valid() void Prev(); + [[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors); + // Advance to the first entry with a key >= target void Seek(const char* target); + [[nodiscard]] Status SeekAndValidate(const char* target, + bool allow_data_in_errors); + // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); @@ -237,21 +245,20 @@ bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. - // Return nullptr if there is no such node. - Node* FindGreaterOrEqual(const char* key) const; + // Returns nullptr if there is no such node. + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const; - // Return the latest node with a key < key. - // Return head_ if there is no such node. + // Returns the latest node with a key < key. + // Returns head_ if there is no such node. // Fills prev[level] with pointer to previous node at "level" for every // level in [0..max_height_-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev = nullptr) const; - - // Return the latest node with a key < key on bottom_level. Start searching - // from root node on the level below top_level. - // Fills prev[level] with pointer to previous node at "level" for every - // level in [bottom_level..top_level-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level, - int bottom_level) const; + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindLessThan(const char* key, Node** out_of_order_node) const; // Return the last node in the list. // Return head_ if list is empty. @@ -274,6 +281,8 @@ // lowest_level (inclusive). void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); + + static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors); }; // Implementation details follow @@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() { node_ = node_->Next(0); } +template <class Comparator> +inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate( + bool allow_data_in_errors) { + assert(Valid()); + Node* prev_node = node_; + node_ = node_->Next(0); + // Verify that keys are increasing. + if (prev_node != list_->head_ && node_ != nullptr && + list_->compare_(prev_node->Key(), node_->Key()) >= 0) { + Node* node = node_; + // invalidates the iterator + node_ = nullptr; + return Corruption(prev_node, node, allow_data_in_errors); + } + return Status::OK(); +} + template <class Comparator> inline void InlineSkipList<Comparator>::Iterator::Prev() { // Instead of using explicit "prev" links, we just search for the // last node that falls before key. assert(Valid()); - node_ = list_->FindLessThan(node_->Key()); + node_ = list_->FindLessThan(node_->Key(), nullptr); if (node_ == list_->head_) { node_ = nullptr; } } +template <class Comparator> +inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate( + const bool allow_data_in_errors) { + assert(Valid()); + // Skip list validation is done in FindLessThan().
+ Node* out_of_order_node = nullptr; + node_ = list_->FindLessThan(node_->Key(), &out_of_order_node); + if (out_of_order_node) { + Node* node = node_; + node_ = nullptr; + return Corruption(node, out_of_order_node, allow_data_in_errors); + } + if (node_ == list_->head_) { + node_ = nullptr; + } + return Status::OK(); +} + template <class Comparator> inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) { - node_ = list_->FindGreaterOrEqual(target); + node_ = list_->FindGreaterOrEqual(target, nullptr); +} + +template <class Comparator> +inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate( + const char* target, const bool allow_data_in_errors) { + Node* out_of_order_node = nullptr; + node_ = list_->FindGreaterOrEqual(target, &out_of_order_node); + if (out_of_order_node) { + Node* node = node_; + node_ = nullptr; + return Corruption(node, out_of_order_node, allow_data_in_errors); + } + return Status::OK(); } template <class Comparator> @@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() { rnd->Next() < kScaledInverseBranching_) { height++; } + TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height); assert(height > 0); assert(height <= kMaxHeight_); assert(height <= kMaxPossibleHeight); @@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key, template <class Comparator> typename InlineSkipList<Comparator>::Node* -InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const { +InlineSkipList<Comparator>::FindGreaterOrEqual( + const char* key, Node** const out_of_order_node) const { // Note: It looks like we could reduce duplication by implementing // this function as FindLessThan(key)->Next(0), but we wouldn't be able // to exit early on equality and the result wouldn't even be correct. @@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const { Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); + if (out_of_order_node && x != head_ && + compare_(x->Key(), next->Key()) >= 0) { + *out_of_order_node = next; + return x; + } } // Make sure the lists are sorted assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); @@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const { template <class Comparator> typename InlineSkipList<Comparator>::Node* -InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const { - return FindLessThan(key, prev, head_, GetMaxHeight(), 0); -} - -template <class Comparator> -typename InlineSkipList<Comparator>::Node* -InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev, - Node* root, int top_level, - int bottom_level) const { - assert(top_level > bottom_level); - int level = top_level - 1; - Node* x = root; +InlineSkipList<Comparator>::FindLessThan(const char* key, + Node** const out_of_order_node) const { + int level = GetMaxHeight() - 1; + assert(level >= 0); + Node* x = head_; // KeyIsAfter(key, last_not_after) is definitely false Node* last_not_after = nullptr; const DecodedKey key_decoded = compare_.decode_key(key); @@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev, Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); + if (out_of_order_node && x != head_ && + compare_(x->Key(), next->Key()) >= 0) { + *out_of_order_node = next; + return x; + } } assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); assert(x == head_ || KeyIsAfterNode(key_decoded, x)); if (KeyIsAfterNode(key_decoded, next)) { assert(next != nullptr); x = next; } else { - if (prev != nullptr) { - prev[level] = x; - } - if (level == bottom_level) { + if (level == 0) { return x; } else { // Switch to next list, reuse KeyIsAfterNode() result
@@ -910,12 +969,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice, while (true) { // Checking for duplicate keys on the level 0 is sufficient if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && - compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + compare_(splice->next_[i]->Key(), key_decoded) <= 0)) { // duplicate key return false; } if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && - compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) { // duplicate key return false; } @@ -953,12 +1012,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice, } // Checking for duplicate keys on the level 0 is sufficient if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && - compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + compare_(splice->next_[i]->Key(), key_decoded) <= 0)) { // duplicate key return false; } if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && - compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) { // duplicate key return false; } @@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice, template <class Comparator> bool InlineSkipList<Comparator>::Contains(const char* key) const { - Node* x = FindGreaterOrEqual(key); + Node* x = FindGreaterOrEqual(key, nullptr); if (x != nullptr && Equal(key, x->Key())) { return true; } else { @@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const { } } +template <class Comparator> +Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next, + bool allow_data_in_errors) { + std::string msg = "Out-of-order keys found in skiplist."; + if (allow_data_in_errors) { + msg.append(" prev key: " + Slice(prev->Key()).ToString(true)); + msg.append(" next key: " + Slice(next->Key()).ToString(true)); + } + return Status::Corruption(msg); +} } // namespace ROCKSDB_NAMESPACE
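These *AndValidate traversals surface out-of-order skip list nodes as Status::Corruption instead of silently returning wrong results. A consumer-side sketch (hypothetical scan function; the read path wires this up internally when paranoid_memory_checks is enabled):

    // Sketch: scan a memtable rep, stopping with Corruption if adjacent
    // skip list nodes compare out of order.
    Status ScanWithValidation(MemTableRep* rep, const Slice& start_ikey) {
      MemTableRep::Iterator* iter = rep->GetIterator();
      Status s = iter->SeekAndValidate(start_ikey, /*memtable_key=*/nullptr,
                                       /*allow_data_in_errors=*/false);
      while (s.ok() && iter->Valid()) {
        // iter->key() is the encoded internal key; consume it here.
        s = iter->NextAndValidate(/*allow_data_in_errors=*/false);
      }
      delete iter;
      return s;  // Status::Corruption on a detected ordering violation
    }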
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index e615ef9f68..3b2f3f4d8d 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep { } } + Status GetAndValidate(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry), + bool allow_data_in_errors) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(), + allow_data_in_errors); + for (; iter.Valid() && status.ok() && + callback_func(callback_args, iter.key()); + status = iter.NextAndValidate(allow_data_in_errors)) { + } + return status; + } + uint64_t ApproximateNumEntries(const Slice& start_ikey, const Slice& end_ikey) override { std::string tmp; @@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep { // Returns the key at the current position. // REQUIRES: Valid() - const char* key() const override { return iter_.key(); } + const char* key() const override { + assert(Valid()); + return iter_.key(); + } // Advances to the next position. // REQUIRES: Valid() - void Next() override { iter_.Next(); } + void Next() override { + assert(Valid()); + iter_.Next(); + } // Advances to the previous position. // REQUIRES: Valid() - void Prev() override { iter_.Prev(); } + void Prev() override { + assert(Valid()); + iter_.Prev(); + } // Advance to the first entry with a key >= target void Seek(const Slice& user_key, const char* memtable_key) override { @@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep { // Final state of iterator is Valid() iff list is not empty. void SeekToLast() override { iter_.SeekToLast(); } + Status NextAndValidate(bool allow_data_in_errors) override { + assert(Valid()); + return iter_.NextAndValidate(allow_data_in_errors); + } + + Status SeekAndValidate(const Slice& user_key, const char* memtable_key, + bool allow_data_in_errors) override { + if (memtable_key != nullptr) { + return iter_.SeekAndValidate(memtable_key, allow_data_in_errors); + } else { + return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key), + allow_data_in_errors); + } + } + + Status PrevAndValidate(bool allow_data_in_error) override { + assert(Valid()); + return iter_.PrevAndValidate(allow_data_in_error); + } + protected: std::string tmp_; // For passing to EncodeKey }; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index ed9a089af5..05163d3e29 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -266,6 +266,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"}, {PREFETCH_HITS, "rocksdb.prefetch.hits"}, {SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"}, + {FILE_READ_CORRUPTION_RETRY_COUNT, + "rocksdb.file.read.corruption.retry.count"}, + {FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT, + "rocksdb.file.read.corruption.retry.success.count"}, }; const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
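The two new tickers make retry effectiveness observable: the first counts reads that hit a checksum mismatch and were retried through a FileSystem supporting verify_and_reconstruct_read, the second counts retries that then succeeded. A monitoring sketch (assumes statistics are enabled on the DB's Options):

    // Sketch: fraction of checksum-mismatch retries that succeeded.
    std::shared_ptr<Statistics> stats = options.statistics;
    uint64_t retries =
        stats->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT);
    uint64_t successes =
        stats->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
    double success_rate =
        retries == 0 ? 1.0 : static_cast<double>(successes) / retries;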
diff --git a/options/cf_options.cc b/options/cf_options.cc index cc9e630b9c..7f2cd03132 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo> {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), OptionType::kUInt8T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"paranoid_memory_checks", + {offsetof(struct MutableCFOptions, paranoid_memory_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, @@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const { ttl); ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); + ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d", + paranoid_memory_checks); std::string result; char buf[10]; for (const auto m : max_bytes_for_level_multiplier_additional) { diff --git a/options/cf_options.h b/options/cf_options.h index 372a0daf54..3a0c3b09a8 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -168,6 +168,7 @@ struct MutableCFOptions { memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), block_protection_bytes_per_key(options.block_protection_bytes_per_key), + paranoid_memory_checks(options.paranoid_memory_checks), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? compression_per_level(options.compression_per_level), @@ -317,6 +318,7 @@ struct MutableCFOptions { Temperature default_write_temperature; uint32_t memtable_protection_bytes_per_key; uint8_t block_protection_bytes_per_key; + bool paranoid_memory_checks; uint64_t sample_for_compression; std::vector<CompressionType> compression_per_level;
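Since paranoid_memory_checks is registered with OptionTypeFlags::kMutable, it can be flipped on a live DB. A sketch of runtime toggling (standard SetOptions usage; the string key matches the type-info entry above):

    // Sketch: enable extra memtable integrity checks without a reopen.
    // Expect some read overhead from the added key comparisons.
    Status s = db->SetOptions(cf_handle, {{"paranoid_memory_checks", "true"}});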
"true" : "false"); + ROCKS_LOG_HEADER(log, " Options.metadata_write_temperature: %s", + temperature_to_string[metadata_write_temperature].c_str()); + ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s", + temperature_to_string[wal_write_temperature].c_str()); } bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { diff --git a/options/db_options.h b/options/db_options.h index 5de6ab498a..7e07526269 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -103,6 +103,8 @@ struct ImmutableDBOptions { uint64_t follower_refresh_catchup_period_ms; uint64_t follower_catchup_retry_count; uint64_t follower_catchup_retry_wait_ms; + Temperature metadata_write_temperature; + Temperature wal_write_temperature; // Beginning convenience/helper objects that are not part of the base // DBOptions diff --git a/options/options_helper.cc b/options/options_helper.cc index 5cc13f4fe4..011f47b984 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -180,6 +180,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.enforce_single_del_contracts = immutable_db_options.enforce_single_del_contracts; options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc; + options.follower_refresh_catchup_period_ms = + immutable_db_options.follower_refresh_catchup_period_ms; + options.follower_catchup_retry_count = + immutable_db_options.follower_catchup_retry_count; + options.follower_catchup_retry_wait_ms = + immutable_db_options.follower_catchup_retry_wait_ms; + options.metadata_write_temperature = + immutable_db_options.metadata_write_temperature; + options.wal_write_temperature = immutable_db_options.wal_write_temperature; return options; } @@ -213,6 +222,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.memtable_protection_bytes_per_key; cf_opts->block_protection_bytes_per_key = moptions.block_protection_bytes_per_key; + cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks; cf_opts->bottommost_file_compaction_delay = moptions.bottommost_file_compaction_delay; diff --git a/options/options_parser.cc b/options/options_parser.cc index ec32f76447..4e249908be 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -69,8 +69,9 @@ Status PersistRocksDBOptions(const WriteOptions& write_options, } std::unique_ptr wf; - Status s = - fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr); + FileOptions file_options; + file_options.temperature = db_opt.metadata_write_temperature; + Status s = fs->NewWritableFile(file_name, file_options, &wf, nullptr); if (!s.ok()) { return s; } diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 1fec8c20ab..67aab055e1 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -188,6 +188,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "block_size_deviation=8;block_restart_interval=4; " "metadata_block_size=1024;" "partition_filters=false;" + "decouple_partitioned_filters=true;" "optimize_filters_for_memory=true;" "use_delta_encoding=true;" "index_block_restart_interval=4;" @@ -366,7 +367,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "lowest_used_cache_tier=kNonVolatileBlockTier;" "allow_data_in_errors=false;" "enforce_single_del_contracts=false;" - "daily_offpeak_time_utc=08:30-19:00;", + "daily_offpeak_time_utc=08:30-19:00;" + "follower_refresh_catchup_period_ms=123;" + "follower_catchup_retry_count=456;" + "follower_catchup_retry_wait_ms=789;" + 
"metadata_write_temperature=kCold;" + "wal_write_temperature=kHot;", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -567,7 +573,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "block_protection_bytes_per_key=1;" "memtable_max_range_deletions=999999;" "bottommost_file_compaction_delay=7200;" - "uncache_aggressiveness=1234;", + "uncache_aggressiveness=1234;" + "paranoid_memory_checks=1;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index c59097b84d..ec24721b7a 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -96,7 +96,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, use_delta_encoding_for_index_values, p_index_builder, partition_size, - ts_sz, persist_user_defined_timestamps); + ts_sz, persist_user_defined_timestamps, + table_opt.decouple_partitioned_filters); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -213,10 +214,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector public: explicit BlockBasedTablePropertiesCollector( BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, - bool prefix_filtering) + bool prefix_filtering, bool decoupled_partitioned_filters) : index_type_(index_type), whole_key_filtering_(whole_key_filtering), - prefix_filtering_(prefix_filtering) {} + prefix_filtering_(prefix_filtering), + decoupled_partitioned_filters_(decoupled_partitioned_filters) {} Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, uint64_t /*file_size*/) override { @@ -240,6 +242,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector whole_key_filtering_ ? kPropTrue : kPropFalse}); properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering, prefix_filtering_ ? 
kPropTrue : kPropFalse}); + if (decoupled_partitioned_filters_) { + properties->insert( + {BlockBasedTablePropertyNames::kDecoupledPartitionedFilters, + kPropTrue}); + } return Status::OK(); } @@ -257,6 +264,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector BlockBasedTableOptions::IndexType index_type_; bool whole_key_filtering_; bool prefix_filtering_; + bool decoupled_partitioned_filters_; }; struct BlockBasedTableBuilder::Rep { @@ -296,7 +304,7 @@ struct BlockBasedTableBuilder::Rep { std::string index_separator_scratch; PartitionedIndexBuilder* p_index_builder_ = nullptr; - std::string last_key; + std::string last_ikey; // Internal key or empty (unset) const Slice* first_key_in_next_block = nullptr; CompressionType compression_type; uint64_t sample_for_compression; @@ -594,7 +602,8 @@ struct BlockBasedTableBuilder::Rep { table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - prefix_extractor != nullptr)); + prefix_extractor != nullptr, + table_options.decouple_partitioned_filters)); if (ts_sz > 0 && persist_user_defined_timestamps) { table_properties_collectors.emplace_back( new TimestampTablePropertiesCollector( @@ -618,6 +627,9 @@ struct BlockBasedTableBuilder::Rep { if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); } + // Default is UINT64_MAX for unknown. Setting it to 0 here + // to allow updating it by taking max in BlockBasedTableBuilder::Add(). + props.key_largest_seqno = 0; if (FormatVersionUsesContextChecksum(table_options.format_version)) { // Must be non-zero and semi- or quasi-random @@ -654,6 +666,7 @@ struct BlockBasedTableBuilder::Rep { }; struct BlockBasedTableBuilder::ParallelCompressionRep { + // TODO: consider replacing with autovector or similar // Keys is a wrapper of vector of strings avoiding // releasing string memories during vector clear() // in order to save memory allocation overhead @@ -998,24 +1011,27 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() { delete rep_; } -void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { +void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) { Rep* r = rep_; assert(rep_->state != Rep::State::kClosed); if (!ok()) { return; } - ValueType value_type = ExtractValueType(key); + ValueType value_type; + SequenceNumber seq; + UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type); + r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq); if (IsValueType(value_type)) { #ifndef NDEBUG if (r->props.num_entries > r->props.num_range_deletions) { - assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0); } #endif // !NDEBUG - auto should_flush = r->flush_block_policy->Update(key, value); + auto should_flush = r->flush_block_policy->Update(ikey, value); if (should_flush) { assert(!r->data_block.empty()); - r->first_key_in_next_block = &key; + r->first_key_in_next_block = &ikey; Flush(); if (r->state == Rep::State::kBuffered) { bool exceeds_buffer_limit = @@ -1050,7 +1066,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->IsParallelCompressionEnabled()) { r->pc_rep->curr_block_keys->Clear(); } else { - r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle, + r->index_builder->AddIndexEntry(r->last_ikey, &ikey, + r->pending_handle, 
&r->index_separator_scratch); } } @@ -1060,27 +1077,31 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // builder after being added to index builder. if (r->state == Rep::State::kUnbuffered) { if (r->IsParallelCompressionEnabled()) { - r->pc_rep->curr_block_keys->PushBack(key); + r->pc_rep->curr_block_keys->PushBack(ikey); } else { if (r->filter_builder != nullptr) { - r->filter_builder->Add( - ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); + r->filter_builder->AddWithPrevKey( + ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz), + r->last_ikey.empty() + ? Slice{} + : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz)); } } } - r->data_block.AddWithLastKey(key, value, r->last_key); - r->last_key.assign(key.data(), key.size()); + r->data_block.AddWithLastKey(ikey, value, r->last_ikey); + r->last_ikey.assign(ikey.data(), ikey.size()); + assert(!r->last_ikey.empty()); if (r->state == Rep::State::kBuffered) { // Buffered keys will be replayed from data_block_buffers during // `Finish()` once compression dictionary has been finalized. } else { if (!r->IsParallelCompressionEnabled()) { - r->index_builder->OnKeyAdded(key); + r->index_builder->OnKeyAdded(ikey); } } // TODO offset passed in is not accurate for parallel compression case - NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(), r->table_properties_collectors, r->ioptions.logger); @@ -1094,9 +1115,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) { persisted_end = StripTimestampFromUserKey(value, r->ts_sz); } - r->range_del_block.Add(key, persisted_end); + r->range_del_block.Add(ikey, persisted_end); // TODO offset passed in is not accurate for parallel compression case - NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(), r->table_properties_collectors, r->ioptions.logger); } else { @@ -1108,7 +1129,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { } r->props.num_entries++; - r->props.raw_key_size += key.size(); + r->props.raw_key_size += ikey.size(); if (!r->persist_user_defined_timestamps) { r->props.raw_key_size -= r->ts_sz; } @@ -1452,6 +1473,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { Rep* r = rep_; ParallelCompressionRep::BlockRepSlot* slot = nullptr; ParallelCompressionRep::BlockRep* block_rep = nullptr; + // Starts empty; see FilterBlockBuilder::AddWithPrevKey + std::string prev_block_last_key_no_ts; while (r->pc_rep->write_queue.pop(slot)) { assert(slot != nullptr); slot->Take(block_rep); @@ -1465,13 +1488,20 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { continue; } + Slice prev_key_no_ts = prev_block_last_key_no_ts; for (size_t i = 0; i < block_rep->keys->Size(); i++) { auto& key = (*block_rep->keys)[i]; if (r->filter_builder != nullptr) { - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); + Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz); + r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts); + prev_key_no_ts = key_no_ts; } r->index_builder->OnKeyAdded(key); } + if (r->filter_builder != nullptr) { + prev_block_last_key_no_ts.assign(prev_key_no_ts.data(), + prev_key_no_ts.size()); + } r->pc_rep->file_size_estimator.SetCurrBlockUncompSize( block_rep->data->size()); @@ -1563,6 +1593,13 @@ void BlockBasedTableBuilder::WriteFilterBlock( // No 
filter block needed return; } + if (!rep_->last_ikey.empty()) { + // We might have been using AddWithPrevKey, so need PrevKeyBeforeFinish + // to be safe. And because we are re-synchronized after buffered/parallel + // operation, rep_->last_ikey is accurate. + rep_->filter_builder->PrevKeyBeforeFinish( + ExtractUserKeyAndStripTimestamp(rep_->last_ikey, rep_->ts_sz)); + } BlockHandle filter_block_handle; bool is_partitioned_filter = rep_->table_options.partition_filters; if (ok()) { @@ -1578,9 +1615,10 @@ void BlockBasedTableBuilder::WriteFilterBlock( // See FilterBlockBuilder::Finish() for more on the difference in // transferred filter data payload among different FilterBlockBuilder // subtypes. - std::unique_ptr filter_data; - Slice filter_content = - rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data); + std::unique_ptr filter_owner; + Slice filter_content; + s = rep_->filter_builder->Finish(filter_block_handle, &filter_content, + &filter_owner); assert(s.ok() || s.IsIncomplete() || s.IsCorruption()); if (s.IsCorruption()) { @@ -1749,6 +1787,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->props.user_defined_timestamps_persisted = rep_->persist_user_defined_timestamps; + assert(IsEmpty() || rep_->props.key_largest_seqno != UINT64_MAX); // Add basic properties property_block_builder.AddTableProperty(rep_->props); @@ -1976,6 +2015,10 @@ void BlockBasedTableBuilder::EnterUnbuffered() { for (; iter->Valid(); iter->Next()) { Slice key = iter->key(); if (r->filter_builder != nullptr) { + // NOTE: AddWithPrevKey here would only save key copying if prev is + // pinned (iter->IsKeyPinned()), which is probably rare with delta + // encoding. OK to go from Add() here to AddWithPrevKey() in + // unbuffered operation. r->filter_builder->Add( ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); } @@ -1989,6 +2032,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() { Slice* first_key_in_next_block_ptr = &first_key_in_next_block; iter->SeekToLast(); + assert(iter->Valid()); r->index_builder->AddIndexEntry( iter->key(), first_key_in_next_block_ptr, r->pending_handle, &r->index_separator_scratch); @@ -2027,7 +2071,7 @@ Status BlockBasedTableBuilder::Finish() { // block, we will finish writing all index entries first. 
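Note on the builder flow above: WriteFilterBlock()/EnterUnbuffered() now mix Add() with AddWithPrevKey(), and AddWithPrevKey() has a strict contract (empty previous key for the first entry, one PrevKeyBeforeFinish() call at the end). A minimal sketch of the pure AddWithPrevKey() lifecycle, assuming keys arrive sorted and already timestamp-stripped; FilterBlockBuilder is the internal interface revised later in this diff:

```cpp
#include <string>
#include <vector>

#include "rocksdb/slice.h"
#include "table/block_based/filter_block.h"  // FilterBlockBuilder (internal)

// Feeds sorted, timestamp-stripped keys using the AddWithPrevKey() protocol:
// an empty Slice stands in as "previous" for the first key, each key then
// accompanies its successor, and the final key goes to PrevKeyBeforeFinish()
// so a partitioned builder can close out its last partition.
void AddAllKeys(rocksdb::FilterBlockBuilder* builder,
                const std::vector<std::string>& sorted_keys_without_ts) {
  rocksdb::Slice prev;  // empty for the first key, per the API contract
  for (const std::string& key : sorted_keys_without_ts) {
    builder->AddWithPrevKey(key, prev);
    prev = key;
  }
  if (!sorted_keys_without_ts.empty()) {
    builder->PrevKeyBeforeFinish(prev);  // required before Finish()
  }
}
```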
if (ok() && !empty_data_block) { r->index_builder->AddIndexEntry( - r->last_key, nullptr /* no next data block */, r->pending_handle, + r->last_ikey, nullptr /* no next data block */, r->pending_handle, &r->index_separator_scratch); } } diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 34081621a4..5382db0976 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -304,6 +304,10 @@ static std::unordered_map {offsetof(struct BlockBasedTableOptions, partition_filters), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"decouple_partitioned_filters", + {offsetof(struct BlockBasedTableOptions, decouple_partitioned_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"optimize_filters_for_memory", {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -971,6 +975,8 @@ const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = "rocksdb.block.based.table.whole.key.filtering"; const std::string BlockBasedTablePropertyNames::kPrefixFiltering = "rocksdb.block.based.table.prefix.filtering"; +const std::string BlockBasedTablePropertyNames::kDecoupledPartitionedFilters = + "rocksdb.block.based.table.decoupled.partitioned.filters"; const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; const std::string kHashIndexPrefixesMetadataBlock = "rocksdb.hashindex.metadata"; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a811c1bf7e..f9bdfc9b07 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -693,6 +693,10 @@ Status BlockBasedTable::Open( s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs, prefetch_buffer.get(), file_size, &footer, kBlockBasedTableMagicNumber); + RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT); + if (s.ok()) { + RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); + } } } if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index 6ab330dea5..608b3882d6 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -223,13 +223,16 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) s = VerifyBlockChecksum(footer, data, handle.size(), rep_->file->file_name(), handle.offset()); RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); + if (!s.ok()) { + RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); + } TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); if (!s.ok() && CheckFSFeatureSupport(ioptions.fs.get(), FSSupportedOps::kVerifyAndReconstructRead)) { assert(s.IsCorruption()); assert(!ioptions.allow_mmap_reads); - RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); + RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT); // Repeat the read for this particular block using the regular // synchronous Read API. 
We can use the same chunk of memory @@ -246,6 +249,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) assert(result.size() == BlockSizeWithTrailer(handle)); s = VerifyBlockChecksum(footer, data, handle.size(), rep_->file->file_name(), handle.offset()); + if (s.ok()) { + RecordTick(ioptions.stats, + FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); + } } else { s = io_s; } diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 1730974489..e2aaea6434 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -52,27 +52,42 @@ class FilterBlockBuilder { virtual ~FilterBlockBuilder() {} - virtual void Add( - const Slice& key_without_ts) = 0; // Add a key to current filter + // Add a key to current filter. + virtual void Add(const Slice& key_without_ts) = 0; + // A potentially more efficient version of Add(), though you cannot go back + // to Add() after using AddWithPrevKey() on a FilterBlockBuilder. + // prev_key_without_ts should be the empty Slice for the first key added + // (regardless of comparator; e.g. for bootstrapping delta encoding). + // More detail: The previous key is used when filters are key-range + // partitioned, and the PartitionedFilterBlockBuilder doesn't need to buffer + // the previous key when it is provided by calling this function. + virtual void AddWithPrevKey(const Slice& key_without_ts, + const Slice& /*prev_key_without_ts*/) = 0; + virtual bool IsEmpty() const = 0; // Empty == none added // For reporting stats on how many entries the builder considered unique virtual size_t EstimateEntriesAdded() = 0; - Slice Finish() { // Generate Filter - const BlockHandle empty_handle; - Status dont_care_status; - auto ret = Finish(empty_handle, &dont_care_status); - assert(dont_care_status.ok()); - return ret; - } - // If filter_data is not nullptr, Finish() may transfer ownership of + + // When using AddWithPrevKey, this must be called before Finish(). (May also + // be called without AddWithPrevKey, but prev_key_without_ts must be + // accurate regardless.) + virtual void PrevKeyBeforeFinish(const Slice& /*prev_key_without_ts*/) {} + + // Generate a filter block. Returns OK if finished, or Incomplete if more + // filters are needed (partitioned filter). In the latter case, subsequent + // calls require the BlockHandle of the most recently generated and written + // filter, in last_partition_block_handle. + // + // If filter_owner is not nullptr, Finish() may transfer ownership of // underlying filter data to the caller, so that it can be freed as soon as // possible. BlockBasedFilterBlock will ignore this parameter. // - virtual Slice Finish( - const BlockHandle& tmp /* only used in PartitionedFilterBlock as - last_partition_block_handle */ - , - Status* status, std::unique_ptr* filter_data = nullptr) = 0; + // For either OK or Incomplete, *filter is set to point to the next filter + // bytes, which survive until either this is destroyed, *filter_owner is + // destroyed, or next call to Finish. 
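The Finish() contract documented above implies a small driver loop on the caller side. A hedged sketch, assuming a hypothetical `write_block` callback that persists a block and returns its handle, and assuming `std::unique_ptr<const char[]>` as the owner type these builders use for filter bytes:

```cpp
#include <functional>
#include <memory>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/block_based/filter_block.h"  // FilterBlockBuilder (internal)
#include "table/format.h"                    // BlockHandle (internal)

// Drives Finish() to completion. A partitioned builder returns Incomplete
// once per partition and needs the BlockHandle of the block just written
// before it can emit the next partition (or the final index on filters).
rocksdb::Status WriteAllFilterBlocks(
    rocksdb::FilterBlockBuilder* builder,
    const std::function<rocksdb::BlockHandle(const rocksdb::Slice&)>&
        write_block /* hypothetical persistence callback */) {
  rocksdb::BlockHandle last_handle;  // default (empty) on the first call
  rocksdb::Status s;
  do {
    rocksdb::Slice filter;
    std::unique_ptr<const char[]> owner;  // may receive the filter bytes
    s = builder->Finish(last_handle, &filter, &owner);
    if (s.ok() || s.IsIncomplete()) {
      last_handle = write_block(filter);
    }
  } while (s.IsIncomplete());
  return s;
}
```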
+ virtual Status Finish( + const BlockHandle& last_partition_block_handle, Slice* filter, + std::unique_ptr* filter_owner = nullptr) = 0; // This is called when finishes using the FilterBitsBuilder // in order to release memory usage and cache charge @@ -85,6 +100,16 @@ class FilterBlockBuilder { virtual Status MaybePostVerifyFilter(const Slice& /* filter_content */) { return Status::OK(); } + +#ifndef NDEBUG + Slice TEST_Finish() { // Generate Filter + const BlockHandle empty_handle; + Slice filter; + Status status = Finish(empty_handle, &filter); + assert(status.ok()); + return filter; + } +#endif // NDEBUG }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 3cd63ffad8..233ced7033 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -82,21 +82,32 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // requirements. if (hash_entries_info_.entries.empty() || hash != hash_entries_info_.entries.back()) { - if (detect_filter_construct_corruption_) { - hash_entries_info_.xor_checksum ^= hash; - } - hash_entries_info_.entries.push_back(hash); - if (cache_res_mgr_ && - // Traditional rounding to whole bucket size - ((hash_entries_info_.entries.size() % - kUint64tHashEntryCacheResBucketSize) == - kUint64tHashEntryCacheResBucketSize / 2)) { - hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); - Status s = cache_res_mgr_->MakeCacheReservation( - kUint64tHashEntryCacheResBucketSize * sizeof(hash), - &hash_entries_info_.cache_res_bucket_handles.back()); - s.PermitUncheckedError(); - } + AddHash(hash); + } + } + + void AddKeyAndAlt(const Slice& key, const Slice& alt) override { + uint64_t key_hash = GetSliceHash64(key); + uint64_t alt_hash = GetSliceHash64(alt); + std::optional prev_key_hash; + std::optional prev_alt_hash = hash_entries_info_.prev_alt_hash; + if (!hash_entries_info_.entries.empty()) { + prev_key_hash = hash_entries_info_.entries.back(); + } + // Add alt first, so that entries.back() always contains previous key + // ASSUMING a change from one alt to the next implies a change to + // corresponding key + if (alt_hash != prev_alt_hash && alt_hash != key_hash && + alt_hash != prev_key_hash) { + AddHash(alt_hash); + } + // Overwrite prev_alt_hash for cases like alt_hash == key_hash + hash_entries_info_.prev_alt_hash = alt_hash; + // NOTE: checking key_hash != prev_alt_hash for cases like + // key == prefix(key) at the end of a prefix grouping as in reverse + // byte-wise comparator + if (key_hash != prev_key_hash && key_hash != prev_alt_hash) { + AddHash(key_hash); } } @@ -116,6 +127,24 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / sizeof(uint64_t); + void AddHash(uint64_t hash) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); + Status s = cache_res_mgr_->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entries_info_.cache_res_bucket_handles.back()); + s.PermitUncheckedError(); + } + } + // For delegating between XXPH3FilterBitsBuilders void 
SwapEntriesWith(XXPH3FilterBitsBuilder* other) { assert(other != nullptr); @@ -266,6 +295,15 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { bool detect_filter_construct_corruption_; struct HashEntriesInfo { +#ifdef ROCKSDB_VALGRIND_RUN + HashEntriesInfo() { + // Valgrind can report uninitialized FPs on std::optional usage. See e.g. + // https://stackoverflow.com/q/51616179 + std::memset((void*)&prev_alt_hash, 0, sizeof(prev_alt_hash)); + prev_alt_hash = {}; + } +#endif + // A deque avoids unnecessary copying of already-saved values // and has near-minimal peak memory use. std::deque entries; @@ -282,17 +320,22 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // Otherwise, it is 0. uint64_t xor_checksum = 0; + // A single-element cache to help AddKeyAndAlt + std::optional prev_alt_hash; + void Swap(HashEntriesInfo* other) { assert(other != nullptr); std::swap(entries, other->entries); std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); std::swap(xor_checksum, other->xor_checksum); + std::swap(prev_alt_hash, other->prev_alt_hash); } void Reset() { entries.clear(); cache_res_bucket_handles.clear(); xor_checksum = 0; + prev_alt_hash = {}; } }; @@ -331,6 +374,14 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { Slice Finish(std::unique_ptr* buf, Status* status) override { size_t num_entries = hash_entries_info_.entries.size(); + if (num_entries == 0) { + // This case migrated from FullFilterBlockBuilder::Finish + if (status) { + *status = Status::OK(); + } + return FinishAlwaysFalse(buf); + } + size_t len_with_metadata = CalculateSpace(num_entries); std::unique_ptr mutable_buf; @@ -1023,6 +1074,7 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { ~LegacyBloomBitsBuilder() override; void AddKey(const Slice& key) override; + void AddKeyAndAlt(const Slice& key, const Slice& alt) override; size_t EstimateEntriesAdded() override { return hash_entries_.size(); } @@ -1050,6 +1102,9 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { int bits_per_key_; int num_probes_; std::vector hash_entries_; + // A single-element cache to help AddKeyAndAlt. 
(-1 == empty) + int64_t prev_alt_hash_ = -1; + Logger* info_log_; // Get totalbits that optimized for cpu cache line @@ -1079,14 +1134,39 @@ LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() = default; void LegacyBloomBitsBuilder::AddKey(const Slice& key) { uint32_t hash = BloomHash(key); - if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + if (hash_entries_.empty() || hash_entries_.back() != hash) { hash_entries_.push_back(hash); } } +void LegacyBloomBitsBuilder::AddKeyAndAlt(const Slice& key, const Slice& alt) { + // Modified from XXPH3FilterBitsBuilder::AddKeyAndAlt + uint32_t key_hash = BloomHash(key); + uint32_t alt_hash = BloomHash(alt); + + int64_t prev_key_hash = -1; + int64_t prev_alt_hash = prev_alt_hash_; + if (!hash_entries_.empty()) { + prev_key_hash = hash_entries_.back(); + } + if (alt_hash != prev_alt_hash && alt_hash != key_hash && + alt_hash != prev_key_hash) { + hash_entries_.push_back(alt_hash); + } + prev_alt_hash_ = alt_hash; + if (key_hash != prev_key_hash && key_hash != prev_alt_hash) { + hash_entries_.push_back(key_hash); + } +} + Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { - uint32_t total_bits, num_lines; size_t num_entries = hash_entries_.size(); + if (num_entries == 0) { + // This case migrated from FullFilterBlockBuilder::Finish + return FinishAlwaysFalse(buf); + } + + uint32_t total_bits, num_lines; char* data = ReserveSpace(static_cast(num_entries), &total_bits, &num_lines); assert(data); @@ -1127,6 +1207,7 @@ Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { const char* const_data = data; buf->reset(const_data); hash_entries_.clear(); + prev_alt_hash_ = -1; return Slice(data, total_bits / 8 + kMetadataLen); } diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 3919c8c6d2..0583274835 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -31,10 +31,22 @@ class FilterBitsBuilder { // added. virtual void AddKey(const Slice& key) = 0; + // Add two entries to the filter, typically a key and, as the alternate, + // its prefix. This differs from AddKey(key); AddKey(alt); in that there + // is extra state for de-duplicating successive `alt` entries, as well + // as successive `key` entries. And there is de-duplication between `key` + // and `alt` entries, even in adjacent calls, because a whole key might + // be its own prefix. More specifically, + // AddKey(k1); + // AddKeyAndAlt(k2, a2); // de-dup k2<>k1, k2<>a2, a2<>k1 + // AddKeyAndAlt(k3, a3); // de-dup k3<>k2, a3<>a2, k3<>a2, a3<>k2 + // AddKey(k4); // de-dup k4<>k3 BUT NOT k4<>a3 + virtual void AddKeyAndAlt(const Slice& key, const Slice& alt) = 0; + // Called by RocksDB before Finish to populate // TableProperties::num_filter_entries, so should represent the - // number of unique keys (and/or prefixes) added, but does not have - // to be exact. `return 0;` may be used to conspicuously indicate "unknown". + // number of unique keys (and/or prefixes) added. MUST return 0 + // if and only if none have been added, but otherwise can be estimated. 
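The de-duplication rules here are dense; a toy mirror of the same bookkeeping may help. This sketch is my own simplification of the AddKeyAndAlt() logic in LegacyBloomBitsBuilder and XXPH3FilterBitsBuilder, comparing raw strings where the real builders compare hashes:

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy mirror of the AddKeyAndAlt() de-dup rules. As in the real builders,
// the alt is pushed before the key, so entries.back() normally holds the
// previously added key when the next call arrives.
struct ToyDedupBuilder {
  std::vector<std::string> entries;
  std::optional<std::string> prev_alt_;  // alt from the previous call

  void AddKeyAndAlt(const std::string& key, const std::string& alt) {
    std::optional<std::string> prev_key;
    std::optional<std::string> old_prev_alt = prev_alt_;
    if (!entries.empty()) {
      prev_key = entries.back();
    }
    // Suppress the alt if it repeats the previous alt or collides with this
    // key or the previous key (e.g. a whole key that is its own prefix).
    if (alt != old_prev_alt && alt != key && alt != prev_key) {
      entries.push_back(alt);
    }
    prev_alt_ = alt;  // remembered even when suppressed above
    // The key is checked against the *previous* call's alt, which covers a
    // whole key equal to a prefix already added for an earlier key.
    if (key != prev_key && key != old_prev_alt) {
      entries.push_back(key);
    }
  }
};

int main() {
  ToyDedupBuilder b;
  b.AddKeyAndAlt("app", "app");    // key == alt: adds "app" once
  b.AddKeyAndAlt("apple", "app");  // adds "apple"; alt "app" de-duped
  b.AddKeyAndAlt("apply", "app");  // adds "apply"; alt still de-duped
  for (const auto& e : b.entries) {
    std::cout << e << "\n";  // app, apple, apply
  }
  return 0;
}
```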
virtual size_t EstimateEntriesAdded() = 0; // Generate the filter using the keys that are added diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 212666a3b4..af741787a3 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -20,13 +20,8 @@ namespace ROCKSDB_NAMESPACE { FullFilterBlockBuilder::FullFilterBlockBuilder( const SliceTransform* _prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder) - : need_last_prefix_(whole_key_filtering && _prefix_extractor != nullptr), - prefix_extractor_(_prefix_extractor), - whole_key_filtering_(whole_key_filtering), - last_whole_key_recorded_(false), - last_prefix_recorded_(false), - last_key_in_domain_(false), - any_added_(false) { + : prefix_extractor_(_prefix_extractor), + whole_key_filtering_(whole_key_filtering) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } @@ -35,96 +30,31 @@ size_t FullFilterBlockBuilder::EstimateEntriesAdded() { return filter_bits_builder_->EstimateEntriesAdded(); } +void FullFilterBlockBuilder::AddWithPrevKey( + const Slice& key_without_ts, const Slice& /*prev_key_without_ts*/) { + FullFilterBlockBuilder::Add(key_without_ts); +} + void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { - const bool add_prefix = - prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); - - if (need_last_prefix_ && !last_prefix_recorded_ && last_key_in_domain_) { - // We can reach here when a new filter partition starts in partitioned - // filter. The last prefix in the previous partition should be added if - // necessary regardless of key_without_ts, to support prefix SeekForPrev. - AddKey(last_prefix_str_); - last_prefix_recorded_ = true; - } - - if (whole_key_filtering_) { - if (!add_prefix) { - AddKey(key_without_ts); + if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { + Slice prefix = prefix_extractor_->Transform(key_without_ts); + if (whole_key_filtering_) { + filter_bits_builder_->AddKeyAndAlt(key_without_ts, prefix); } else { - // if both whole_key and prefix are added to bloom then we will have whole - // key_without_ts and prefix addition being interleaved and thus cannot - // rely on the bits builder to properly detect the duplicates by comparing - // with the last item. - Slice last_whole_key = Slice(last_whole_key_str_); - if (!last_whole_key_recorded_ || - last_whole_key.compare(key_without_ts) != 0) { - AddKey(key_without_ts); - last_whole_key_recorded_ = true; - last_whole_key_str_.assign(key_without_ts.data(), - key_without_ts.size()); - } + filter_bits_builder_->AddKey(prefix); } - } - if (add_prefix) { - last_key_in_domain_ = true; - AddPrefix(key_without_ts); - } else { - last_key_in_domain_ = false; + } else if (whole_key_filtering_) { + filter_bits_builder_->AddKey(key_without_ts); } } -// Add key to filter if needed -inline void FullFilterBlockBuilder::AddKey(const Slice& key) { - filter_bits_builder_->AddKey(key); - any_added_ = true; -} - -// Add prefix to filter if needed -void FullFilterBlockBuilder::AddPrefix(const Slice& key) { - assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); - Slice prefix = prefix_extractor_->Transform(key); - if (need_last_prefix_) { - // WART/FIXME: Because last_prefix_str_ is needed above to make - // SeekForPrev work with partitioned + prefix filters, we are currently - // use this inefficient code in that case (in addition to prefix+whole - // key). 
Hopefully this can be optimized with some refactoring up the call - // chain to BlockBasedTableBuilder. Even in PartitionedFilterBlockBuilder, - // we don't currently have access to the previous key/prefix by the time we - // know we are starting a new partition. - - // if both whole_key and prefix are added to bloom then we will have whole - // key and prefix addition being interleaved and thus cannot rely on the - // bits builder to properly detect the duplicates by comparing with the last - // item. - Slice last_prefix = Slice(last_prefix_str_); - if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { - AddKey(prefix); - last_prefix_recorded_ = true; - last_prefix_str_.assign(prefix.data(), prefix.size()); - } - } else { - AddKey(prefix); - } -} - -void FullFilterBlockBuilder::Reset() { - last_whole_key_recorded_ = false; - last_prefix_recorded_ = false; -} - -Slice FullFilterBlockBuilder::Finish( - const BlockHandle& /*tmp*/, Status* status, - std::unique_ptr* filter_data) { - Reset(); - // In this impl we ignore BlockHandle - *status = Status::OK(); - if (any_added_) { - any_added_ = false; - Slice filter_content = filter_bits_builder_->Finish( - filter_data ? filter_data : &filter_data_, status); - return filter_content; - } - return Slice(); +Status FullFilterBlockBuilder::Finish( + const BlockHandle& /*last_partition_block_handle*/, Slice* filter, + std::unique_ptr* filter_owner) { + Status s = Status::OK(); + *filter = filter_bits_builder_->Finish( + filter_owner ? filter_owner : &filter_data_, &s); + return s; } FullFilterBlockReader::FullFilterBlockReader( diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 53d59c6da3..784f0eb881 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -50,10 +50,15 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { ~FullFilterBlockBuilder() {} void Add(const Slice& key_without_ts) override; - bool IsEmpty() const override { return !any_added_; } + void AddWithPrevKey(const Slice& key_without_ts, + const Slice& prev_key_without_ts) override; + + bool IsEmpty() const override { + return filter_bits_builder_->EstimateEntriesAdded() == 0; + } size_t EstimateEntriesAdded() override; - Slice Finish(const BlockHandle& tmp, Status* status, - std::unique_ptr* filter_data = nullptr) override; + Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter, + std::unique_ptr* filter_owner = nullptr) override; using FilterBlockBuilder::Finish; void ResetFilterBitsBuilder() override { filter_bits_builder_.reset(); } @@ -63,30 +68,17 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { } protected: - virtual void AddKey(const Slice& key); + const SliceTransform* prefix_extractor() const { return prefix_extractor_; } + bool whole_key_filtering() const { return whole_key_filtering_; } + std::unique_ptr filter_bits_builder_; - virtual void Reset(); - void AddPrefix(const Slice& key); - const SliceTransform* prefix_extractor() { return prefix_extractor_; } - const std::string& last_prefix_str() const { return last_prefix_str_; } - bool need_last_prefix_; private: // important: all of these might point to invalid addresses // at the time of destruction of this filter block. destructor // should NOT dereference them. 
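The replacement Add() path relies entirely on the public SliceTransform API to derive the (key, alt) pair handed to the bits builder. A standalone illustration, assuming a fixed 3-byte prefix extractor:

```cpp
#include <cassert>
#include <memory>

#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"

int main() {
  std::unique_ptr<const rocksdb::SliceTransform> pe(
      rocksdb::NewFixedPrefixTransform(3));

  rocksdb::Slice key("apple");
  assert(pe->InDomain(key));  // at least 3 bytes long
  rocksdb::Slice prefix = pe->Transform(key);
  assert(prefix == rocksdb::Slice("app"));
  // With whole_key_filtering: filter_bits_builder_->AddKeyAndAlt(key, prefix)
  // Prefix-only filtering:    filter_bits_builder_->AddKey(prefix)

  rocksdb::Slice tiny("ab");    // shorter than the prefix length
  assert(!pe->InDomain(tiny));  // only the whole-key path applies, if enabled
  return 0;
}
```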
- const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - bool last_whole_key_recorded_; - std::string last_whole_key_str_; - bool last_prefix_recorded_; - std::string last_prefix_str_; - // Whether prefix_extractor_->InDomain(last_whole_key_) is true. - // Used in partitioned filters so that the last prefix from the previous - // filter partition will be added to the current partition if - // last_key_in_domain_ is true, regardless of the current key. - bool last_key_in_domain_; - bool any_added_; + const SliceTransform* const prefix_extractor_; + const bool whole_key_filtering_; std::unique_ptr filter_data_; }; diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 083d287763..f90492d858 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -29,6 +29,10 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { void AddKey(const Slice& key) override { hash_entries_.push_back(Hash(key.data(), key.size(), 1)); } + void AddKeyAndAlt(const Slice& key, const Slice& alt) override { + AddKey(key); + AddKey(alt); + } using FilterBitsBuilder::Finish; @@ -104,7 +108,7 @@ class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester, TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - Slice slice = builder.Finish(); + Slice slice = builder.TEST_Finish(); ASSERT_EQ("", EscapeString(slice)); CachableEntry block( @@ -127,7 +131,7 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { builder.Add("box"); builder.Add("box"); builder.Add("hello"); - Slice slice = builder.Finish(); + Slice slice = builder.TEST_Finish(); CachableEntry block( new ParsedFullFilterBlock(table_options_.filter_policy.get(), @@ -174,7 +178,7 @@ class FullFilterBlockTest : public mock::MockBlockBasedTableTester, TEST_F(FullFilterBlockTest, EmptyBuilder) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - Slice slice = builder.Finish(); + Slice slice = builder.TEST_Finish(); ASSERT_EQ("", EscapeString(slice)); CachableEntry block( @@ -203,6 +207,11 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { b_->AddKey(key); uniq_.insert(key.ToString()); } + void AddKeyAndAlt(const Slice& key, const Slice& alt) override { + b_->AddKeyAndAlt(key, alt); + uniq_.insert(key.ToString()); + uniq_.insert(alt.ToString()); + } using FilterBitsBuilder::Finish; @@ -274,8 +283,8 @@ TEST_F(FullFilterBlockTest, SingleChunk) { // "box" only counts once ASSERT_EQ(4, builder.EstimateEntriesAdded()); ASSERT_FALSE(builder.IsEmpty()); - Status s; - Slice slice = builder.Finish(BlockHandle(), &s); + Slice slice; + Status s = builder.Finish(BlockHandle(), &slice); ASSERT_OK(s); CachableEntry block( diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index c2c5405cf1..ce0b691a47 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -27,9 +27,13 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size, size_t ts_sz, - const bool persist_user_defined_timestamps) + const bool persist_user_defined_timestamps, + bool decouple_from_index_partitions) : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, filter_bits_builder), + p_index_builder_(p_index_builder), + ts_sz_(ts_sz), + 
decouple_from_index_partitions_(decouple_from_index_partitions), index_on_filter_block_builder_( index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding, @@ -41,12 +45,7 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( use_value_delta_encoding, BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, 0.75 /* data_block_hash_table_util_ratio */, ts_sz, - persist_user_defined_timestamps, true /* is_user_key */), - p_index_builder_(p_index_builder), - keys_added_to_partition_(0), - total_added_in_built_(0) { - // See FullFilterBlockBuilder::AddPrefix - need_last_prefix_ = prefix_extractor() != nullptr; + persist_user_defined_timestamps, true /* is_user_key */) { // Compute keys_per_partition_ keys_per_partition_ = static_cast( filter_bits_builder_->ApproximateNumEntries(partition_size)); @@ -70,36 +69,60 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( } } } + if (keys_per_partition_ > 1 && prefix_extractor()) { + // Correct for adding next prefix in CutAFilterBlock *after* checking + // against this threshold + keys_per_partition_--; + } } PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() { partitioned_filters_construction_status_.PermitUncheckedError(); } -void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( - const Slice* next_key) { - // Use == to send the request only once - if (keys_added_to_partition_ == keys_per_partition_) { - // Currently only index builder is in charge of cutting a partition. We keep - // requesting until it is granted. - p_index_builder_->RequestPartitionCut(); - } - if (!p_index_builder_->ShouldCutFilterBlock()) { - return; +bool PartitionedFilterBlockBuilder::DecideCutAFilterBlock() { + size_t added = filter_bits_builder_->EstimateEntriesAdded(); + if (decouple_from_index_partitions_) { + // NOTE: Can't just use ==, because estimated might be incremented by more + // than one. + return added >= keys_per_partition_; + } else { + // NOTE: Can't just use ==, because estimated might be incremented by more + // than one. + if (added >= keys_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We + // keep requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + return p_index_builder_->ShouldCutFilterBlock(); } +} - // Add the prefix of the next key before finishing the partition without - // updating last_prefix_str_. This hack fixes a bug with format_verison=3 - // where seeking for the prefix would lead us to the previous partition. - const bool maybe_add_prefix = - next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (maybe_add_prefix) { - const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); - if (next_key_prefix.compare(last_prefix_str()) != 0) { - AddKey(next_key_prefix); +void PartitionedFilterBlockBuilder::CutAFilterBlock(const Slice* next_key, + const Slice* next_prefix, + const Slice& prev_key) { + // When there is a next partition, add the prefix of the first key in the + // next partition before closing this one out. This is needed to support + // prefix Seek, because there could exist a key k where + // * last_key < k < next_key + // * prefix(last_key) != prefix(k) + // * prefix(k) == prefix(next_key) + // * seeking to k lands in this partition, not the next + // in which case the iterator needs to find next_key despite starting in + // the partition before it. (This fixes a bug in the original implementation + // of format_version=3.) 
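A concrete instance of the four conditions listed above, checked under assumed inputs (bytewise comparator, 2-byte fixed prefixes, and an index separator that happens to sort at or above k):

```cpp
#include <cassert>
#include <string>

// Why the prefix of the next partition's first key is added to the partition
// being closed: a seek target k can share a prefix with next_key while the
// index routes Seek(k) to the earlier partition.
int main() {
  std::string last_key = "ab9";    // last key in the partition being cut
  std::string next_key = "ac1";    // first key of the next partition
  std::string separator = "ac05";  // assumed separator in [last_key, next_key)
  std::string k = "ac0";           // a possible Seek target
  auto prefix2 = [](const std::string& s) { return s.substr(0, 2); };

  assert(last_key < k && k < next_key);
  assert(prefix2(last_key) != prefix2(k));  // "ab" vs "ac"
  assert(prefix2(k) == prefix2(next_key));  // both "ac"
  assert(last_key <= separator && separator < next_key);
  assert(k <= separator);  // index sends Seek(k) to the earlier partition
  // So the earlier partition's filter must also contain prefix "ac", or a
  // prefix seek for k would be filtered out even though "ac1" exists.
  return 0;
}
```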
+ if (next_prefix) { + if (whole_key_filtering()) { + // NOTE: At the end of building filter bits, we need a special case for + // treating prefix as an "alt" entry. See AddKeyAndAlt() comment. This is + // a reasonable hack for that. + filter_bits_builder_->AddKeyAndAlt(*next_prefix, *next_prefix); + } else { + filter_bits_builder_->AddKey(*next_prefix); } } + // Cut the partition total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); std::unique_ptr filter_data; Status filter_construction_status = Status::OK(); @@ -108,34 +131,103 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( if (filter_construction_status.ok()) { filter_construction_status = filter_bits_builder_->MaybePostVerify(filter); } - filters.push_back( - {p_index_builder_->GetPartitionKey(), std::move(filter_data), filter}); - if (!filter_construction_status.ok() && - partitioned_filters_construction_status_.ok()) { - partitioned_filters_construction_status_ = filter_construction_status; + std::string ikey; + if (decouple_from_index_partitions_) { + if (ts_sz_ > 0) { + AppendKeyWithMinTimestamp(&ikey, prev_key, ts_sz_); + } else { + ikey = prev_key.ToString(); + } + AppendInternalKeyFooter(&ikey, /*seqno*/ 0, ValueType::kTypeDeletion); + } else { + ikey = p_index_builder_->GetPartitionKey(); + } + filters_.push_back({std::move(ikey), std::move(filter_data), filter}); + partitioned_filters_construction_status_.UpdateIfOk( + filter_construction_status); + + // If we are building another filter partition, the last prefix in the + // previous partition should be added to support prefix SeekForPrev. + // (Analogous to above fix for prefix Seek.) + if (next_key && prefix_extractor() && + prefix_extractor()->InDomain(prev_key)) { + // NOTE: At the beginning of building filter bits, we don't need a special + // case for treating prefix as an "alt" entry. + // See DBBloomFilterTest.FilterBitsBuilderDedup + filter_bits_builder_->AddKey(prefix_extractor()->Transform(prev_key)); } - keys_added_to_partition_ = 0; - Reset(); } -void PartitionedFilterBlockBuilder::Add(const Slice& key) { - MaybeCutAFilterBlock(&key); - FullFilterBlockBuilder::Add(key); +void PartitionedFilterBlockBuilder::Add(const Slice& key_without_ts) { + assert(!DEBUG_add_with_prev_key_called_); + AddImpl(key_without_ts, prev_key_without_ts_); + prev_key_without_ts_.assign(key_without_ts.data(), key_without_ts.size()); } -void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { - FullFilterBlockBuilder::AddKey(key); - keys_added_to_partition_++; +void PartitionedFilterBlockBuilder::AddWithPrevKey( + const Slice& key_without_ts, const Slice& prev_key_without_ts) { +#ifndef NDEBUG + if (!DEBUG_add_with_prev_key_called_) { + assert(prev_key_without_ts.compare(prev_key_without_ts_) == 0); + DEBUG_add_with_prev_key_called_ = true; + } else { + assert(prev_key_without_ts.compare(DEBUG_prev_key_without_ts_) == 0); + } + DEBUG_prev_key_without_ts_.assign(key_without_ts.data(), + key_without_ts.size()); +#endif + AddImpl(key_without_ts, prev_key_without_ts); +} + +void PartitionedFilterBlockBuilder::AddImpl(const Slice& key_without_ts, + const Slice& prev_key_without_ts) { + // When filter partitioning is coupled to index partitioning, we need to + // check for cutting a block even if we aren't adding anything this time. 
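The decoupled cut implemented in the code that follows is driven by one new table option. A usage sketch of enabling it alongside the usual partitioned-filter settings (DB path and prefix length are illustrative):

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  bbto.partition_filters = true;  // requires the two-level index
  bbto.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  bbto.metadata_block_size = 4096;
  // New, opt-in: filter partitions are cut by their own size target instead
  // of wherever the index partitioning happens to cut.
  bbto.decouple_partitioned_filters = true;

  rocksdb::Options options;
  options.create_if_missing = true;
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/decoupled_filters_demo", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```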
+ bool cut = DecideCutAFilterBlock(); + if (prefix_extractor() && prefix_extractor()->InDomain(key_without_ts)) { + Slice prefix = prefix_extractor()->Transform(key_without_ts); + if (cut) { + CutAFilterBlock(&key_without_ts, &prefix, prev_key_without_ts); + } + if (whole_key_filtering()) { + filter_bits_builder_->AddKeyAndAlt(key_without_ts, prefix); + } else { + filter_bits_builder_->AddKey(prefix); + } + } else { + if (cut) { + CutAFilterBlock(&key_without_ts, nullptr /*no prefix*/, + prev_key_without_ts); + } + if (whole_key_filtering()) { + filter_bits_builder_->AddKey(key_without_ts); + } + } } size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); } -Slice PartitionedFilterBlockBuilder::Finish( - const BlockHandle& last_partition_block_handle, Status* status, - std::unique_ptr* filter_data) { - if (finishing_filters == true) { +void PartitionedFilterBlockBuilder::PrevKeyBeforeFinish( + const Slice& prev_key_without_ts) { + assert(prev_key_without_ts.compare(DEBUG_add_with_prev_key_called_ + ? DEBUG_prev_key_without_ts_ + : prev_key_without_ts_) == 0); + if (filter_bits_builder_->EstimateEntriesAdded() > 0) { + CutAFilterBlock(nullptr /*no next key*/, nullptr /*no next prefix*/, + prev_key_without_ts); + } +} + +Status PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Slice* filter, + std::unique_ptr* filter_owner) { + if (finishing_front_filter_) { + assert(!filters_.empty()); + auto& e = filters_.front(); + + assert(last_partition_block_handle != BlockHandle{}); // Record the handle of the last written filter block in the index std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); @@ -145,54 +237,59 @@ Slice PartitionedFilterBlockBuilder::Finish( last_partition_block_handle.size() - last_encoded_handle_.size()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding, + + index_on_filter_block_builder_.Add(e.ikey, handle_encoding, &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_filter_entry_key), handle_encoding, + ExtractUserKey(e.ikey), handle_encoding, &handle_delta_encoding_slice); } + + filters_.pop_front(); } else { - MaybeCutAFilterBlock(nullptr); + assert(last_partition_block_handle == BlockHandle{}); + if (filter_bits_builder_->EstimateEntriesAdded() > 0) { + // PrevKeyBeforeFinish was not called + assert(!DEBUG_add_with_prev_key_called_); + CutAFilterBlock(nullptr, nullptr, prev_key_without_ts_); + } + // Nothing uncommitted + assert(filter_bits_builder_->EstimateEntriesAdded() == 0); } - if (!partitioned_filters_construction_status_.ok()) { - *status = partitioned_filters_construction_status_; - return Slice(); - } + Status s = partitioned_filters_construction_status_; + assert(!s.IsIncomplete()); - // If there is no filter partition left, then return the index on filter - // partitions - if (UNLIKELY(filters.empty())) { - *status = Status::OK(); - last_filter_data.reset(); - if (finishing_filters) { - // Simplest to just add them all at the end - total_added_in_built_ = 0; - if (p_index_builder_->seperator_is_key_plus_seq()) { - return index_on_filter_block_builder_.Finish(); + if (s.ok()) { + // If there is no filter partition left, then return the index on filter + // 
partitions + if (UNLIKELY(filters_.empty())) { + if (!index_on_filter_block_builder_.empty()) { + // Simplest to just add them all at the end + if (p_index_builder_->seperator_is_key_plus_seq()) { + *filter = index_on_filter_block_builder_.Finish(); + } else { + *filter = index_on_filter_block_builder_without_seq_.Finish(); + } } else { - return index_on_filter_block_builder_without_seq_.Finish(); + // This is the rare case where no key was added to the filter + *filter = Slice{}; } } else { - // This is the rare case where no key was added to the filter - return Slice(); - } - } else { - // Return the next filter partition in line and set Incomplete() status to - // indicate we expect more calls to Finish - *status = Status::Incomplete(); - finishing_filters = true; + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + s = Status::Incomplete(); + finishing_front_filter_ = true; - last_filter_entry_key = filters.front().key; - Slice filter = filters.front().filter; - last_filter_data = std::move(filters.front().filter_data); - if (filter_data != nullptr) { - *filter_data = std::move(last_filter_data); + auto& e = filters_.front(); + if (filter_owner != nullptr) { + *filter_owner = std::move(e.filter_owner); + } + *filter = e.filter; } - filters.pop_front(); - return filter; } + return s; } PartitionedFilterBlockReader::PartitionedFilterBlockReader( diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 24241cc779..8faed24a92 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -32,22 +32,30 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size, size_t ts_sz, - const bool persist_user_defined_timestamps); + const bool persist_user_defined_timestamps, + bool decouple_from_index_partitions); virtual ~PartitionedFilterBlockBuilder(); - void AddKey(const Slice& key) override; - void Add(const Slice& key) override; + void Add(const Slice& key_without_ts) override; + void AddWithPrevKey(const Slice& key_without_ts, + const Slice& prev_key_without_ts) override; + bool IsEmpty() const override { + return filter_bits_builder_->EstimateEntriesAdded() == 0 && + filters_.empty(); + } + size_t EstimateEntriesAdded() override; - Slice Finish(const BlockHandle& last_partition_block_handle, Status* status, - std::unique_ptr* filter_data = nullptr) override; + void PrevKeyBeforeFinish(const Slice& prev_key_without_ts) override; + Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter, + std::unique_ptr* filter_owner = nullptr) override; void ResetFilterBitsBuilder() override { - // Previously constructed partitioned filters by - // this to-be-reset FiterBitsBuilder can also be - // cleared - filters.clear(); + filters_.clear(); + total_added_in_built_ = 0; + index_on_filter_block_builder_.Reset(); + index_on_filter_block_builder_without_seq_.Reset(); FullFilterBlockBuilder::ResetFilterBitsBuilder(); } @@ -59,44 +67,65 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { return Status::OK(); } - private: + private: // fns + // Whether to cut a filter block before the next key + bool DecideCutAFilterBlock(); + void CutAFilterBlock(const Slice* next_key, const Slice* next_prefix, + const Slice& prev_key); + + void AddImpl(const Slice& key_without_ts, const 
Slice& prev_key_without_ts); + + private: // data + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potential optimizations in the future. If such + // optimizations do not materialize, we can use a different number of partitions + // and eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + const size_t ts_sz_; + const bool decouple_from_index_partitions_; + // Filter data - BlockBuilder index_on_filter_block_builder_; // top-level index builder - BlockBuilder - index_on_filter_block_builder_without_seq_; // same for user keys struct FilterEntry { - std::string key; - std::unique_ptr filter_data; + std::string ikey; // internal key or separator *after* this filter + std::unique_ptr<const char[]> filter_owner; Slice filter; }; - std::deque filters; // list of partitioned filters and keys used - // in building the index + std::deque<FilterEntry> filters_; // list of partitioned filters and keys + // used in building the index + // The desired number of keys per partition + uint32_t keys_per_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_ = 0; // Set to the first non-okay status if any of the filter // partitions experiences construction error. // If partitioned_filters_construction_status_ is non-okay, // then the whole partitioned filters should not be used. Status partitioned_filters_construction_status_; - std::string last_filter_entry_key; - std::unique_ptr last_filter_data; - std::unique_ptr value; - bool finishing_filters = - false; // true if Finish is called once but not complete yet. - // The policy of when cut a filter block and Finish it - void MaybeCutAFilterBlock(const Slice* next_key); - // Currently we keep the same number of partitions for filters and indexes. - // This would allow for some potentioal optimizations in future. If such - // optimizations did not realize we can use different number of partitions and - // eliminate p_index_builder_ - PartitionedIndexBuilder* const p_index_builder_; - // The desired number of keys per partition - uint32_t keys_per_partition_; - // The number of keys added to the last partition so far - uint32_t keys_added_to_partition_; - // According to the bits builders, how many keys/prefixes added - // in all the filters we have fully built - uint64_t total_added_in_built_; + + // For Add without prev key + std::string prev_key_without_ts_; + +#ifndef NDEBUG + // For verifying accurate previous keys are provided by the caller, so that + // release code can be fast + bool DEBUG_add_with_prev_key_called_ = false; + std::string DEBUG_prev_key_without_ts_; +#endif // NDEBUG + + // ===== State for Finish() ===== + + // top-level index builder on internal keys + BlockBuilder index_on_filter_block_builder_; + // same for user keys + BlockBuilder index_on_filter_block_builder_without_seq_; + // For delta-encoding handles BlockHandle last_encoded_handle_; + // True if we are between two calls to Finish(), because we have returned + // the filter at the front of filters_ but haven't yet added it to the // partition index. 
+ bool finishing_front_filter_ = false; }; class PartitionedFilterBlockReader diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 8d2dd62c3b..80cb131a99 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -58,7 +58,7 @@ class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface< - std::tuple> { + std::tuple> { public: Options options_; ImmutableOptions ioptions_; @@ -70,6 +70,7 @@ class PartitionedFilterBlockTest int bits_per_key_; size_t ts_sz_; bool user_defined_timestamps_persisted_; + bool decouple_partitioned_filters; PartitionedFilterBlockTest() : bits_per_key_(10) { auto udt_test_mode = std::get<1>(GetParam()); @@ -85,6 +86,8 @@ class PartitionedFilterBlockTest NewBloomFilterPolicy(bits_per_key_, false)); table_options_.format_version = std::get<0>(GetParam()); table_options_.index_block_restart_interval = 3; + table_options_.decouple_partitioned_filters = decouple_partitioned_filters = + std::get<2>(GetParam()); } ~PartitionedFilterBlockTest() override = default; @@ -160,18 +163,24 @@ class PartitionedFilterBlockTest FilterBuildingContext(table_options_)), table_options_.index_block_restart_interval, !kValueDeltaEncoded, p_index_builder, partition_size, ts_sz_, - user_defined_timestamps_persisted_); + user_defined_timestamps_persisted_, decouple_partitioned_filters); } PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, + bool expect_empty = false) { BlockHandle bh; Status status; Slice slice; std::unique_ptr filter_data; do { - slice = builder->Finish(bh, &status, &filter_data); + status = builder->Finish(bh, &slice, &filter_data); bh = Write(slice); + if (expect_empty) { + // Ensure most efficient "empty" filter is used + EXPECT_OK(status); + EXPECT_EQ(0, slice.size()); + } } while (status.IsIncomplete()); constexpr bool skip_filters = false; @@ -196,7 +205,7 @@ class PartitionedFilterBlockTest void VerifyReader(PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, bool empty = false) { std::unique_ptr reader( - NewReader(builder, pib)); + NewReader(builder, pib, empty)); // Querying added keys std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); for (const auto& key : keys) { @@ -312,10 +321,9 @@ class PartitionedFilterBlockTest void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, const std::string& next_user_key) { // Assuming a block is cut, add an entry to the index - std::string key = - std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); - std::string next_key = std::string( - *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep()); + std::string key = *InternalKey(user_key, 0, ValueType::kTypeValue).rep(); + std::string next_key = + *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep(); BlockHandle dont_care_block_handle(1, 1); Slice slice = Slice(next_key.data(), next_key.size()); std::string scratch; @@ -338,10 +346,10 @@ class PartitionedFilterBlockTest // Format versions potentially intersting to partitioning INSTANTIATE_TEST_CASE_P( FormatVersions, PartitionedFilterBlockTest, - testing::Combine(testing::ValuesIn(std::set{ - 2, 3, 4, test::kDefaultFormatVersion, - kLatestFormatVersion}), - 
testing::ValuesIn(test::GetUDTTestModes()))); + testing::Combine( + testing::ValuesIn(std::set{ + 2, 3, 4, 5, test::kDefaultFormatVersion, kLatestFormatVersion}), + testing::ValuesIn(test::GetUDTTestModes()), testing::Bool())); TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 1316d7302d..0637440bdc 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -84,7 +84,9 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (io_status_.ok()) { got_from_prefetch_buffer_ = true; used_buf_ = const_cast(slice_.data()); - } else if (!(io_status_.IsCorruption() && retry_corrupt_read_)) { + } else if (io_status_.IsCorruption()) { + // Returning true apparently indicates we either got some data from + // the prefetch buffer, or we tried and encountered an error. return true; } } @@ -334,9 +336,15 @@ void BlockFetcher::ReadBlock(bool retry) { ProcessTrailerIfPresent(); } + if (retry) { + RecordTick(ioptions_.stats, FILE_READ_CORRUPTION_RETRY_COUNT); + } if (io_status_.ok()) { InsertCompressedBlockToPersistentCacheIfNeeded(); fs_buf_ = std::move(read_req.fs_scratch); + if (retry) { + RecordTick(ioptions_.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); + } } else { ReleaseFileSystemProvidedBuffer(&read_req); direct_io_buf_.reset(); @@ -355,7 +363,11 @@ IOStatus BlockFetcher::ReadBlockContents() { return IOStatus::OK(); } if (TryGetFromPrefetchBuffer()) { + if (io_status_.IsCorruption() && retry_corrupt_read_) { + ReadBlock(/*retry=*/true); + } if (!io_status_.ok()) { + assert(!fs_buf_); return io_status_; } } else if (!TryGetSerializedBlockFromPersistentCache()) { diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 55f5935b11..26467a2805 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -59,12 +59,11 @@ PropertyBlockBuilder::PropertyBlockBuilder() void PropertyBlockBuilder::Add(const std::string& name, const std::string& val) { + assert(props_.find(name) == props_.end()); props_.insert({name, val}); } void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { - assert(props_.find(name) == props_.end()); - std::string dst; PutVarint64(&dst, val); @@ -164,11 +163,19 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kSequenceNumberTimeMapping, props.seqno_to_time_mapping); } + if (props.key_largest_seqno != UINT64_MAX) { + Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno); + } } Slice PropertyBlockBuilder::Finish() { for (const auto& prop : props_) { + assert(last_prop_added_to_block_.empty() || + comparator_->Compare(prop.first, last_prop_added_to_block_) > 0); properties_block_->Add(prop.first, prop.second); +#ifndef NDEBUG + last_prop_added_to_block_ = prop.first; +#endif /* !NDEBUG */ } return properties_block_->Finish(); @@ -218,12 +225,21 @@ bool NotifyCollectTableCollectorsOnFinish( UserCollectedProperties& readable_properties) { bool all_succeeded = true; for (auto& collector : collectors) { - Status s = collector->Finish(&user_collected_properties); + UserCollectedProperties user_properties; + Status s = collector->Finish(&user_properties); if (s.ok()) { for (const auto& prop : collector->GetReadableProperties()) { readable_properties.insert(prop); } - builder->Add(user_collected_properties); +#ifndef NDEBUG + // Check different user properties collectors are not adding properties of + // the same name. 
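Context for the assertion loop that follows: every collector's Finish() output is merged into one property map, so collectors must emit distinct, ideally namespaced, property names. A hedged sketch of a conforming user collector (class and property names are invented):

```cpp
#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/types.h"

// Minimal user collector; the property name is namespaced so it cannot
// collide with another collector's output, which the new check asserts.
class CountingCollector : public rocksdb::TablePropertiesCollector {
 public:
  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType /*type*/,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    ++num_entries_;
    return rocksdb::Status::OK();
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    props->emplace("example.counting.num-entries",
                   std::to_string(num_entries_));
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"example.counting.num-entries", std::to_string(num_entries_)}};
  }
  const char* Name() const override { return "CountingCollector"; }

 private:
  uint64_t num_entries_ = 0;
};
```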
+ for (const auto& pair : user_properties) { + assert(user_collected_properties.find(pair.first) == + user_collected_properties.end()); + } +#endif /* !NDEBUG */ + user_collected_properties.merge(user_properties); } else { LogPropertiesCollectionError(info_log, "Finish" /* method */, collector->Name()); @@ -232,6 +248,7 @@ bool NotifyCollectTableCollectorsOnFinish( } } } + builder->Add(user_collected_properties); return all_succeeded; } @@ -322,6 +339,8 @@ Status ReadTablePropertiesHelper( &new_table_properties->tail_start_offset}, {TablePropertiesNames::kUserDefinedTimestampsPersisted, &new_table_properties->user_defined_timestamps_persisted}, + {TablePropertiesNames::kKeyLargestSeqno, + &new_table_properties->key_largest_seqno}, }; std::string last_key; diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 3d1edb5018..a6aacdf503 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -73,6 +73,10 @@ class PropertyBlockBuilder { private: std::unique_ptr<BlockBuilder> properties_block_; stl_wrappers::KVMap props_; +#ifndef NDEBUG + const Comparator* comparator_ = BytewiseComparator(); + Slice last_prop_added_to_block_; +#endif /* !NDEBUG */ }; // Were we encounter any error occurs during user-defined statistics collection, diff --git a/table/table_properties.cc b/table/table_properties.cc index 0a899af37a..037e483f6d 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -113,6 +113,8 @@ std::string TableProperties::ToString(const std::string& prop_delim, user_defined_timestamps_persisted ? std::string("true") : std::string("false"), prop_delim, kv_delim); + AppendProperty(result, "largest sequence number in file", key_largest_seqno, + prop_delim, kv_delim); AppendProperty( result, "merge operator name", @@ -311,6 +313,8 @@ const std::string TablePropertiesNames::kTailStartOffset = "rocksdb.tail.start.offset"; const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted = "rocksdb.user.defined.timestamps.persisted"; +const std::string TablePropertiesNames::kKeyLargestSeqno = + "rocksdb.key.largest.seqno"; #ifndef NDEBUG // WARNING: TEST_SetRandomTableProperties assumes the following layout of diff --git a/table/table_test.cc b/table/table_test.cc index 1ffd53c8fe..9eee267614 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -4726,7 +4726,7 @@ static void DoCompressionTest(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3550)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3550)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7075)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7100)); c.ResetTableReader(); } diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 89ca0971cb..549abe39d6 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb") +declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a 
db_forward_no_options_refs=() # N/A at the moment diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index be150dadc4..d51dbf30ad 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -641,6 +641,11 @@ DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false, DEFINE_bool(use_cache_memkind_kmem_allocator, false, "Use memkind kmem allocator for block/blob cache."); +DEFINE_bool( + decouple_partitioned_filters, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().decouple_partitioned_filters, + "Decouple filter partitioning from index partitioning."); + DEFINE_bool(partition_index_and_filters, false, "Partition index and filter blocks."); @@ -1275,6 +1280,9 @@ DEFINE_bool( auto_readahead_size, false, "When set true, RocksDB does auto tuning of readahead size during Scans"); +DEFINE_bool(paranoid_memory_checks, false, + "Sets CF option paranoid_memory_checks"); + static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { assert(ctype); @@ -4362,6 +4370,8 @@ class Benchmark { } else { block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; } + block_based_options.decouple_partitioned_filters = + FLAGS_decouple_partitioned_filters; if (FLAGS_partition_index_and_filters || FLAGS_partition_index) { if (FLAGS_index_with_first_key) { fprintf(stderr, @@ -4732,6 +4742,7 @@ class Benchmark { FLAGS_memtable_protection_bytes_per_key; options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; + options.paranoid_memory_checks = FLAGS_paranoid_memory_checks; } void InitializeOptionsGeneral(Options* opts) { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index d611257252..c95851310d 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -75,6 +75,7 @@ default_params = { "compaction_pri": random.randint(0, 4), "key_may_exist_one_in": lambda: random.choice([100, 100000]), "data_block_index_type": lambda: random.choice([0, 1]), + "decouple_partitioned_filters": lambda: random.choice([0, 1, 1]), "delpercent": 4, "delrangepercent": 1, "destroy_db_initially": 0, @@ -130,7 +131,8 @@ default_params = { "prefixpercent": 5, "progress_reports": 0, "readpercent": 45, - "recycle_log_file_num": lambda: random.randint(0, 1), + # See disabled DBWALTest.RecycleMultipleWalsCrash + "recycle_log_file_num": 0, "snapshot_hold_ops": 100000, "sqfc_name": lambda: random.choice(["foo", "bar"]), # 0 = disable writing SstQueryFilters @@ -338,6 +340,7 @@ default_params = { "check_multiget_entity_consistency": lambda: random.choice([0, 0, 0, 1]), "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]), "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]), + "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]), } _TEST_DIR_ENV_VAR = "TEST_TMPDIR" # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR diff --git a/unreleased_history/behavior_changes/ingest-live-file-with-move.md b/unreleased_history/behavior_changes/ingest-live-file-with-move.md new file mode 100644 index 0000000000..303c5754e2 --- /dev/null +++ b/unreleased_history/behavior_changes/ingest-live-file-with-move.md @@ -0,0 +1 @@ +* Support ingesting db generated files using hard link, i.e. 
IngestExternalFileOptions::move_files and IngestExternalFileOptions::allow_db_generated_files. \ No newline at end of file diff --git a/unreleased_history/bug_fixes/duplicate_wal_entries.md b/unreleased_history/bug_fixes/duplicate_wal_entries.md deleted file mode 100644 index 993520efc1..0000000000 --- a/unreleased_history/bug_fixes/duplicate_wal_entries.md +++ /dev/null @@ -1 +0,0 @@ -Fixed a bug where successful write right after error recovery for last failed write finishes causes duplicate WAL entries diff --git a/unreleased_history/bug_fixes/invalid_manifest_number.md b/unreleased_history/bug_fixes/invalid_manifest_number.md deleted file mode 100644 index d98bd6ab18..0000000000 --- a/unreleased_history/bug_fixes/invalid_manifest_number.md +++ /dev/null @@ -1 +0,0 @@ -*Fix a bug where file snapshot functions like backup, checkpoint may attempt to copy a non-existing manifest file. #12882 \ No newline at end of file diff --git a/unreleased_history/bug_fixes/prevent_duplicate_txn_name.md b/unreleased_history/bug_fixes/prevent_duplicate_txn_name.md deleted file mode 100644 index 152869d7d2..0000000000 --- a/unreleased_history/bug_fixes/prevent_duplicate_txn_name.md +++ /dev/null @@ -1 +0,0 @@ -Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior. diff --git a/unreleased_history/new_features/tp_largest_seqno.md b/unreleased_history/new_features/tp_largest_seqno.md new file mode 100644 index 0000000000..6776cfcf23 --- /dev/null +++ b/unreleased_history/new_features/tp_largest_seqno.md @@ -0,0 +1 @@ +* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in the file. It is verified to be zero during SST file ingestion. \ No newline at end of file diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 923c8b46a9..9d60082e6b 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -760,7 +760,7 @@ TEST_F(BlobDBTest, SstFileManager) { // run the same test for Get(), MultiGet() and Iterator each. std::shared_ptr<SstFileManager> sst_file_manager( NewSstFileManager(mock_env_.get())); - sst_file_manager->SetDeleteRateBytesPerSecond(1); + sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024); SstFileManagerImpl *sfm = static_cast<SstFileManagerImpl *>(sst_file_manager.get()); @@ -818,7 +818,7 @@ TEST_F(BlobDBTest, SstFileManagerRestart) { // run the same test for Get(), MultiGet() and Iterator each.
std::shared_ptr<SstFileManager> sst_file_manager( NewSstFileManager(mock_env_.get())); - sst_file_manager->SetDeleteRateBytesPerSecond(1); + sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024); SstFileManagerImpl *sfm = static_cast<SstFileManagerImpl *>(sst_file_manager.get()); diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index f29b02f774..3bdc1a9402 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -46,28 +46,41 @@ Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/, return Status::NotSupported(""); } -void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path, - Logger* info_log) { +Status CheckpointImpl::CleanStagingDirectory( + const std::string& full_private_path, Logger* info_log) { std::vector<std::string> subchildren; Status s = db_->GetEnv()->FileExists(full_private_path); if (s.IsNotFound()) { - return; + // Nothing to clean + return Status::OK(); + } else if (!s.ok()) { + return s; } + assert(s.ok()); ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(), s.ToString().c_str()); + s = db_->GetEnv()->GetChildren(full_private_path, &subchildren); if (s.ok()) { for (auto& subchild : subchildren) { + Status del_s; std::string subchild_path = full_private_path + "/" + subchild; - s = db_->GetEnv()->DeleteFile(subchild_path); + del_s = db_->GetEnv()->DeleteFile(subchild_path); ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(), - s.ToString().c_str()); + del_s.ToString().c_str()); + if (!del_s.ok() && s.ok()) { + s = del_s; + } } } - // finally delete the private dir - s = db_->GetEnv()->DeleteDir(full_private_path); - ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(), - s.ToString().c_str()); + + // Then delete the private dir + if (s.ok()) { + s = db_->GetEnv()->DeleteDir(full_private_path); + ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + } + return s; } Status Checkpoint::ExportColumnFamily( @@ -82,14 +95,17 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, uint64_t* sequence_number_ptr) { DBOptions db_options = db_->GetDBOptions(); - Status s = db_->GetEnv()->FileExists(checkpoint_dir); - if (s.ok()) { + Status file_exists_s = db_->GetEnv()->FileExists(checkpoint_dir); + if (file_exists_s.ok()) { return Status::InvalidArgument("Directory exists"); - } else if (!s.IsNotFound()) { - assert(s.IsIOError()); - return s; - } + } else if (!file_exists_s.IsNotFound()) { + assert(file_exists_s.IsIOError()); + return file_exists_s; + } else { + assert(file_exists_s.IsNotFound()); + } + Status s; ROCKS_LOG_INFO( db_options.info_log, "Started the snapshot process -- creating snapshot in directory %s", @@ -101,6 +117,7 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, // directory, but it shouldn't be because we verified above the directory // doesn't exist.
assert(checkpoint_dir.empty()); + s.PermitUncheckedError(); return Status::InvalidArgument("invalid checkpoint directory name"); } @@ -109,7 +126,14 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, ROCKS_LOG_INFO(db_options.info_log, "Snapshot process -- using temporary directory %s", full_private_path.c_str()); - CleanStagingDirectory(full_private_path, db_options.info_log.get()); + + s = CleanStagingDirectory(full_private_path, db_options.info_log.get()); + if (!s.ok()) { + return Status::Aborted( + "Failed to clean the temporary directory " + full_private_path + + " needed before checkpoint creation: " + s.ToString()); + } + // create snapshot directory s = db_->GetEnv()->CreateDir(full_private_path); uint64_t sequence_number = 0; @@ -180,10 +204,15 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64, sequence_number); } else { - // clean all the files we might have created ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s", s.ToString().c_str()); - CleanStagingDirectory(full_private_path, db_options.info_log.get()); + // clean all the files and directory we might have created + Status del_s = + CleanStagingDirectory(full_private_path, db_options.info_log.get()); + ROCKS_LOG_INFO(db_options.info_log, + "Clean files or directory we might have created %s: %s", + full_private_path.c_str(), del_s.ToString().c_str()); + del_s.PermitUncheckedError(); } return s; } diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h index 3cb9a6477f..58ac928bf6 100644 --- a/utilities/checkpoint/checkpoint_impl.h +++ b/utilities/checkpoint/checkpoint_impl.h @@ -44,7 +44,7 @@ class CheckpointImpl : public Checkpoint { bool get_live_table_checksum = false); private: - void CleanStagingDirectory(const std::string& path, Logger* info_log); + Status CleanStagingDirectory(const std::string& path, Logger* info_log); // Export logic customization by providing callbacks for link or copy. Status ExportFilesInMetaData( @@ -61,4 +61,3 @@ class CheckpointImpl : public Checkpoint { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index dc19d02fde..e71c795f65 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -25,6 +25,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/sst_file_manager.h" #include "rocksdb/utilities/transaction_db.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" @@ -1031,6 +1032,100 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) { delete snapshot_db; } +class CheckpointDestroyTest : public CheckpointTest, + public testing::WithParamInterface<bool> {}; + +TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) { + bool slow_deletion = GetParam(); + Options options = CurrentOptions(); + options.num_levels = 2; + options.disable_auto_compactions = true; + Status s; + options.sst_file_manager.reset(NewSstFileManager( + options.env, options.info_log, "", slow_deletion ?
1024 * 1024 : 0, + false /* delete_existing_trash */, &s, 1)); + ASSERT_OK(s); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo", "a")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("bar", "val" + std::to_string(i))); + ASSERT_OK(Flush()); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + ASSERT_EQ(NumTableFilesAtLevel(1), 2); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + + delete checkpoint; + checkpoint = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 2); + + DB* snapshot_db; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result)); + ASSERT_EQ("a", get_result); + ASSERT_OK(snapshot_db->Get(read_opts, "bar", &get_result)); + ASSERT_EQ("val9", get_result); + delete snapshot_db; + + // Make sure original obsolete files for hard linked files are all deleted. + DBImpl* db_impl = static_cast_with_check<DBImpl>(db_); + db_impl->TEST_DeleteObsoleteFiles(); + auto sfm = static_cast_with_check<SstFileManagerImpl>( + options.sst_file_manager.get()); + ASSERT_NE(nullptr, sfm); + sfm->WaitForEmptyTrash(); + // SST files 2-12 for "bar" will be compacted into one file on L1 during the + // compaction after checkpoint is created. SST file 1 on L1: foo, seq: + // 1 (hard link count is 1 after checkpoint destroy) + std::atomic<int> bg_delete_sst{0}; + std::atomic<int> fg_delete_sst{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteFile::cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto file_name = *static_cast<std::string*>(arg); + if (file_name.size() >= 4 && + file_name.compare(file_name.size() - 4, 4, ".sst") == 0) { + fg_delete_sst.fetch_add(1); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile::cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto file_name = *static_cast<std::string*>(arg); + if (file_name.size() >= 10 && + file_name.compare(file_name.size() - 10, 10, ".sst.trash") == 0) { + bg_delete_sst.fetch_add(1); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(DestroyDB(snapshot_name_, options)); + if (slow_deletion) { + ASSERT_EQ(fg_delete_sst, 1); + ASSERT_EQ(bg_delete_sst, 11); + } else { + ASSERT_EQ(fg_delete_sst, 12); + } + + ASSERT_EQ("a", Get("foo")); + ASSERT_EQ("val9", Get("bar")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +INSTANTIATE_TEST_CASE_P(CheckpointDestroyTest, CheckpointDestroyTest, + ::testing::Values(true, false)); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 45f84ef9f3..ab7ea4f62b 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -73,6 +73,8 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) { deadlock_detect_ = txn_options.deadlock_detect; deadlock_detect_depth_ = txn_options.deadlock_detect_depth; write_batch_.SetMaxBytes(txn_options.max_write_batch_size); + write_batch_.GetWriteBatch()->SetTrackTimestampSize(
txn_options.write_batch_track_timestamp_size); skip_concurrency_control_ = txn_options.skip_concurrency_control; lock_timeout_ = txn_options.lock_timeout * 1000; @@ -763,8 +765,16 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() { EncodeFixed64(commit_ts_buf, commit_timestamp_); Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf)); - Status s = - wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { + Status s = wb->UpdateTimestamps( + commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t { + // First search through timestamp info kept inside the WriteBatch + // in case some writes bypassed the Transaction's write APIs. + auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize(); + auto iter = cf_id_to_ts_sz.find(cf); + if (iter != cf_id_to_ts_sz.end()) { + size_t ts_sz = iter->second; + return ts_sz; + } auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf); if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) { return sizeof(kMaxTxnTimestamp); @@ -840,16 +850,24 @@ Status WriteCommittedTxn::CommitInternal() { s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_, commit_ts); if (s.ok()) { - s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { - if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) != - cfs_with_ts_tracked_when_indexing_disabled_.end()) { - return sizeof(kMaxTxnTimestamp); - } - const Comparator* ucmp = - WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); - return ucmp ? ucmp->timestamp_size() - : std::numeric_limits<size_t>::max(); - }); + s = wb->UpdateTimestamps( + commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t { + // First search through timestamp info kept inside the WriteBatch + // in case some writes bypassed the Transaction's write APIs. + auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize(); + auto iter = cf_id_to_ts_sz.find(cf); + if (iter != cf_id_to_ts_sz.end()) { + return iter->second; + } + if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) != + cfs_with_ts_tracked_when_indexing_disabled_.end()) { + return sizeof(kMaxTxnTimestamp); + } + const Comparator* ucmp = + WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); + return ucmp ? ucmp->timestamp_size() + : std::numeric_limits<size_t>::max(); + }); } } diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc index abafb88e80..47b1a0df4d 100644 --- a/utilities/transactions/write_committed_transaction_ts_test.cc +++ b/utilities/transactions/write_committed_transaction_ts_test.cc @@ -130,6 +130,128 @@ void CheckKeyValueTsWithIterator( } } +// This is an incorrect usage of this API; support for it should be removed +// after MyRocks removes this pattern in a refactor. +TEST_P(WriteCommittedTxnWithTsTest, WritesBypassTransactionAPIs) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + ASSERT_OK(ReOpen()); + + const std::string test_cf_name = "test_cf"; + ColumnFamilyOptions cf_options; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + // Write in each transaction a mixture of column families that enable + // timestamps and column families that disable timestamps.
+ + TransactionOptions txn_opts; + txn_opts.write_batch_track_timestamp_size = true; + std::unique_ptr<Transaction> txn0(NewTxn(WriteOptions(), txn_opts)); + assert(txn0); + ASSERT_OK(txn0->Put(handles_[0], "key1", "key1_val")); + // Timestamp size info for writes like this can only be correctly tracked if + // TransactionOptions.write_batch_track_timestamp_size is true. + ASSERT_OK(txn0->GetWriteBatch()->GetWriteBatch()->Put(handles_[1], "foo", + "foo_val")); + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->SetCommitTimestamp(2)); + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(txn0->Commit()); + txn0.reset(); + + // Keys written from transactions that disable + // `write_batch_track_timestamp_size` behave incorrectly: + // * They cannot be found after commit, because the transaction's + // UpdateTimestamp does not know the correct timestamp size when the write + // bypasses the Transaction's write APIs. + // * They can be found again after a DB restart recovers the write from the + // WAL, because the recovered transaction's UpdateTimestamp gets the correct + // timestamp size info directly from VersionSet. + // If a flush persisted this transaction into sst files after it was + // committed, the key is corrupted forever. + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->Put(handles_[0], "key2", "key2_val")); + // Write a key longer than 8 bytes so that the error manifests as a NotFound + // error instead of a failure during `WriteBatch::UpdateTimestamp`. + ASSERT_OK(txn1->GetWriteBatch()->GetWriteBatch()->Put( + handles_[1], "foobarbaz", "baz_val")); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->SetCommitTimestamp(2)); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + ASSERT_OK(db->Flush(FlushOptions(), handles_[1])); + + std::unique_ptr<Transaction> txn2( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn2); + ASSERT_OK(txn2->Put(handles_[0], "key3", "key3_val")); + ASSERT_OK(txn2->GetWriteBatch()->GetWriteBatch()->Put( + handles_[1], "bazbazbaz", "bazbazbaz_val")); + ASSERT_OK(txn2->SetCommitTimestamp(2)); + ASSERT_OK(txn2->SetName("txn2")); + ASSERT_OK(txn2->Prepare()); + ASSERT_OK(txn2->Commit()); + txn2.reset(); + + std::unique_ptr<Transaction> txn3( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn3); + std::string value; + ReadOptions ropts; + std::string read_ts; + Slice timestamp = EncodeU64Ts(2, &read_ts); + ropts.timestamp = &timestamp; + ASSERT_OK(txn3->Get(ropts, handles_[0], "key1", &value)); + ASSERT_EQ("key1_val", value); + ASSERT_OK(txn3->Get(ropts, handles_[0], "key2", &value)); + ASSERT_EQ("key2_val", value); + ASSERT_OK(txn3->Get(ropts, handles_[0], "key3", &value)); + ASSERT_EQ("key3_val", value); + txn3.reset(); + + std::unique_ptr<Transaction> txn4( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn4); + ASSERT_OK(txn4->Get(ReadOptions(), handles_[1], "foo", &value)); + ASSERT_EQ("foo_val", value); + // Incorrect behavior: committed keys cannot be found + ASSERT_TRUE( + txn4->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound()); + ASSERT_TRUE( + txn4->Get(ReadOptions(), handles_[1], "bazbazbaz", &value).IsNotFound()); + txn4.reset(); + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + std::unique_ptr<Transaction> txn5( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn5); + ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "foo", &value)); + ASSERT_EQ("foo_val", value); + // Incorrect behavior: + // * The unflushed key can be found after reopen replays the entries from WAL + //
(this is not suggesting flushing as a workaround, just showing a + // possibly misleading behavior). + // * The flushed key is corrupted forever. + ASSERT_TRUE( + txn5->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound()); + ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "bazbazbaz", &value)); + ASSERT_EQ("bazbazbaz_val", value); + txn5.reset(); +} + TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { options.merge_operator = MergeOperators::CreateUInt64AddOperator(); ASSERT_OK(ReOpenNoDelete()); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index e1fcd3f639..11caa17701 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -395,7 +395,9 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) { // unprep_seqs_ will also contain prepared seqnos since they are treated in // the same way in the prepare/commit callbacks. See the comment on the // definition of unprep_seqs_. - unprep_seqs_[prepare_seq] = prepare_batch_cnt_; + if (s.ok()) { + unprep_seqs_[prepare_seq] = prepare_batch_cnt_; + } // Reset transaction state. if (!prepared) {
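For the transaction-timestamp hunks above, a minimal usage sketch of the new `TransactionOptions::write_batch_track_timestamp_size` option, assuming a `TransactionDB` opened on a column family whose comparator carries a 64-bit timestamp; the function name, transaction name, and timestamp value below are illustrative, not part of the diff:

// Sketch: a write that bypasses the Transaction write APIs (the MyRocks
// pattern exercised by WritesBypassTransactionAPIs above). Without
// txn_opts.write_batch_track_timestamp_size = true, UpdateTimestamps() at
// commit cannot learn the timestamp size for ts_cf, and the committed key
// may become unreadable.
#include <memory>

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

Status WriteBypassingTxnApis(TransactionDB* txn_db, ColumnFamilyHandle* ts_cf) {
  TransactionOptions txn_opts;
  txn_opts.write_batch_track_timestamp_size = true;  // new option
  std::unique_ptr<Transaction> txn(
      txn_db->BeginTransaction(WriteOptions(), txn_opts));
  // Direct write into the underlying WriteBatch, bypassing txn->Put().
  Status s =
      txn->GetWriteBatch()->GetWriteBatch()->Put(ts_cf, "foo", "foo_val");
  if (s.ok()) s = txn->SetName("bypass_txn");  // illustrative name
  if (s.ok()) s = txn->SetCommitTimestamp(2);  // illustrative timestamp
  if (s.ok()) s = txn->Prepare();
  if (s.ok()) s = txn->Commit();
  return s;
}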
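Likewise, a short sketch of reading the new "rocksdb.key.largest.seqno" table property introduced in this diff; PrintLargestSeqnos is a hypothetical helper and `db` is any open DB:

// Sketch: list the new key_largest_seqno property for each live SST file.
#include <cinttypes>
#include <cstdio>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

void PrintLargestSeqnos(DB* db) {
  TablePropertiesCollection props;
  Status s = db->GetPropertiesOfAllTables(&props);
  if (!s.ok()) {
    return;
  }
  for (const auto& file_and_props : props) {
    // Per the release note, this property is verified to be zero during SST
    // file ingestion.
    std::printf("%s: largest seqno %" PRIu64 "\n",
                file_and_props.first.c_str(),
                file_and_props.second->key_largest_seqno);
  }
}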