diff --git a/HISTORY.md b/HISTORY.md index abe16a7992..571bc565a0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### Behavior changes +* Make best-efforts recovery verify SST unique ID before Version construction (#10962) ## 7.9.0 (11/21/2022) ### Performance Improvements diff --git a/db/db_test2.cc b/db/db_test2.cc index 97d671c144..779b8bf137 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -7509,6 +7509,80 @@ TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) { ASSERT_TRUE(s.IsCorruption()); } +TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) { + const auto tamper_with_uniq_id = [&](void* arg) { + auto props = static_cast(arg); + assert(props); + // update table property session_id to a different one + props->db_session_id = DBImpl::GenerateDbSessionId(nullptr); + }; + + const auto assert_db = [&](size_t expected_count, + const std::string& expected_v) { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + size_t cnt = 0; + for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) { + ASSERT_EQ(std::to_string(cnt), it->key()); + ASSERT_EQ(expected_v, it->value()); + } + ASSERT_EQ(expected_count, cnt); + }; + + const int num_l0_compaction_trigger = 8; + const int num_l0 = num_l0_compaction_trigger - 1; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = num_l0_compaction_trigger; + + for (int k = 0; k < num_l0; ++k) { + // Allow mismatch for now + options.verify_sst_unique_id_in_manifest = false; + + DestroyAndReopen(options); + + constexpr size_t num_keys_per_file = 10; + for (int i = 0; i < num_l0; ++i) { + for (size_t j = 0; j < num_keys_per_file; ++j) { + ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i))); + } + if (i == k) { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", + tamper_with_uniq_id); + SyncPoint::GetInstance()->EnableProcessing(); + } + ASSERT_OK(Flush()); + } + + options.verify_sst_unique_id_in_manifest = true; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + + options.best_efforts_recovery = true; + Reopen(options); + assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); + + // Reopen with regular recovery + options.best_efforts_recovery = false; + Reopen(options); + assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (size_t i = 0; i < num_keys_per_file; ++i) { + ASSERT_OK(Put(std::to_string(i), "v")); + } + ASSERT_OK(Flush()); + Reopen(options); + { + for (size_t i = 0; i < num_keys_per_file; ++i) { + ASSERT_EQ("v", Get(std::to_string(i))); + } + } + } +} + #ifndef ROCKSDB_LITE TEST_F(DBTest2, GetLatestSeqAndTsForKey) { Destroy(last_options_); diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 08d04c7c02..145e78789f 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -734,12 +734,13 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( assert(!cfd->ioptions()->cf_paths.empty()); Status s; for (const auto& elem : edit.GetNewFiles()) { + int level = elem.first; const FileMetaData& meta = elem.second; const FileDescriptor& fd = meta.fd; uint64_t file_num = fd.GetNumber(); const std::string fpath = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); - s = VerifyFile(fpath, meta); + s = VerifyFile(cfd, fpath, level, meta); if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { missing_files.insert(file_num); s = Status::OK(); @@ -804,6 +805,18 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, version_set_->current_version_number_++); + s = builder->LoadTableHandlers( + cfd->internal_stats(), + version_set_->db_options_->max_file_opening_threads, false, true, + cfd->GetLatestMutableCFOptions()->prefix_extractor, + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + if (!s.ok()) { + delete version; + if (s.IsCorruption()) { + s = Status::OK(); + } + return s; + } s = builder->SaveTo(version->storage_info()); if (s.ok()) { version->PrepareAppend( @@ -823,9 +836,11 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( return s; } -Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, +Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, + int level, const FileMetaData& fmeta) { - return version_set_->VerifyFileMetadata(fpath, fmeta); + return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta); } Status VersionEditHandlerPointInTime::VerifyBlobFile( @@ -843,6 +858,12 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile( return s; } +Status VersionEditHandlerPointInTime::LoadTables( + ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/, + bool /*is_initial_load*/) { + return Status::OK(); +} + Status ManifestTailer::Initialize() { if (Mode::kRecovery == mode_) { return VersionEditHandler::Initialize(); @@ -930,9 +951,11 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader, } } -Status ManifestTailer::VerifyFile(const std::string& fpath, +Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, int level, const FileMetaData& fmeta) { - Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + Status s = + VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta); // TODO: Open file or create hard link to prevent the file from being // deleted. return s; diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index 665e0f0d43..fd2379b073 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -164,9 +164,9 @@ class VersionEditHandler : public VersionEditHandlerBase { ColumnFamilyData* cfd, bool force_create_version); - Status LoadTables(ColumnFamilyData* cfd, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load); + virtual Status LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load); virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } @@ -213,11 +213,15 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) override; - virtual Status VerifyFile(const std::string& fpath, - const FileMetaData& fmeta); + virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, + int level, const FileMetaData& fmeta); virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, const BlobFileAddition& blob_addition); + Status LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load) override; + std::unordered_map versions_; }; @@ -250,7 +254,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime { void CheckIterationResult(const log::Reader& reader, Status* s) override; - Status VerifyFile(const std::string& fpath, + Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& fmeta) override; enum Mode : uint8_t { diff --git a/db/version_set.cc b/db/version_set.cc index cac3a0a9e1..082e55217c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -6704,8 +6704,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { return all_versions_blob_file_size; } -Status VersionSet::VerifyFileMetadata(const std::string& fpath, - const FileMetaData& meta) const { +Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, + const std::string& fpath, int level, + const FileMetaData& meta) { uint64_t fsize = 0; Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); if (status.ok()) { @@ -6713,6 +6714,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath, status = Status::Corruption("File size mismatch: " + fpath); } } + if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) { + assert(cfd); + TableCache* table_cache = cfd->table_cache(); + assert(table_cache); + + const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions(); + assert(cf_opts); + std::shared_ptr pe = cf_opts->prefix_extractor; + size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts); + + const FileOptions& file_opts = file_options(); + + Version* version = cfd->current(); + assert(version); + VersionStorageInfo& storage_info = version->storage_info_; + const InternalKeyComparator* icmp = storage_info.InternalComparator(); + assert(icmp); + + InternalStats* internal_stats = cfd->internal_stats(); + + FileMetaData meta_copy = meta; + status = table_cache->FindTable( + ReadOptions(), file_opts, *icmp, meta_copy, + &(meta_copy.table_reader_handle), pe, + /*no_io=*/false, /*record_read_stats=*/true, + internal_stats->GetFileReadHist(level), false, level, + /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, + meta_copy.temperature); + if (meta_copy.table_reader_handle) { + table_cache->ReleaseHandle(meta_copy.table_reader_handle); + } + } return status; } diff --git a/db/version_set.h b/db/version_set.h index cf6f9af369..03176a8b5d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1501,8 +1501,8 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, const VersionEdit* edit); - Status VerifyFileMetadata(const std::string& fpath, - const FileMetaData& meta) const; + Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, + int level, const FileMetaData& meta); // Protected by DB mutex. WalSet wals_; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 3ba13d1299..cd6644c006 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1285,30 +1285,37 @@ struct DBOptions { // Default: nullptr std::shared_ptr file_checksum_gen_factory = nullptr; - // By default, RocksDB recovery fails if any table/blob file referenced in + // By default, RocksDB recovery fails if any table/blob file referenced in the + // final version reconstructed from the // MANIFEST are missing after scanning the MANIFEST pointed to by the - // CURRENT file. - // Best-efforts recovery is another recovery mode that tolerates missing or - // corrupted table or blob files. + // CURRENT file. It can also fail if verification of unique SST id fails. + // Best-efforts recovery is another recovery mode that does not necessarily + // fail when certain table/blob files are missing/corrupted or have mismatched + // unique id table property. Instead, best-efforts recovery recovers each + // column family to a point in the MANIFEST that corresponds to a version. In + // such a version, all valid table/blob files referenced have the expected + // file size. For table files, their unique id table property match the + // MANIFEST. + // // Best-efforts recovery does not need a valid CURRENT file, and tries to // recover the database using one of the available MANIFEST files in the db // directory. - // Best-efforts recovery recovers database to a state in which the database - // includes only table and blob files whose actual sizes match the - // information in the chosen MANIFEST without holes in the history. // Best-efforts recovery tries the available MANIFEST files from high file // numbers (newer) to low file numbers (older), and stops after finding the // first MANIFEST file from which the db can be recovered to a state without - // invalid (missing/file-mismatch) table and blob files. - // It is possible that the database can be restored to an empty state with no - // table or blob files. - // Regardless of this option, the IDENTITY file is updated if needed during - // recovery to match the DB ID in the MANIFEST (if previously using - // write_dbid_to_manifest) or to be in some valid state (non-empty DB ID). - // Currently, not compatible with atomic flush. Furthermore, WAL files will - // not be used for recovery if best_efforts_recovery is true. - // Also requires either 1) LOCK file exists or 2) underlying env's LockFile() - // call returns ok even for non-existing LOCK file. + // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob + // files. It is possible that the database can be restored to an empty state + // with no table or blob files. + // + // Regardless of this option, the IDENTITY file + // is updated if needed during recovery to match the DB ID in the MANIFEST (if + // previously using write_dbid_to_manifest) or to be in some valid state + // (non-empty DB ID). Currently, not compatible with atomic flush. + // Furthermore, WAL files will not be used for recovery if + // best_efforts_recovery is true. Also requires either 1) LOCK file exists or + // 2) underlying env's LockFile() call returns ok even for non-existing LOCK + // file. + // // Default: false bool best_efforts_recovery = false;