Make best-efforts recovery verify SST unique ID before Version construction (#10962)

Summary:
The check for SST unique IDs added to best-efforts recovery (`Options::best_efforts_recovery` is true).

With best_efforts_recovery being true, RocksDB will recover to the latest point in
MANIFEST such that all valid SST files included up to this point pass unique ID checks as well.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10962

Test Plan: make check

Reviewed By: pdillinger

Differential Revision: D41378241

Pulled By: riversand963

fbshipit-source-id: a036064e2c17dec13d080a24ef2a9f85d607b16c
This commit is contained in:
Yanqin Jin 2022-11-22 22:53:31 -08:00 committed by Facebook GitHub Bot
parent d8e792e4cf
commit 3d0d6b8140
7 changed files with 175 additions and 32 deletions

View file

@ -1,5 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Behavior changes
* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
## 7.9.0 (11/21/2022)
### Performance Improvements

View file

@ -7509,6 +7509,80 @@ TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
ASSERT_TRUE(s.IsCorruption());
}
TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
const auto tamper_with_uniq_id = [&](void* arg) {
auto props = static_cast<TableProperties*>(arg);
assert(props);
// update table property session_id to a different one
props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
};
const auto assert_db = [&](size_t expected_count,
const std::string& expected_v) {
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
size_t cnt = 0;
for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
ASSERT_EQ(std::to_string(cnt), it->key());
ASSERT_EQ(expected_v, it->value());
}
ASSERT_EQ(expected_count, cnt);
};
const int num_l0_compaction_trigger = 8;
const int num_l0 = num_l0_compaction_trigger - 1;
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
for (int k = 0; k < num_l0; ++k) {
// Allow mismatch for now
options.verify_sst_unique_id_in_manifest = false;
DestroyAndReopen(options);
constexpr size_t num_keys_per_file = 10;
for (int i = 0; i < num_l0; ++i) {
for (size_t j = 0; j < num_keys_per_file; ++j) {
ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
}
if (i == k) {
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->SetCallBack(
"PropertyBlockBuilder::AddTableProperty:Start",
tamper_with_uniq_id);
SyncPoint::GetInstance()->EnableProcessing();
}
ASSERT_OK(Flush());
}
options.verify_sst_unique_id_in_manifest = true;
Status s = TryReopen(options);
ASSERT_TRUE(s.IsCorruption());
options.best_efforts_recovery = true;
Reopen(options);
assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
// Reopen with regular recovery
options.best_efforts_recovery = false;
Reopen(options);
assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
for (size_t i = 0; i < num_keys_per_file; ++i) {
ASSERT_OK(Put(std::to_string(i), "v"));
}
ASSERT_OK(Flush());
Reopen(options);
{
for (size_t i = 0; i < num_keys_per_file; ++i) {
ASSERT_EQ("v", Get(std::to_string(i)));
}
}
}
}
#ifndef ROCKSDB_LITE
TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
Destroy(last_options_);

View file

@ -734,12 +734,13 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
assert(!cfd->ioptions()->cf_paths.empty());
Status s;
for (const auto& elem : edit.GetNewFiles()) {
int level = elem.first;
const FileMetaData& meta = elem.second;
const FileDescriptor& fd = meta.fd;
uint64_t file_num = fd.GetNumber();
const std::string fpath =
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
s = VerifyFile(fpath, meta);
s = VerifyFile(cfd, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_files.insert(file_num);
s = Status::OK();
@ -804,6 +805,18 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
version_set_->current_version_number_++);
s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
cfd->GetLatestMutableCFOptions()->prefix_extractor,
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
if (!s.ok()) {
delete version;
if (s.IsCorruption()) {
s = Status::OK();
}
return s;
}
s = builder->SaveTo(version->storage_info());
if (s.ok()) {
version->PrepareAppend(
@ -823,9 +836,11 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
return s;
}
Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath,
Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath,
int level,
const FileMetaData& fmeta) {
return version_set_->VerifyFileMetadata(fpath, fmeta);
return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
}
Status VersionEditHandlerPointInTime::VerifyBlobFile(
@ -843,6 +858,12 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile(
return s;
}
Status VersionEditHandlerPointInTime::LoadTables(
ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
bool /*is_initial_load*/) {
return Status::OK();
}
Status ManifestTailer::Initialize() {
if (Mode::kRecovery == mode_) {
return VersionEditHandler::Initialize();
@ -930,9 +951,11 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
}
}
Status ManifestTailer::VerifyFile(const std::string& fpath,
Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath, int level,
const FileMetaData& fmeta) {
Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta);
Status s =
VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
// TODO: Open file or create hard link to prevent the file from being
// deleted.
return s;

View file

@ -164,7 +164,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
ColumnFamilyData* cfd,
bool force_create_version);
Status LoadTables(ColumnFamilyData* cfd,
virtual Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
bool is_initial_load);
@ -213,11 +213,15 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
bool force_create_version) override;
virtual Status VerifyFile(const std::string& fpath,
const FileMetaData& fmeta);
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition);
Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
bool is_initial_load) override;
std::unordered_map<uint32_t, Version*> versions_;
};
@ -250,7 +254,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
void CheckIterationResult(const log::Reader& reader, Status* s) override;
Status VerifyFile(const std::string& fpath,
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
enum Mode : uint8_t {

View file

@ -6704,8 +6704,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
return all_versions_blob_file_size;
}
Status VersionSet::VerifyFileMetadata(const std::string& fpath,
const FileMetaData& meta) const {
Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
const std::string& fpath, int level,
const FileMetaData& meta) {
uint64_t fsize = 0;
Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
if (status.ok()) {
@ -6713,6 +6714,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath,
status = Status::Corruption("File size mismatch: " + fpath);
}
}
if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
assert(cfd);
TableCache* table_cache = cfd->table_cache();
assert(table_cache);
const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
assert(cf_opts);
std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
const FileOptions& file_opts = file_options();
Version* version = cfd->current();
assert(version);
VersionStorageInfo& storage_info = version->storage_info_;
const InternalKeyComparator* icmp = storage_info.InternalComparator();
assert(icmp);
InternalStats* internal_stats = cfd->internal_stats();
FileMetaData meta_copy = meta;
status = table_cache->FindTable(
ReadOptions(), file_opts, *icmp, meta_copy,
&(meta_copy.table_reader_handle), pe,
/*no_io=*/false, /*record_read_stats=*/true,
internal_stats->GetFileReadHist(level), false, level,
/*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
meta_copy.temperature);
if (meta_copy.table_reader_handle) {
table_cache->ReleaseHandle(meta_copy.table_reader_handle);
}
}
return status;
}

View file

@ -1501,8 +1501,8 @@ class VersionSet {
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
const VersionEdit* edit);
Status VerifyFileMetadata(const std::string& fpath,
const FileMetaData& meta) const;
Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& meta);
// Protected by DB mutex.
WalSet wals_;

View file

@ -1285,30 +1285,37 @@ struct DBOptions {
// Default: nullptr
std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
// By default, RocksDB recovery fails if any table/blob file referenced in
// By default, RocksDB recovery fails if any table/blob file referenced in the
// final version reconstructed from the
// MANIFEST are missing after scanning the MANIFEST pointed to by the
// CURRENT file.
// Best-efforts recovery is another recovery mode that tolerates missing or
// corrupted table or blob files.
// CURRENT file. It can also fail if verification of unique SST id fails.
// Best-efforts recovery is another recovery mode that does not necessarily
// fail when certain table/blob files are missing/corrupted or have mismatched
// unique id table property. Instead, best-efforts recovery recovers each
// column family to a point in the MANIFEST that corresponds to a version. In
// such a version, all valid table/blob files referenced have the expected
// file size. For table files, their unique id table property match the
// MANIFEST.
//
// Best-efforts recovery does not need a valid CURRENT file, and tries to
// recover the database using one of the available MANIFEST files in the db
// directory.
// Best-efforts recovery recovers database to a state in which the database
// includes only table and blob files whose actual sizes match the
// information in the chosen MANIFEST without holes in the history.
// Best-efforts recovery tries the available MANIFEST files from high file
// numbers (newer) to low file numbers (older), and stops after finding the
// first MANIFEST file from which the db can be recovered to a state without
// invalid (missing/file-mismatch) table and blob files.
// It is possible that the database can be restored to an empty state with no
// table or blob files.
// Regardless of this option, the IDENTITY file is updated if needed during
// recovery to match the DB ID in the MANIFEST (if previously using
// write_dbid_to_manifest) or to be in some valid state (non-empty DB ID).
// Currently, not compatible with atomic flush. Furthermore, WAL files will
// not be used for recovery if best_efforts_recovery is true.
// Also requires either 1) LOCK file exists or 2) underlying env's LockFile()
// call returns ok even for non-existing LOCK file.
// invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
// files. It is possible that the database can be restored to an empty state
// with no table or blob files.
//
// Regardless of this option, the IDENTITY file
// is updated if needed during recovery to match the DB ID in the MANIFEST (if
// previously using write_dbid_to_manifest) or to be in some valid state
// (non-empty DB ID). Currently, not compatible with atomic flush.
// Furthermore, WAL files will not be used for recovery if
// best_efforts_recovery is true. Also requires either 1) LOCK file exists or
// 2) underlying env's LockFile() call returns ok even for non-existing LOCK
// file.
//
// Default: false
bool best_efforts_recovery = false;