Don't wait for indirect flush in read-only DB (#10569)

Summary:
Some APIs for getting live files, which are used by Checkpoint
and BackupEngine, can optionally trigger and wait for a flush. These
would deadlock when used on a read-only DB. Here we fix that by assuming
the user wants the overall operation to succeed and is OK without
flushing (because the DB is read-only).

Follow-up work: the same or other issues can be hit by directly invoking
some DB functions that are clearly not appropriate for read-only
instance, but are not covered by overrides in DBImplReadOnly and
CompactedDBImpl. These should be fixed to avoid similar problems on
accidental misuse. (Long term, it would be nice to have a DBReadOnly
class without those members, like BackupEngineReadOnly.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10569

Test Plan: tests updated to catch regression (hang before the fix)

Reviewed By: riversand963

Differential Revision: D38995759

Pulled By: pdillinger

fbshipit-source-id: f5f8bc7123e13cb45bd393dd974d7d6eda20bc68
This commit is contained in:
Peter Dillinger 2022-08-29 13:36:23 -07:00 committed by Facebook GitHub Bot
parent 5532b462c4
commit c5afbbfe4b
9 changed files with 66 additions and 7 deletions

View File

@ -1,6 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB.
* Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced.
* Fix periodic_task unable to re-register the same task type, which may cause `SetOptions()` fail to update periodical_task time like: `stats_dump_period_sec`, `stats_persist_period_sec`.

View File

@ -253,6 +253,7 @@ TEST_F(DBBasicTest, CompactedDB) {
ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
ASSERT_OK(Flush());
ASSERT_OK(Put("something_not_flushed", "x"));
Close();
ASSERT_OK(ReadOnlyReopen(options));
@ -260,6 +261,20 @@ TEST_F(DBBasicTest, CompactedDB) {
s = Put("new", "value");
ASSERT_EQ(s.ToString(),
"Not implemented: Not supported operation in read only mode.");
// TODO: validate that other write ops return NotImplemented
// (DBImplReadOnly is missing some overrides)
// Ensure no deadlock on flush triggered by another API function
// (Old deadlock bug depends on something_not_flushed above.)
std::vector<std::string> files;
uint64_t manifest_file_size;
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
LiveFilesStorageInfoOptions lfsi_opts;
lfsi_opts.wal_size_for_flush = 0; // always
std::vector<LiveFileStorageInfo> files2;
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
Close();
// Full compaction
@ -290,6 +305,13 @@ TEST_F(DBBasicTest, CompactedDB) {
ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
ASSERT_EQ("NOT_FOUND", Get("kkk"));
// TODO: validate that other write ops return NotImplemented
// (CompactedDB is missing some overrides)
// Ensure no deadlock on flush triggered by another API function
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
// MultiGet
std::vector<std::string> values;
std::vector<Status> status_list = dbfull()->MultiGet(

View File

@ -11,6 +11,7 @@
namespace ROCKSDB_NAMESPACE {
// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
class CompactedDBImpl : public DBImpl {
public:
CompactedDBImpl(const DBOptions& options, const std::string& dbname);
@ -127,6 +128,17 @@ class CompactedDBImpl : public DBImpl {
return Status::NotSupported("Not supported in compacted db mode.");
}
// FIXME: some missing overrides for more "write" functions
// Share with DBImplReadOnly?
protected:
#ifndef ROCKSDB_LITE
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
private:
friend class DB;
inline size_t FindFile(const Slice& key);

View File

@ -1387,7 +1387,7 @@ class DBImpl : public DB {
void NotifyOnExternalFileIngested(
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
Status FlushForGetLiveFiles();
virtual Status FlushForGetLiveFiles();
#endif // !ROCKSDB_LITE
void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;

View File

@ -13,6 +13,7 @@
namespace ROCKSDB_NAMESPACE {
// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
class DBImplReadOnly : public DBImpl {
public:
DBImplReadOnly(const DBOptions& options, const std::string& dbname);
@ -141,6 +142,16 @@ class DBImplReadOnly : public DBImpl {
return Status::NotSupported("Not supported operation in read only mode.");
}
// FIXME: some missing overrides for more "write" functions
protected:
#ifndef ROCKSDB_LITE
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
private:
// A "helper" function for DB::OpenForReadOnly without column families
// to reduce unnecessary I/O

View File

@ -71,6 +71,7 @@ class LogReaderContainer {
// The secondary instance can be opened using `DB::OpenAsSecondary`. After
// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
// effort attempts to catch up with the primary.
// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
class DBImplSecondary : public DBImpl {
public:
DBImplSecondary(const DBOptions& options, const std::string& dbname,
@ -268,6 +269,13 @@ class DBImplSecondary : public DBImpl {
#endif // NDEBUG
protected:
#ifndef ROCKSDB_LITE
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
// ColumnFamilyCollector is a write batch handler which does nothing
// except recording unique column family IDs
class ColumnFamilyCollector : public WriteBatch::Handler {

View File

@ -1562,9 +1562,10 @@ class DB {
// The valid size of the manifest file is returned in manifest_file_size.
// The manifest file is an ever growing file, but only the portion specified
// by manifest_file_size is valid for this snapshot. Setting flush_memtable
// to true does Flush before recording the live files. Setting flush_memtable
// to false is useful when we don't want to wait for flush which may have to
// wait for compaction to complete taking an indeterminate time.
// to true does Flush before recording the live files (unless DB is
// read-only). Setting flush_memtable to false is useful when we don't want
// to wait for flush which may have to wait for compaction to complete
// taking an indeterminate time.
//
// NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
// a lossless backup, GetLiveFilesStorageInfo() is strongly recommended

View File

@ -2094,7 +2094,8 @@ struct LiveFilesStorageInfoOptions {
// Whether to populate FileStorageInfo::file_checksum* or leave blank
bool include_checksum_info = false;
// Flushes memtables if total size in bytes of live WAL files is >= this
// number. Default: always force a flush without checking sizes.
// number (and DB is not read-only).
// Default: always force a flush without checking sizes.
uint64_t wal_size_for_flush = 0;
};
#endif // !ROCKSDB_LITE

View File

@ -2984,9 +2984,12 @@ TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
DestroyDBWithoutCheck(dbname_, options_);
OpenDBAndBackupEngine(true);
FillDB(db_.get(), 0, 100);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
// Also test read-only DB with CreateNewBackup and flush=true (no flush)
CloseAndReopenDB(/*read_only*/ true);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
CloseAndReopenDB(/*read_only*/ false);
FillDB(db_.get(), 100, 200);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
CloseDBAndBackupEngine();
DestroyDBWithoutCheck(dbname_, options_);