Add a listener callback for end of auto error recovery (#9244)

Summary:
Previously, the OnErrorRecoveryCompleted callback was called when
RocksDB was able to successfully recover from a retryable error.
However, if the recovery failed and was eventually stopped, there was no
indication of the status. To fix that, a new OnErrorRecoveryEnd callback
is introduced that deprecates the OnErrorRecoveryCompleted callback. The
new callback is called with the original error and the new error status.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9244

Test Plan: Add a new unit test in error_handler_fs_test

Reviewed By: zhichao-cao

Differential Revision: D32922303

Pulled By: anand1976

fbshipit-source-id: f04e77a9cb92c5ea6385590682d3fcf559971b99
This commit is contained in:
anand76 2021-12-08 14:22:41 -08:00 committed by Facebook GitHub Bot
parent 9e4d56f2c9
commit ecf2bec613
6 changed files with 103 additions and 9 deletions

View file

@ -9,6 +9,7 @@
* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371. * MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
### Public API change ### Public API change
* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes. * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
* Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
### Performance Improvements ### Performance Improvements
* Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory. * Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.

View file

@ -540,6 +540,8 @@ Status ErrorHandler::ClearBGError() {
// Signal that recovery succeeded // Signal that recovery succeeded
if (recovery_error_.ok()) { if (recovery_error_.ok()) {
Status old_bg_error = bg_error_; Status old_bg_error = bg_error_;
// old_bg_error is only for notifying listeners, so may not be checked
old_bg_error.PermitUncheckedError();
// Clear and check the recovery IO and BG error // Clear and check the recovery IO and BG error
bg_error_ = Status::OK(); bg_error_ = Status::OK();
recovery_io_error_ = IOStatus::OK(); recovery_io_error_ = IOStatus::OK();
@ -547,8 +549,8 @@ Status ErrorHandler::ClearBGError() {
recovery_io_error_.PermitUncheckedError(); recovery_io_error_.PermitUncheckedError();
recovery_in_prog_ = false; recovery_in_prog_ = false;
soft_error_no_bg_work_ = false; soft_error_no_bg_work_ = false;
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
old_bg_error, db_mutex_); bg_error_, db_mutex_);
} }
return recovery_error_; return recovery_error_;
#else #else
@ -665,6 +667,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
InstrumentedMutexLock l(db_mutex_); InstrumentedMutexLock l(db_mutex_);
if (end_recovery_) { if (end_recovery_) {
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
Status::ShutdownInProgress(),
db_mutex_);
return; return;
} }
DBRecoverContext context = recover_context_; DBRecoverContext context = recover_context_;
@ -674,6 +679,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
// Recover from the retryable error. Create a separate thread to do it. // Recover from the retryable error. Create a separate thread to do it.
while (resume_count > 0) { while (resume_count > 0) {
if (end_recovery_) { if (end_recovery_) {
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
Status::ShutdownInProgress(),
db_mutex_);
return; return;
} }
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
@ -695,6 +703,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
RecordInHistogram(bg_error_stats_.get(), RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
} }
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
bg_error_, db_mutex_);
return; return;
} }
if (!recovery_io_error_.ok() && if (!recovery_io_error_.ok() &&
@ -718,8 +728,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
Status old_bg_error = bg_error_; Status old_bg_error = bg_error_;
bg_error_ = Status::OK(); bg_error_ = Status::OK();
bg_error_.PermitUncheckedError(); bg_error_.PermitUncheckedError();
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, EventHelpers::NotifyOnErrorRecoveryEnd(
old_bg_error, db_mutex_); db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
if (bg_error_stats_ != nullptr) { if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
@ -739,12 +749,21 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
RecordInHistogram(bg_error_stats_.get(), RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
} }
EventHelpers::NotifyOnErrorRecoveryEnd(
db_options_.listeners, bg_error_,
!recovery_io_error_.ok()
? recovery_io_error_
: (!recovery_error_.ok() ? recovery_error_ : s),
db_mutex_);
return; return;
} }
} }
resume_count--; resume_count--;
} }
recovery_in_prog_ = false; recovery_in_prog_ = false;
EventHelpers::NotifyOnErrorRecoveryEnd(
db_options_.listeners, bg_error_,
Status::Aborted("Exceeded resume retry count"), db_mutex_);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
if (bg_error_stats_ != nullptr) { if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(), RecordInHistogram(bg_error_stats_.get(),

View file

@ -66,6 +66,7 @@ class ErrorHandlerFSListener : public EventListener {
~ErrorHandlerFSListener() { ~ErrorHandlerFSListener() {
file_creation_error_.PermitUncheckedError(); file_creation_error_.PermitUncheckedError();
bg_error_.PermitUncheckedError(); bg_error_.PermitUncheckedError();
new_bg_error_.PermitUncheckedError();
} }
void OnTableFileCreationStarted( void OnTableFileCreationStarted(
@ -96,6 +97,13 @@ class ErrorHandlerFSListener : public EventListener {
old_bg_error.PermitUncheckedError(); old_bg_error.PermitUncheckedError();
} }
void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
InstrumentedMutexLock l(&mutex_);
recovery_complete_ = true;
cv_.SignalAll();
new_bg_error_ = info.new_bg_error;
}
bool WaitForRecovery(uint64_t /*abs_time_us*/) { bool WaitForRecovery(uint64_t /*abs_time_us*/) {
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
while (!recovery_complete_) { while (!recovery_complete_) {
@ -138,6 +146,8 @@ class ErrorHandlerFSListener : public EventListener {
file_creation_error_ = io_s; file_creation_error_ = io_s;
} }
Status new_bg_error() { return new_bg_error_; }
private: private:
InstrumentedMutex mutex_; InstrumentedMutex mutex_;
InstrumentedCondVar cv_; InstrumentedCondVar cv_;
@ -148,6 +158,7 @@ class ErrorHandlerFSListener : public EventListener {
int file_count_; int file_count_;
IOStatus file_creation_error_; IOStatus file_creation_error_;
Status bg_error_; Status bg_error_;
Status new_bg_error_;
FaultInjectionTestFS* fault_fs_; FaultInjectionTestFS* fault_fs_;
}; };
@ -2428,6 +2439,42 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
Close(); Close();
} }
// Fail auto resume from a flush retryable error and verify that
// OnErrorRecoveryEnd listener callback is called
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
// Activate the FS before the first resume
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
options.env = fault_env_.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000; // 0.1 second
Status s;
listener->EnableAutoRecovery(false);
DestroyAndReopen(options);
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
ASSERT_OK(Put(Key(1), "val1"));
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
ASSERT_EQ(listener->WaitForRecovery(5000000), true);
ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
SyncPoint::GetInstance()->DisableProcessing();
fault_fs_->SetFilesystemActive(true);
Destroy(options);
}
class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
public testing::WithParamInterface<bool> {}; public testing::WithParamInterface<bool> {};

View file

@ -232,23 +232,30 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE
} }
void EventHelpers::NotifyOnErrorRecoveryCompleted( void EventHelpers::NotifyOnErrorRecoveryEnd(
const std::vector<std::shared_ptr<EventListener>>& listeners, const std::vector<std::shared_ptr<EventListener>>& listeners,
Status old_bg_error, InstrumentedMutex* db_mutex) { const Status& old_bg_error, const Status& new_bg_error,
InstrumentedMutex* db_mutex) {
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
if (!listeners.empty()) { if (!listeners.empty()) {
db_mutex->AssertHeld(); db_mutex->AssertHeld();
// release lock while notifying events // release lock while notifying events
db_mutex->Unlock(); db_mutex->Unlock();
for (auto& listener : listeners) { for (auto& listener : listeners) {
BackgroundErrorRecoveryInfo info;
info.old_bg_error = old_bg_error;
info.new_bg_error = new_bg_error;
listener->OnErrorRecoveryCompleted(old_bg_error); listener->OnErrorRecoveryCompleted(old_bg_error);
listener->OnErrorRecoveryEnd(info);
info.old_bg_error.PermitUncheckedError();
info.new_bg_error.PermitUncheckedError();
} }
db_mutex->Lock(); db_mutex->Lock();
} }
old_bg_error.PermitUncheckedError();
#else #else
(void)listeners; (void)listeners;
(void)old_bg_error; (void)old_bg_error;
(void)new_bg_error;
(void)db_mutex; (void)db_mutex;
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
} }

View file

@ -43,9 +43,10 @@ class EventHelpers {
uint64_t file_number, const std::string& file_path, uint64_t file_number, const std::string& file_path,
const Status& status, const std::string& db_name, const Status& status, const std::string& db_name,
const std::vector<std::shared_ptr<EventListener>>& listeners); const std::vector<std::shared_ptr<EventListener>>& listeners);
static void NotifyOnErrorRecoveryCompleted( static void NotifyOnErrorRecoveryEnd(
const std::vector<std::shared_ptr<EventListener>>& listeners, const std::vector<std::shared_ptr<EventListener>>& listeners,
Status bg_error, InstrumentedMutex* db_mutex); const Status& old_bg_error, const Status& new_bg_error,
InstrumentedMutex* db_mutex);
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
static void NotifyBlobFileCreationStarted( static void NotifyBlobFileCreationStarted(

View file

@ -453,6 +453,16 @@ struct ExternalFileIngestionInfo {
TableProperties table_properties; TableProperties table_properties;
}; };
// Result of auto background error recovery
struct BackgroundErrorRecoveryInfo {
// The original error that triggered the recovery
Status old_bg_error;
// The final bg_error after all recovery attempts. Status::OK() means
// the recovery was successful and the database is fully operational.
Status new_bg_error;
};
struct IOErrorInfo { struct IOErrorInfo {
IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation, IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
const std::string& _file_path, size_t _length, uint64_t _offset) const std::string& _file_path, size_t _length, uint64_t _offset)
@ -691,12 +701,21 @@ class EventListener : public Customizable {
Status /* bg_error */, Status /* bg_error */,
bool* /* auto_recovery */) {} bool* /* auto_recovery */) {}
// DEPRECATED
// A callback function for RocksDB which will be called once the database // A callback function for RocksDB which will be called once the database
// is recovered from read-only mode after an error. When this is called, it // is recovered from read-only mode after an error. When this is called, it
// means normal writes to the database can be issued and the user can // means normal writes to the database can be issued and the user can
// initiate any further recovery actions needed // initiate any further recovery actions needed
virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
// A callback function for RocksDB which will be called once the recovery
// attempt from a background retryable error is completed. The recovery
// may have been successful or not. In either case, the callback is called
// with the old and new error. If info.new_bg_error is Status::OK(), that
// means the recovery succeeded.
virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
}
// A callback function for RocksDB which will be called before // A callback function for RocksDB which will be called before
// a blob file is being created. It will follow by OnBlobFileCreated after // a blob file is being created. It will follow by OnBlobFileCreated after
// the creation finishes. // the creation finishes.