Fix two ErrorHandler race conditions (#11939)

Summary:
1. Prevent a double join on a `port::Thread`
2. Ensure `recovery_in_prog_` and `bg_error_` are both set under the same lock hold. This is useful for writers that see a non-OK `bg_error_` and must decide whether to stall based on whether the error will be auto-recovered.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11939

Reviewed By: cbi42

Differential Revision: D50155484

Pulled By: ajkr

fbshipit-source-id: fbc1f85c50e7eaee27ee0e376aee688d8a06c93b
This commit is contained in:
Andrew Kryczka 2023-10-11 09:42:48 -07:00 committed by Facebook GitHub Bot
parent 5b11f5a3a2
commit 4bd5aa4f55

View file

@ -638,16 +638,22 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
// Needs to be set in the same lock hold as setting BG error, otherwise
// intervening writes could see a BG error without a recovery and bail out.
recovery_in_prog_ = true;
if (recovery_thread_) {
// Ensure only one thread can execute the join().
std::unique_ptr<port::Thread> old_recovery_thread(
std::move(recovery_thread_));
// In this case, if recovery_in_prog_ is false, current thread should
// wait the previous recover thread to finish and create a new thread
// to recover from the bg error.
db_mutex_->Unlock();
recovery_thread_->join();
old_recovery_thread->join();
db_mutex_->Lock();
}
recovery_in_prog_ = true;
TEST_SYNC_POINT("StartRecoverFromRetryableBGIOError::in_progress");
recovery_thread_.reset(
new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
@ -790,12 +796,15 @@ void ErrorHandler::EndAutoRecovery() {
if (!end_recovery_) {
end_recovery_ = true;
}
cv_.SignalAll();
db_mutex_->Unlock();
if (recovery_thread_) {
recovery_thread_->join();
}
// Ensure only one thread can execute the join().
std::unique_ptr<port::Thread> old_recovery_thread(
std::move(recovery_thread_));
db_mutex_->Unlock();
cv_.SignalAll();
old_recovery_thread->join();
db_mutex_->Lock();
}
return;
}