From 4bd5aa4f5541d22ae52ff1ee2f6c9b782cf5ccdc Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 11 Oct 2023 09:42:48 -0700 Subject: [PATCH] Fix two `ErrorHandler` race conditions (#11939) Summary: 1. Prevent a double join on a `port::Thread` 2. Ensure `recovery_in_prog_` and `bg_error_` are both set under same lock hold. This is useful for writers who see a non-OK `bg_error_` and are deciding whether to stall based on whether the error will be auto-recovered. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11939 Reviewed By: cbi42 Differential Revision: D50155484 Pulled By: ajkr fbshipit-source-id: fbc1f85c50e7eaee27ee0e376aee688d8a06c93b --- db/error_handler.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/db/error_handler.cc b/db/error_handler.cc index 018183ba64..c7dd4750f7 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -638,16 +638,22 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + // Needs to be set in the same lock hold as setting BG error, otherwise + // intervening writes could see a BG error without a recovery and bail out. + recovery_in_prog_ = true; + if (recovery_thread_) { + // Ensure only one thread can execute the join(). + std::unique_ptr old_recovery_thread( + std::move(recovery_thread_)); // In this case, if recovery_in_prog_ is false, current thread should // wait the previous recover thread to finish and create a new thread // to recover from the bg error. db_mutex_->Unlock(); - recovery_thread_->join(); + old_recovery_thread->join(); db_mutex_->Lock(); } - recovery_in_prog_ = true; TEST_SYNC_POINT("StartRecoverFromRetryableBGIOError::in_progress"); recovery_thread_.reset( new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); @@ -790,12 +796,15 @@ void ErrorHandler::EndAutoRecovery() { if (!end_recovery_) { end_recovery_ = true; } - cv_.SignalAll(); - db_mutex_->Unlock(); if (recovery_thread_) { - recovery_thread_->join(); + // Ensure only one thread can execute the join(). + std::unique_ptr old_recovery_thread( + std::move(recovery_thread_)); + db_mutex_->Unlock(); + cv_.SignalAll(); + old_recovery_thread->join(); + db_mutex_->Lock(); } - db_mutex_->Lock(); return; }