rocksdb/db/error_handler.h
Changyu Bi 1c871a4d86 Only flush after recovery for retryable IOError (#11880)
Summary:
https://github.com/facebook/rocksdb/issues/11872 causes a unit test to start failing with the error message below. The cause is that the additional call to `FlushAllColumnFamilies()` in `DBImpl::ResumeImpl()` can run while DB is closing. More detailed explanation: there are two places where we call `ResumeImpl()`:

1. in `ErrorHandler::RecoverFromBGError`, for manual resume or recovery from errors like OutOfSpace through sst file manager, and
2. in `Errorhandler::RecoverFromRetryableBGIOError`, for error recovery from errors like flush failure due to retryable IOError. This is tracked by `ErrorHandler::recovery_thread_`.

Here is how DB close waits for error recovery: 49da91ec09/db/db_impl/db_impl.cc (L540-L543)

`CancelErrorRecovery()` waits until `recovery_thread_` finishes and `IsRecoveryInProgress()` checks the `recovery_in_prog_` flag. The additional call to `FlushAllColumnFamilies()` in `ResumeImpl()` happens after it clears bg error and the `recovery_in_prog_` flag: 49da91ec09/db/db_impl/db_impl.cc (L436-L463). So if `ResumeImpl()` is called in `RecoverFromBGError()`, we can have a thread running `FlushAllColumnFamilies()` while DB is closing and thought that recovery is done.

The fix is to only do the additional call to `FlushAllColumnFamilies()` when doing error recovery through `Errorhandler::RecoverFromRetryableBGIOError` by setting flags in `DBRecoverContext`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11880

Test Plan:
`gtest-parallel --repeat=100 --workers=4 ./error_handler_fs_test --gtest_filter="*AutoRecoverFlushError*"` reproduces the error pretty reliably.

```[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBErrorHandlingFSTest
[ RUN      ] DBErrorHandlingFSTest.AutoRecoverFlushError
error_handler_fs_test: db/column_family.cc:1618: rocksdb::ColumnFamilySet::~ColumnFamilySet(): Assertion `last_ref' failed.
Received signal 6 (Aborted)
...
https://github.com/facebook/rocksdb/issues/10 0x00007fac4409efd6 in __GI___assert_fail (assertion=0x7fac452c0afa "last_ref", file=0x7fac452c9fb5 "db/column_family.cc", line=1618, function=0x7fac452cb950 "rocksdb::ColumnFamilySet::~ColumnFamilySet()") at assert.c:101
101     in assert.c
https://github.com/facebook/rocksdb/issues/11 0x00007fac44b5324f in rocksdb::ColumnFamilySet::~ColumnFamilySet (this=0x7b5400000000) at db/column_family.cc:1618
1618        assert(last_ref);
https://github.com/facebook/rocksdb/issues/12 0x00007fac44e0f047 in std::default_delete<rocksdb::ColumnFamilySet>::operator() (this=0x7b5800000940, __ptr=0x7b5400000000) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85              delete __ptr;
https://github.com/facebook/rocksdb/issues/13 std::__uniq_ptr_impl<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182               _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/14 std::unique_ptr<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456             _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/15 rocksdb::VersionSet::~VersionSet (this=this@entry=0x7b5800000900) at db/version_set.cc:5081
5081      column_family_set_.reset();
https://github.com/facebook/rocksdb/issues/16 0x00007fac44e0f97a in rocksdb::VersionSet::~VersionSet (this=0x7b5800000900) at db/version_set.cc:5078
5078    VersionSet::~VersionSet() {
https://github.com/facebook/rocksdb/issues/17 0x00007fac44bf0b2f in std::default_delete<rocksdb::VersionSet>::operator() (this=0x7b8c00000068, __ptr=0x7b5800000900) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85              delete __ptr;
https://github.com/facebook/rocksdb/issues/18 std::__uniq_ptr_impl<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182               _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/19 std::unique_ptr<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456             _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/20 rocksdb::DBImpl::CloseHelper (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:676
676       versions_.reset();
https://github.com/facebook/rocksdb/issues/21 0x00007fac44bf1346 in rocksdb::DBImpl::CloseImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:720
720     Status DBImpl::CloseImpl() { return CloseHelper(); }
https://github.com/facebook/rocksdb/issues/22 rocksdb::DBImpl::~DBImpl (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:738
738       closing_status_ = CloseImpl();
https://github.com/facebook/rocksdb/issues/23 0x00007fac44bf2bba in rocksdb::DBImpl::~DBImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:722
722     DBImpl::~DBImpl() {
https://github.com/facebook/rocksdb/issues/24 0x00007fac455444d4 in rocksdb::DBTestBase::Close (this=this@entry=0x7b6c00000000) at db/db_test_util.cc:678
678       delete db_;
https://github.com/facebook/rocksdb/issues/25 0x00007fac455455fb in rocksdb::DBTestBase::TryReopen (this=this@entry=0x7b6c00000000, options=...) at db/db_test_util.cc:707
707       Close();
https://github.com/facebook/rocksdb/issues/26 0x00007fac45543459 in rocksdb::DBTestBase::Reopen (this=0x7ffed74b79a0, options=...) at db/db_test_util.cc:670
670       ASSERT_OK(TryReopen(options));
https://github.com/facebook/rocksdb/issues/27 0x00000000004f2522 in rocksdb::DBErrorHandlingFSTest_AutoRecoverFlushError_Test::TestBody (this=this@entry=0x7b6c00000000) at db/error_handler_fs_test.cc:1224
1224      Reopen(options);
```

Reviewed By: jowlyzhang

Differential Revision: D49579701

Pulled By: cbi42

fbshipit-source-id: 3fc8325e6dde7e7faa8bcad95060cb4e26eda638
2023-09-25 09:34:39 -07:00

128 lines
4.3 KiB
C++

// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "monitoring/instrumented_mutex.h"
#include "options/db_options.h"
#include "rocksdb/io_status.h"
#include "rocksdb/listener.h"
#include "rocksdb/status.h"
namespace ROCKSDB_NAMESPACE {
class DBImpl;
// This structure is used to store the DB recovery context. The context is
// the information that related to the recover actions. For example, it contains
// FlushReason, which tells the flush job why this flush is called.
struct DBRecoverContext {
FlushReason flush_reason;
bool flush_after_recovery;
DBRecoverContext()
: flush_reason(FlushReason::kErrorRecovery),
flush_after_recovery(false) {}
DBRecoverContext(FlushReason reason)
: flush_reason(reason), flush_after_recovery(false) {}
};
class ErrorHandler {
public:
ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
InstrumentedMutex* db_mutex)
: db_(db),
db_options_(db_options),
cv_(db_mutex),
end_recovery_(false),
recovery_thread_(nullptr),
db_mutex_(db_mutex),
auto_recovery_(false),
recovery_in_prog_(false),
soft_error_no_bg_work_(false),
is_db_stopped_(false),
bg_error_stats_(db_options.statistics) {
// Clear the checked flag for uninitialized errors
bg_error_.PermitUncheckedError();
recovery_error_.PermitUncheckedError();
recovery_io_error_.PermitUncheckedError();
}
void EnableAutoRecovery() { auto_recovery_ = true; }
Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
Status::Code code, Status::SubCode subcode);
const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
Status GetBGError() const { return bg_error_; }
Status GetRecoveryError() const { return recovery_error_; }
Status ClearBGError();
bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
bool IsBGWorkStopped() {
assert(db_mutex_);
db_mutex_->AssertHeld();
return !bg_error_.ok() &&
(bg_error_.severity() >= Status::Severity::kHardError ||
!auto_recovery_ || soft_error_no_bg_work_);
}
bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
bool IsRecoveryInProgress() { return recovery_in_prog_; }
Status RecoverFromBGError(bool is_manual = false);
void CancelErrorRecovery();
void EndAutoRecovery();
private:
DBImpl* db_;
const ImmutableDBOptions& db_options_;
Status bg_error_;
// A separate Status variable used to record any errors during the
// recovery process from hard errors
Status recovery_error_;
// A separate IO Status variable used to record any IO errors during
// the recovery process. At the same time, recovery_error_ is also set.
IOStatus recovery_io_error_;
// The condition variable used with db_mutex during auto resume for time
// wait.
InstrumentedCondVar cv_;
bool end_recovery_;
std::unique_ptr<port::Thread> recovery_thread_;
InstrumentedMutex* db_mutex_;
// A flag indicating whether automatic recovery from errors is enabled
bool auto_recovery_;
bool recovery_in_prog_;
// A flag to indicate that for the soft error, we should not allow any
// background work except the work is from recovery.
bool soft_error_no_bg_work_;
// Used to store the context for recover, such as flush reason.
DBRecoverContext recover_context_;
std::atomic<bool> is_db_stopped_;
// The pointer of DB statistics.
std::shared_ptr<Statistics> bg_error_stats_;
const Status& HandleKnownErrors(const Status& bg_err,
BackgroundErrorReason reason);
Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
void RecoverFromNoSpace();
const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
void RecoverFromRetryableBGIOError();
// First, if it is in recovery and the recovery_error is ok. Set the
// recovery_error_ to bg_err. Second, if the severity is higher than the
// current bg_error_, overwrite it.
void CheckAndSetRecoveryAndBGError(const Status& bg_err);
};
} // namespace ROCKSDB_NAMESPACE