rocksdb/db/error_handler.cc
Changyu Bi 1c871a4d86 Only flush after recovery for retryable IOError (#11880)
Summary:
https://github.com/facebook/rocksdb/issues/11872 causes a unit test to start failing with the error message below. The cause is that the additional call to `FlushAllColumnFamilies()` in `DBImpl::ResumeImpl()` can run while DB is closing. More detailed explanation: there are two places where we call `ResumeImpl()`:

1. in `ErrorHandler::RecoverFromBGError`, for manual resume or recovery from errors like OutOfSpace through sst file manager, and
2. in `Errorhandler::RecoverFromRetryableBGIOError`, for error recovery from errors like flush failure due to retryable IOError. This is tracked by `ErrorHandler::recovery_thread_`.

Here is how DB close waits for error recovery: 49da91ec09/db/db_impl/db_impl.cc (L540-L543)

`CancelErrorRecovery()` waits until `recovery_thread_` finishes and `IsRecoveryInProgress()` checks the `recovery_in_prog_` flag. The additional call to `FlushAllColumnFamilies()` in `ResumeImpl()` happens after it clears bg error and the `recovery_in_prog_` flag: 49da91ec09/db/db_impl/db_impl.cc (L436-L463). So if `ResumeImpl()` is called in `RecoverFromBGError()`, we can have a thread running `FlushAllColumnFamilies()` while DB is closing and thought that recovery is done.

The fix is to only do the additional call to `FlushAllColumnFamilies()` when doing error recovery through `Errorhandler::RecoverFromRetryableBGIOError` by setting flags in `DBRecoverContext`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11880

Test Plan:
`gtest-parallel --repeat=100 --workers=4 ./error_handler_fs_test --gtest_filter="*AutoRecoverFlushError*"` reproduces the error pretty reliably.

```[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBErrorHandlingFSTest
[ RUN      ] DBErrorHandlingFSTest.AutoRecoverFlushError
error_handler_fs_test: db/column_family.cc:1618: rocksdb::ColumnFamilySet::~ColumnFamilySet(): Assertion `last_ref' failed.
Received signal 6 (Aborted)
...
https://github.com/facebook/rocksdb/issues/10 0x00007fac4409efd6 in __GI___assert_fail (assertion=0x7fac452c0afa "last_ref", file=0x7fac452c9fb5 "db/column_family.cc", line=1618, function=0x7fac452cb950 "rocksdb::ColumnFamilySet::~ColumnFamilySet()") at assert.c:101
101     in assert.c
https://github.com/facebook/rocksdb/issues/11 0x00007fac44b5324f in rocksdb::ColumnFamilySet::~ColumnFamilySet (this=0x7b5400000000) at db/column_family.cc:1618
1618        assert(last_ref);
https://github.com/facebook/rocksdb/issues/12 0x00007fac44e0f047 in std::default_delete<rocksdb::ColumnFamilySet>::operator() (this=0x7b5800000940, __ptr=0x7b5400000000) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85              delete __ptr;
https://github.com/facebook/rocksdb/issues/13 std::__uniq_ptr_impl<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182               _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/14 std::unique_ptr<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456             _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/15 rocksdb::VersionSet::~VersionSet (this=this@entry=0x7b5800000900) at db/version_set.cc:5081
5081      column_family_set_.reset();
https://github.com/facebook/rocksdb/issues/16 0x00007fac44e0f97a in rocksdb::VersionSet::~VersionSet (this=0x7b5800000900) at db/version_set.cc:5078
5078    VersionSet::~VersionSet() {
https://github.com/facebook/rocksdb/issues/17 0x00007fac44bf0b2f in std::default_delete<rocksdb::VersionSet>::operator() (this=0x7b8c00000068, __ptr=0x7b5800000900) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85              delete __ptr;
https://github.com/facebook/rocksdb/issues/18 std::__uniq_ptr_impl<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182               _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/19 std::unique_ptr<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456             _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/20 rocksdb::DBImpl::CloseHelper (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:676
676       versions_.reset();
https://github.com/facebook/rocksdb/issues/21 0x00007fac44bf1346 in rocksdb::DBImpl::CloseImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:720
720     Status DBImpl::CloseImpl() { return CloseHelper(); }
https://github.com/facebook/rocksdb/issues/22 rocksdb::DBImpl::~DBImpl (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:738
738       closing_status_ = CloseImpl();
https://github.com/facebook/rocksdb/issues/23 0x00007fac44bf2bba in rocksdb::DBImpl::~DBImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:722
722     DBImpl::~DBImpl() {
https://github.com/facebook/rocksdb/issues/24 0x00007fac455444d4 in rocksdb::DBTestBase::Close (this=this@entry=0x7b6c00000000) at db/db_test_util.cc:678
678       delete db_;
https://github.com/facebook/rocksdb/issues/25 0x00007fac455455fb in rocksdb::DBTestBase::TryReopen (this=this@entry=0x7b6c00000000, options=...) at db/db_test_util.cc:707
707       Close();
https://github.com/facebook/rocksdb/issues/26 0x00007fac45543459 in rocksdb::DBTestBase::Reopen (this=0x7ffed74b79a0, options=...) at db/db_test_util.cc:670
670       ASSERT_OK(TryReopen(options));
https://github.com/facebook/rocksdb/issues/27 0x00000000004f2522 in rocksdb::DBErrorHandlingFSTest_AutoRecoverFlushError_Test::TestBody (this=this@entry=0x7b6c00000000) at db/error_handler_fs_test.cc:1224
1224      Reopen(options);
```

Reviewed By: jowlyzhang

Differential Revision: D49579701

Pulled By: cbi42

fbshipit-source-id: 3fc8325e6dde7e7faa8bcad95060cb4e26eda638
2023-09-25 09:34:39 -07:00

811 lines
34 KiB
C++

// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "db/error_handler.h"
#include "db/db_impl/db_impl.h"
#include "db/event_helpers.h"
#include "file/sst_file_manager_impl.h"
#include "logging/logging.h"
#include "port/lang.h"
namespace ROCKSDB_NAMESPACE {
// Maps to help decide the severity of an error based on the
// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
// is set or not. There are 3 maps, going from most specific to least specific
// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and
// paranoid_checks). The less specific map serves as a catch all in case we miss
// a specific error code or subcode.
std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
Status::Severity>
ErrorSeverityMap = {
// Errors during BG compaction
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kSoftError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, Status::SubCode::kSpaceLimit,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
// Errors during BG flush
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
Status::SubCode::kNoSpace, true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
Status::SubCode::kNoSpace, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
Status::SubCode::kSpaceLimit, true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
Status::SubCode::kIOFenced, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
Status::SubCode::kIOFenced, false),
Status::Severity::kFatalError},
// Errors during Write
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
// Errors during MANIFEST write
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
// Errors during BG flush with WAL disabled
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kSpaceLimit,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
// Errors during MANIFEST write when WAL is disabled
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
};
std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
Status::Severity>
DefaultErrorSeverityMap = {
// Errors during BG compaction
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kCorruption, true),
Status::Severity::kUnrecoverableError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kCorruption, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kCompaction,
Status::Code::kIOError, false),
Status::Severity::kNoError},
// Errors during BG flush
{std::make_tuple(BackgroundErrorReason::kFlush,
Status::Code::kCorruption, true),
Status::Severity::kUnrecoverableError},
{std::make_tuple(BackgroundErrorReason::kFlush,
Status::Code::kCorruption, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
false),
Status::Severity::kNoError},
// Errors during Write
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kCorruption, true),
Status::Severity::kUnrecoverableError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kCorruption, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
Status::Code::kIOError, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, false),
Status::Severity::kFatalError},
// Errors during BG flush with WAL disabled
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kCorruption, true),
Status::Severity::kUnrecoverableError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kCorruption, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
Status::Code::kIOError, false),
Status::Severity::kFatalError},
};
std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
DefaultReasonMap = {
// Errors during BG compaction
{std::make_tuple(BackgroundErrorReason::kCompaction, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kCompaction, false),
Status::Severity::kNoError},
// Errors during BG flush
{std::make_tuple(BackgroundErrorReason::kFlush, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlush, false),
Status::Severity::kNoError},
// Errors during Write
{std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
Status::Severity::kFatalError},
// Errors during Memtable update
{std::make_tuple(BackgroundErrorReason::kMemTable, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kMemTable, false),
Status::Severity::kFatalError},
};
void ErrorHandler::CancelErrorRecovery() {
db_mutex_->AssertHeld();
// We'll release the lock before calling sfm, so make sure no new
// recovery gets scheduled at that point
auto_recovery_ = false;
SstFileManagerImpl* sfm =
reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
if (sfm) {
// This may or may not cancel a pending recovery
db_mutex_->Unlock();
bool cancelled = sfm->CancelErrorRecovery(this);
db_mutex_->Lock();
if (cancelled) {
recovery_in_prog_ = false;
}
}
// If auto recovery is also runing to resume from the retryable error,
// we should wait and end the auto recovery.
EndAutoRecovery();
}
STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
// This is the main function for looking at an error during a background
// operation and deciding the severity, and error recovery strategy. The high
// level algorithm is as follows -
// 1. Classify the severity of the error based on the ErrorSeverityMap,
// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
// 2. Call a Status code specific override function to adjust the severity
// if needed. The reason for this is our ability to recover may depend on
// the exact options enabled in DBOptions
// 3. Determine if auto recovery is possible. A listener notification callback
// is called, which can disable the auto recovery even if we decide its
// feasible
// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
// the actual recovery. If no sst file manager is specified in DBOptions,
// a default one is allocated during DB::Open(), so there will always be
// one.
// This can also get called as part of a recovery operation. In that case, we
// also track the error separately in recovery_error_ so we can tell in the
// end whether recovery succeeded or not
const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
BackgroundErrorReason reason) {
db_mutex_->AssertHeld();
if (bg_err.ok()) {
return kOkStatus;
}
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED);
}
ROCKS_LOG_INFO(db_options_.info_log,
"ErrorHandler: Set regular background error\n");
bool paranoid = db_options_.paranoid_checks;
Status::Severity sev = Status::Severity::kFatalError;
Status new_bg_err;
DBRecoverContext context;
bool found = false;
{
auto entry = ErrorSeverityMap.find(
std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
if (entry != ErrorSeverityMap.end()) {
sev = entry->second;
found = true;
}
}
if (!found) {
auto entry = DefaultErrorSeverityMap.find(
std::make_tuple(reason, bg_err.code(), paranoid));
if (entry != DefaultErrorSeverityMap.end()) {
sev = entry->second;
found = true;
}
}
if (!found) {
auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
if (entry != DefaultReasonMap.end()) {
sev = entry->second;
}
}
new_bg_err = Status(bg_err, sev);
// Check if recovery is currently in progress. If it is, we will save this
// error so we can check it at the end to see if recovery succeeded or not
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = new_bg_err;
}
bool auto_recovery = auto_recovery_;
if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
auto_recovery = false;
}
// Allow some error specific overrides
if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
}
if (!new_bg_err.ok()) {
Status s = new_bg_err;
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
db_mutex_, &auto_recovery);
if (!s.ok() && (s.severity() > bg_error_.severity())) {
bg_error_ = s;
} else {
// This error is less severe than previously encountered error. Don't
// take any further action
return bg_error_;
}
}
recover_context_ = context;
if (auto_recovery) {
recovery_in_prog_ = true;
// Kick-off error specific recovery
if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
RecoverFromNoSpace();
}
}
if (bg_error_.severity() >= Status::Severity::kHardError) {
is_db_stopped_.store(true, std::memory_order_release);
}
return bg_error_;
}
// This is the main function for looking at IO related error during the
// background operations. The main logic is:
// 1) File scope IO error is treated as retryable IO error in the write
// path. In RocksDB, If a file has write IO error and it is at file scope,
// RocksDB never write to the same file again. RocksDB will create a new
// file and rewrite the whole content. Thus, it is retryable.
// 1) if the error is caused by data loss, the error is mapped to
// unrecoverable error. Application/user must take action to handle
// this situation (File scope case is excluded).
// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error,
// or its retryable flag is set and not a data loss error), auto resume
// will be called and the auto resume can be controlled by resume count
// and resume interval options. There are three sub-cases:
// a) if the error happens during compaction, it is mapped to a soft error.
// the compaction thread will reschedule a new compaction.
// b) if the error happens during flush and also WAL is empty, it is mapped
// to a soft error. Note that, it includes the case that IO error happens
// in SST or manifest write during flush.
// c) all other errors are mapped to hard error.
// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason
// reason) will be called to handle other error cases.
const Status& ErrorHandler::SetBGError(const Status& bg_status,
BackgroundErrorReason reason) {
db_mutex_->AssertHeld();
Status tmp_status = bg_status;
IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
if (bg_io_err.ok()) {
return kOkStatus;
}
ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
bg_io_err.ToString().c_str());
if (recovery_in_prog_ && recovery_io_error_.ok()) {
recovery_io_error_ = bg_io_err;
}
if (BackgroundErrorReason::kManifestWrite == reason ||
BackgroundErrorReason::kManifestWriteNoWAL == reason) {
// Always returns ok
ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions");
db_->DisableFileDeletionsWithLock().PermitUncheckedError();
}
Status new_bg_io_err = bg_io_err;
DBRecoverContext context;
if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
bg_io_err.GetDataLoss()) {
// First, data loss (non file scope) is treated as unrecoverable error. So
// it can directly overwrite any existing bg_error_.
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
CheckAndSetRecoveryAndBGError(bg_err);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Set background IO error as unrecoverable error\n");
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&bg_err, db_mutex_, &auto_recovery);
recover_context_ = context;
return bg_error_;
} else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
(bg_io_err.GetScope() ==
IOStatus::IOErrorScope::kIOErrorScopeFile ||
bg_io_err.GetRetryable())) {
// Second, check if the error is a retryable IO error (file scope IO error
// is also treated as retryable IO error in RocksDB write path). if it is
// retryable error and its severity is higher than bg_error_, overwrite the
// bg_error_ with new error. In current stage, for retryable IO error of
// compaction, treat it as soft error. In other cases, treat the retryable
// IO error as hard error. Note that, all the NoSpace error should be
// handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter
// it is retryable or file scope, this logic will be bypassed.
bool auto_recovery = false;
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&new_bg_io_err, db_mutex_,
&auto_recovery);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED);
}
ROCKS_LOG_INFO(db_options_.info_log,
"ErrorHandler: Set background retryable IO error\n");
if (BackgroundErrorReason::kCompaction == reason) {
// We map the retryable IO error during compaction to soft error. Since
// compaction can reschedule by itself. We will not set the BG error in
// this case
// TODO: a better way to set or clean the retryable IO error which
// happens during compaction SST file write.
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Compaction will schedule by itself to resume\n");
// Not used in this code path.
new_bg_io_err.PermitUncheckedError();
return bg_error_;
} else if (BackgroundErrorReason::kFlushNoWAL == reason ||
BackgroundErrorReason::kManifestWriteNoWAL == reason) {
// When the BG Retryable IO error reason is flush without WAL,
// We map it to a soft error. At the same time, all the background work
// should be stopped except the BG work from recovery. Therefore, we
// set the soft_error_no_bg_work_ to true. At the same time, since DB
// continues to receive writes when BG error is soft error, to avoid
// to many small memtable being generated during auto resume, the flush
// reason is set to kErrorRecoveryRetryFlush.
Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
CheckAndSetRecoveryAndBGError(bg_err);
soft_error_no_bg_work_ = true;
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
} else {
Status bg_err(new_bg_io_err, Status::Severity::kHardError);
CheckAndSetRecoveryAndBGError(bg_err);
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
}
} else {
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED);
}
// HandleKnownErrors() will use recovery_error_, so ignore
// recovery_io_error_.
// TODO: Do some refactoring and use only one recovery_error_
recovery_io_error_.PermitUncheckedError();
return HandleKnownErrors(new_bg_io_err, reason);
}
}
Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
bool* auto_recovery) {
if (bg_error.severity() >= Status::Severity::kFatalError) {
return bg_error;
}
if (db_options_.sst_file_manager.get() == nullptr) {
// We rely on SFM to poll for enough disk space and recover
*auto_recovery = false;
return bg_error;
}
if (db_options_.allow_2pc &&
(bg_error.severity() <= Status::Severity::kSoftError)) {
// Don't know how to recover, as the contents of the current WAL file may
// be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
// we can just flush the memtable and discard the log
*auto_recovery = false;
return Status(bg_error, Status::Severity::kFatalError);
}
{
uint64_t free_space;
if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
&free_space) == Status::NotSupported()) {
*auto_recovery = false;
}
}
return bg_error;
}
void ErrorHandler::RecoverFromNoSpace() {
SstFileManagerImpl* sfm =
reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
// Inform SFM of the error, so it can kick-off the recovery
if (sfm) {
sfm->StartErrorRecovery(this, bg_error_);
}
}
Status ErrorHandler::ClearBGError() {
db_mutex_->AssertHeld();
// Signal that recovery succeeded
if (recovery_error_.ok()) {
Status old_bg_error = bg_error_;
// old_bg_error is only for notifying listeners, so may not be checked
old_bg_error.PermitUncheckedError();
// Clear and check the recovery IO and BG error
bg_error_ = Status::OK();
recovery_io_error_ = IOStatus::OK();
bg_error_.PermitUncheckedError();
recovery_io_error_.PermitUncheckedError();
recovery_in_prog_ = false;
soft_error_no_bg_work_ = false;
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
bg_error_, db_mutex_);
}
return recovery_error_;
}
Status ErrorHandler::RecoverFromBGError(bool is_manual) {
InstrumentedMutexLock l(db_mutex_);
bool no_bg_work_original_flag = soft_error_no_bg_work_;
if (is_manual) {
// If its a manual recovery and there's a background recovery in progress
// return busy status
if (recovery_in_prog_) {
return Status::Busy();
}
recovery_in_prog_ = true;
// In manual resume, we allow the bg work to run. If it is a auto resume,
// the bg work should follow this tag.
soft_error_no_bg_work_ = false;
// In manual resume, if the bg error is a soft error and also requires
// no bg work, the error must be recovered by call the flush with
// flush reason: kErrorRecoveryRetryFlush. In other case, the flush
// reason is set to kErrorRecovery.
if (no_bg_work_original_flag) {
recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
} else {
recover_context_.flush_reason = FlushReason::kErrorRecovery;
}
}
if (bg_error_.severity() == Status::Severity::kSoftError &&
recover_context_.flush_reason == FlushReason::kErrorRecovery) {
// Simply clear the background error and return
recovery_error_ = Status::OK();
return ClearBGError();
}
// Reset recovery_error_. We will use this to record any errors that happen
// during the recovery process. While recovering, the only operations that
// can generate background errors should be the flush operations
recovery_error_ = Status::OK();
recovery_error_.PermitUncheckedError();
Status s = db_->ResumeImpl(recover_context_);
if (s.ok()) {
soft_error_no_bg_work_ = false;
} else {
soft_error_no_bg_work_ = no_bg_work_original_flag;
}
// For manual recover, shutdown, and fatal error cases, set
// recovery_in_prog_ to false. For automatic background recovery, leave it
// as is regardless of success or failure as it will be retried
if (is_manual || s.IsShutdownInProgress() ||
bg_error_.severity() >= Status::Severity::kFatalError) {
recovery_in_prog_ = false;
}
return s;
}
const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
const IOStatus& io_error) {
db_mutex_->AssertHeld();
if (bg_error_.ok()) {
return bg_error_;
} else if (io_error.ok()) {
return kOkStatus;
} else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
// Auto resume BG error is not enabled, directly return bg_error_.
return bg_error_;
}
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
}
ROCKS_LOG_INFO(
db_options_.info_log,
"ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
if (recovery_thread_) {
// In this case, if recovery_in_prog_ is false, current thread should
// wait the previous recover thread to finish and create a new thread
// to recover from the bg error.
db_mutex_->Unlock();
recovery_thread_->join();
db_mutex_->Lock();
}
recovery_in_prog_ = true;
TEST_SYNC_POINT("StartRecoverFromRetryableBGIOError::in_progress");
recovery_thread_.reset(
new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
if (recovery_io_error_.ok() && recovery_error_.ok()) {
return recovery_error_;
} else {
return bg_error_;
}
}
// Automatic recover from Retryable BG IO error. Must be called after db
// mutex is released.
void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2");
InstrumentedMutexLock l(db_mutex_);
if (end_recovery_) {
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
Status::ShutdownInProgress(),
db_mutex_);
return;
}
DBRecoverContext context = recover_context_;
context.flush_after_recovery = true;
int resume_count = db_options_.max_bgerror_resume_count;
uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
uint64_t retry_count = 0;
// Recover from the retryable error. Create a separate thread to do it.
while (resume_count > 0) {
if (end_recovery_) {
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
Status::ShutdownInProgress(),
db_mutex_);
return;
}
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
recovery_io_error_ = IOStatus::OK();
recovery_error_ = Status::OK();
retry_count++;
Status s = db_->ResumeImpl(context);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
}
if (s.IsShutdownInProgress() ||
bg_error_.severity() >= Status::Severity::kFatalError) {
// If DB shutdown in progress or the error severity is higher than
// Hard Error, stop auto resume and returns.
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
bg_error_, db_mutex_);
return;
}
if (!recovery_io_error_.ok() &&
recovery_error_.severity() <= Status::Severity::kHardError &&
recovery_io_error_.GetRetryable()) {
// If new BG IO error happens during auto recovery and it is retryable
// and its severity is Hard Error or lower, the auto resmue sleep for
// a period of time and redo auto resume if it is allowed.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
cv_.TimedWait(wait_until);
} else {
// There are three possibility: 1) recover_io_error is set during resume
// and the error is not retryable, 2) recover is successful, 3) other
// error happens during resume and cannot be resumed here.
if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) {
// recover from the retryable IO error and no other BG errors. Clean
// the bg_error and notify user.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
Status old_bg_error = bg_error_;
is_db_stopped_.store(false, std::memory_order_release);
bg_error_ = Status::OK();
bg_error_.PermitUncheckedError();
EventHelpers::NotifyOnErrorRecoveryEnd(
db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
recovery_in_prog_ = false;
if (soft_error_no_bg_work_) {
soft_error_no_bg_work_ = false;
}
return;
} else {
// In this case: 1) recovery_io_error is more serious or not retryable
// 2) other Non IO recovery_error happens. The auto recovery stops.
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
EventHelpers::NotifyOnErrorRecoveryEnd(
db_options_.listeners, bg_error_,
!recovery_io_error_.ok()
? recovery_io_error_
: (!recovery_error_.ok() ? recovery_error_ : s),
db_mutex_);
return;
}
}
resume_count--;
}
recovery_in_prog_ = false;
EventHelpers::NotifyOnErrorRecoveryEnd(
db_options_.listeners, bg_error_,
Status::Aborted("Exceeded resume retry count"), db_mutex_);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
}
return;
}
void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
if (bg_error_.severity() >= Status::Severity::kHardError) {
is_db_stopped_.store(true, std::memory_order_release);
}
return;
}
void ErrorHandler::EndAutoRecovery() {
db_mutex_->AssertHeld();
if (!end_recovery_) {
end_recovery_ = true;
}
cv_.SignalAll();
db_mutex_->Unlock();
if (recovery_thread_) {
recovery_thread_->join();
}
db_mutex_->Lock();
return;
}
} // namespace ROCKSDB_NAMESPACE