2018-06-28 19:23:57 +00:00
|
|
|
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
#include "db/error_handler.h"
|
2021-03-18 05:36:42 +00:00
|
|
|
|
2019-05-31 18:52:59 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2019-05-31 22:21:36 +00:00
|
|
|
#include "db/event_helpers.h"
|
2019-05-30 03:44:08 +00:00
|
|
|
#include "file/sst_file_manager_impl.h"
|
2021-03-18 05:36:42 +00:00
|
|
|
#include "logging/logging.h"
|
2022-02-08 02:20:51 +00:00
|
|
|
#include "port/lang.h"
|
2018-06-28 19:23:57 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2018-06-28 19:23:57 +00:00
|
|
|
|
|
|
|
// Maps to help decide the severity of an error based on the
|
|
|
|
// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
|
|
|
|
// is set or not. There are 3 maps, going from most specific to least specific
|
|
|
|
// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and
|
|
|
|
// paranoid_checks). The less specific map serves as a catch all in case we miss
|
|
|
|
// a specific error code or subcode.
|
|
|
|
std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
|
|
|
|
Status::Severity>
|
|
|
|
ErrorSeverityMap = {
|
|
|
|
// Errors during BG compaction
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
true),
|
|
|
|
Status::Severity::kSoftError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kSpaceLimit,
|
|
|
|
true),
|
|
|
|
Status::Severity::kHardError},
|
2020-09-14 23:03:32 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
false),
|
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
// Errors during BG flush
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
Status::SubCode::kNoSpace, true),
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
Status::Severity::kHardError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
Status::SubCode::kNoSpace, false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
Status::SubCode::kSpaceLimit, true),
|
|
|
|
Status::Severity::kHardError},
|
2020-09-14 23:03:32 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
Status::SubCode::kIOFenced, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
Status::SubCode::kIOFenced, false),
|
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
// Errors during Write
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
true),
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
Status::Severity::kHardError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
false),
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
Status::Severity::kHardError},
|
2020-09-14 23:03:32 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
false),
|
|
|
|
Status::Severity::kFatalError},
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
// Errors during MANIFEST write
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
true),
|
|
|
|
Status::Severity::kHardError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
false),
|
|
|
|
Status::Severity::kHardError},
|
2020-09-14 23:03:32 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
false),
|
|
|
|
Status::Severity::kFatalError},
|
2020-09-18 03:22:35 +00:00
|
|
|
// Errors during BG flush with WAL disabled
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
true),
|
|
|
|
Status::Severity::kHardError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kSpaceLimit,
|
|
|
|
true),
|
|
|
|
Status::Severity::kHardError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
false),
|
|
|
|
Status::Severity::kFatalError},
|
2020-12-03 02:22:05 +00:00
|
|
|
// Errors during MANIFEST write when WAL is disabled
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
true),
|
|
|
|
Status::Severity::kHardError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kNoSpace,
|
|
|
|
false),
|
|
|
|
Status::Severity::kHardError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, Status::SubCode::kIOFenced,
|
|
|
|
false),
|
|
|
|
Status::Severity::kFatalError},
|
2020-09-18 03:22:35 +00:00
|
|
|
|
2018-06-28 19:23:57 +00:00
|
|
|
};
|
|
|
|
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
|
|
|
|
Status::Severity>
|
2018-06-28 19:23:57 +00:00
|
|
|
DefaultErrorSeverityMap = {
|
|
|
|
// Errors during BG compaction
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kCorruption, true),
|
|
|
|
Status::Severity::kUnrecoverableError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kCorruption, false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction,
|
|
|
|
Status::Code::kIOError, false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
// Errors during BG flush
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush,
|
|
|
|
Status::Code::kCorruption, true),
|
|
|
|
Status::Severity::kUnrecoverableError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush,
|
|
|
|
Status::Code::kCorruption, false),
|
|
|
|
Status::Severity::kNoError},
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
true),
|
2018-06-28 19:23:57 +00:00
|
|
|
Status::Severity::kFatalError},
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
|
|
|
|
false),
|
2018-06-28 19:23:57 +00:00
|
|
|
Status::Severity::kNoError},
|
|
|
|
// Errors during Write
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kCorruption, true),
|
|
|
|
Status::Severity::kUnrecoverableError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kCorruption, false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback,
|
|
|
|
Status::Code::kIOError, false),
|
|
|
|
Status::Severity::kNoError},
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
|
|
|
|
Status::Code::kIOError, false),
|
|
|
|
Status::Severity::kFatalError},
|
2020-09-18 03:22:35 +00:00
|
|
|
// Errors during BG flush with WAL disabled
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kCorruption, true),
|
|
|
|
Status::Severity::kUnrecoverableError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kCorruption, false),
|
|
|
|
Status::Severity::kNoError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
|
|
|
|
Status::Code::kIOError, false),
|
|
|
|
Status::Severity::kNoError},
|
2020-12-03 02:22:05 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, true),
|
|
|
|
Status::Severity::kFatalError},
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
|
|
|
|
Status::Code::kIOError, false),
|
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
|
|
|
|
DefaultReasonMap = {
|
|
|
|
// Errors during BG compaction
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction, true),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kCompaction, false),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kNoError},
|
2018-06-28 19:23:57 +00:00
|
|
|
// Errors during BG flush
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, true),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kFlush, false),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kNoError},
|
2018-06-28 19:23:57 +00:00
|
|
|
// Errors during Write
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
// Errors during Memtable update
|
|
|
|
{std::make_tuple(BackgroundErrorReason::kMemTable, true),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
{std::make_tuple(BackgroundErrorReason::kMemTable, false),
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status::Severity::kFatalError},
|
2018-06-28 19:23:57 +00:00
|
|
|
};
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
void ErrorHandler::CancelErrorRecovery() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
|
|
|
|
// We'll release the lock before calling sfm, so make sure no new
|
|
|
|
// recovery gets scheduled at that point
|
|
|
|
auto_recovery_ = false;
|
2022-11-02 21:34:24 +00:00
|
|
|
SstFileManagerImpl* sfm =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
if (sfm) {
|
|
|
|
// This may or may not cancel a pending recovery
|
|
|
|
db_mutex_->Unlock();
|
|
|
|
bool cancelled = sfm->CancelErrorRecovery(this);
|
|
|
|
db_mutex_->Lock();
|
|
|
|
if (cancelled) {
|
|
|
|
recovery_in_prog_ = false;
|
|
|
|
}
|
|
|
|
}
|
2020-07-15 18:02:44 +00:00
|
|
|
|
|
|
|
// If auto recovery is also runing to resume from the retryable error,
|
|
|
|
// we should wait and end the auto recovery.
|
|
|
|
EndAutoRecovery();
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
}
|
|
|
|
|
2022-02-08 02:20:51 +00:00
|
|
|
STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
// This is the main function for looking at an error during a background
|
|
|
|
// operation and deciding the severity, and error recovery strategy. The high
|
|
|
|
// level algorithm is as follows -
|
|
|
|
// 1. Classify the severity of the error based on the ErrorSeverityMap,
|
|
|
|
// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
|
|
|
|
// 2. Call a Status code specific override function to adjust the severity
|
|
|
|
// if needed. The reason for this is our ability to recover may depend on
|
|
|
|
// the exact options enabled in DBOptions
|
|
|
|
// 3. Determine if auto recovery is possible. A listener notification callback
|
|
|
|
// is called, which can disable the auto recovery even if we decide its
|
|
|
|
// feasible
|
|
|
|
// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
|
|
|
|
// the actual recovery. If no sst file manager is specified in DBOptions,
|
|
|
|
// a default one is allocated during DB::Open(), so there will always be
|
|
|
|
// one.
|
|
|
|
// This can also get called as part of a recovery operation. In that case, we
|
2018-10-05 03:44:43 +00:00
|
|
|
// also track the error separately in recovery_error_ so we can tell in the
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
// end whether recovery succeeded or not
|
2022-03-15 21:45:34 +00:00
|
|
|
const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
|
|
|
|
BackgroundErrorReason reason) {
|
2018-06-28 19:23:57 +00:00
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
if (bg_err.ok()) {
|
2022-02-08 02:20:51 +00:00
|
|
|
return kOkStatus;
|
2018-06-28 19:23:57 +00:00
|
|
|
}
|
|
|
|
|
2021-03-18 05:36:42 +00:00
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"ErrorHandler: Set regular background error\n");
|
|
|
|
|
2018-06-28 19:23:57 +00:00
|
|
|
bool paranoid = db_options_.paranoid_checks;
|
|
|
|
Status::Severity sev = Status::Severity::kFatalError;
|
|
|
|
Status new_bg_err;
|
2020-09-18 03:22:35 +00:00
|
|
|
DBRecoverContext context;
|
2018-06-28 19:23:57 +00:00
|
|
|
bool found = false;
|
|
|
|
|
|
|
|
{
|
2022-11-02 21:34:24 +00:00
|
|
|
auto entry = ErrorSeverityMap.find(
|
|
|
|
std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
|
2018-06-28 19:23:57 +00:00
|
|
|
if (entry != ErrorSeverityMap.end()) {
|
|
|
|
sev = entry->second;
|
|
|
|
found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!found) {
|
2022-11-02 21:34:24 +00:00
|
|
|
auto entry = DefaultErrorSeverityMap.find(
|
|
|
|
std::make_tuple(reason, bg_err.code(), paranoid));
|
2018-06-28 19:23:57 +00:00
|
|
|
if (entry != DefaultErrorSeverityMap.end()) {
|
|
|
|
sev = entry->second;
|
|
|
|
found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!found) {
|
|
|
|
auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
|
|
|
|
if (entry != DefaultReasonMap.end()) {
|
|
|
|
sev = entry->second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
new_bg_err = Status(bg_err, sev);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
|
2020-01-30 18:53:46 +00:00
|
|
|
// Check if recovery is currently in progress. If it is, we will save this
|
|
|
|
// error so we can check it at the end to see if recovery succeeded or not
|
|
|
|
if (recovery_in_prog_ && recovery_error_.ok()) {
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = status_to_io_status(Status(new_bg_err));
|
2020-01-30 18:53:46 +00:00
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
bool auto_recovery = auto_recovery_;
|
|
|
|
if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
|
|
|
|
auto_recovery = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allow some error specific overrides
|
2021-07-21 01:08:55 +00:00
|
|
|
if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
|
|
|
|
new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
|
|
|
|
}
|
|
|
|
|
2018-06-28 19:23:57 +00:00
|
|
|
if (!new_bg_err.ok()) {
|
|
|
|
Status s = new_bg_err;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
|
|
|
|
db_mutex_, &auto_recovery);
|
2018-06-28 19:23:57 +00:00
|
|
|
if (!s.ok() && (s.severity() > bg_error_.severity())) {
|
|
|
|
bg_error_ = s;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
} else {
|
|
|
|
// This error is less severe than previously encountered error. Don't
|
|
|
|
// take any further action
|
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-18 03:22:35 +00:00
|
|
|
recover_context_ = context;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
if (auto_recovery) {
|
|
|
|
recovery_in_prog_ = true;
|
|
|
|
|
|
|
|
// Kick-off error specific recovery
|
2021-07-21 01:08:55 +00:00
|
|
|
if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
|
|
|
|
new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
RecoverFromNoSpace();
|
|
|
|
}
|
|
|
|
}
|
2022-07-21 20:35:36 +00:00
|
|
|
if (bg_error_.severity() >= Status::Severity::kHardError) {
|
|
|
|
is_db_stopped_.store(true, std::memory_order_release);
|
|
|
|
}
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
|
2020-12-03 02:22:05 +00:00
|
|
|
// This is the main function for looking at IO related error during the
|
|
|
|
// background operations. The main logic is:
|
2024-01-22 22:57:30 +00:00
|
|
|
// File scope IO error is treated as retryable IO error in the write path. In
|
|
|
|
// RocksDB, If a file has write IO error and it is at file scope, RocksDB never
|
|
|
|
// write to the same file again. RocksDB will create a new file and rewrite the
|
|
|
|
// whole content. Thus, it is retryable.
|
|
|
|
// There are three main categories of error handling:
|
2020-12-03 02:22:05 +00:00
|
|
|
// 1) if the error is caused by data loss, the error is mapped to
|
|
|
|
// unrecoverable error. Application/user must take action to handle
|
2021-01-08 00:30:06 +00:00
|
|
|
// this situation (File scope case is excluded).
|
|
|
|
// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error,
|
2024-01-22 22:57:30 +00:00
|
|
|
// or its retryable flag is set and not a data loss error), auto resume (
|
|
|
|
// DBImpl::ResumeImpl) may be called and the auto resume can be controlled
|
|
|
|
// by resume count and resume interval options. There are three sub-cases:
|
2020-12-03 02:22:05 +00:00
|
|
|
// a) if the error happens during compaction, it is mapped to a soft error.
|
2024-01-22 22:57:30 +00:00
|
|
|
// the compaction thread will reschedule a new compaction. This doesn't
|
|
|
|
// call auto resume.
|
2020-12-03 02:22:05 +00:00
|
|
|
// b) if the error happens during flush and also WAL is empty, it is mapped
|
|
|
|
// to a soft error. Note that, it includes the case that IO error happens
|
2024-01-22 22:57:30 +00:00
|
|
|
// in SST or manifest write during flush. Auto resume will be called.
|
|
|
|
// c) all other errors are mapped to hard error. Auto resume will be called.
|
|
|
|
// 3) for other cases, HandleKnownErrors(const Status& bg_err,
|
|
|
|
// BackgroundErrorReason reason) will be called to handle other error cases
|
|
|
|
// such as delegating to SstFileManager to handle no space error.
|
2022-03-15 21:45:34 +00:00
|
|
|
const Status& ErrorHandler::SetBGError(const Status& bg_status,
|
2020-12-08 04:09:55 +00:00
|
|
|
BackgroundErrorReason reason) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
db_mutex_->AssertHeld();
|
2022-03-15 21:45:34 +00:00
|
|
|
Status tmp_status = bg_status;
|
|
|
|
IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
if (bg_io_err.ok()) {
|
2022-02-08 02:20:51 +00:00
|
|
|
return kOkStatus;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
}
|
2020-07-15 18:02:44 +00:00
|
|
|
ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
|
|
|
|
bg_io_err.ToString().c_str());
|
|
|
|
|
2024-01-29 23:28:37 +00:00
|
|
|
RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
|
|
|
|
{} /* int_histograms */);
|
2024-01-22 22:57:30 +00:00
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
Status new_bg_io_err = bg_io_err;
|
2020-09-18 03:22:35 +00:00
|
|
|
DBRecoverContext context;
|
2021-01-08 00:30:06 +00:00
|
|
|
if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
|
|
|
|
bg_io_err.GetDataLoss()) {
|
|
|
|
// First, data loss (non file scope) is treated as unrecoverable error. So
|
|
|
|
// it can directly overwrite any existing bg_error_.
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
bool auto_recovery = false;
|
|
|
|
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
|
2021-11-08 23:47:53 +00:00
|
|
|
CheckAndSetRecoveryAndBGError(bg_err);
|
2021-03-18 05:36:42 +00:00
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
db_options_.info_log,
|
|
|
|
"ErrorHandler: Set background IO error as unrecoverable error\n");
|
2020-12-08 04:09:55 +00:00
|
|
|
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
|
|
|
|
&bg_err, db_mutex_, &auto_recovery);
|
2020-09-18 03:22:35 +00:00
|
|
|
recover_context_ = context;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
return bg_error_;
|
All the NoSpace() errors will be handled by regular SetBGError and RecoverFromNoSpace() (#8376)
Summary:
In the current logic, any IO Error with retryable flag == true will be handled by the special logic and in most cases, StartRecoverFromRetryableBGIOError will be called to do the auto resume. If the NoSpace error with retryable flag is set during WAL write, it is mapped as a hard error, which will trigger the auto recovery. During the recover process, if write continues and append to the WAL, the write process sees that bg_error is set to HardError and it calls WriteStatusCheck(), which calls SetBGError() with Status (not IOStatus). This will redirect to the regular SetBGError interface, in which recovery_error_ will be set to the corresponding error. With the recovery_error_ set, the auto resume thread created in StartRecoverFromRetryableBGIOError will keep failing as long as user keeps trying to write.
To fix this issue. All the NoSpace error (no matter retryable flag is set or not) will be redirect to the regular SetBGError, and RecoverFromNoSpace() will do the recovery job which calls SstFileManager::StartErrorRecovery().
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8376
Test Plan: make check and added the new testing case
Reviewed By: anand1976
Differential Revision: D29071828
Pulled By: zhichao-cao
fbshipit-source-id: 7171d7e14cc4620fdab49b7eff7a2fe9a89942c2
2021-06-11 21:47:40 +00:00
|
|
|
} else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
|
|
|
|
(bg_io_err.GetScope() ==
|
|
|
|
IOStatus::IOErrorScope::kIOErrorScopeFile ||
|
|
|
|
bg_io_err.GetRetryable())) {
|
2021-01-08 00:30:06 +00:00
|
|
|
// Second, check if the error is a retryable IO error (file scope IO error
|
|
|
|
// is also treated as retryable IO error in RocksDB write path). if it is
|
|
|
|
// retryable error and its severity is higher than bg_error_, overwrite the
|
|
|
|
// bg_error_ with new error. In current stage, for retryable IO error of
|
|
|
|
// compaction, treat it as soft error. In other cases, treat the retryable
|
All the NoSpace() errors will be handled by regular SetBGError and RecoverFromNoSpace() (#8376)
Summary:
In the current logic, any IO Error with retryable flag == true will be handled by the special logic and in most cases, StartRecoverFromRetryableBGIOError will be called to do the auto resume. If the NoSpace error with retryable flag is set during WAL write, it is mapped as a hard error, which will trigger the auto recovery. During the recover process, if write continues and append to the WAL, the write process sees that bg_error is set to HardError and it calls WriteStatusCheck(), which calls SetBGError() with Status (not IOStatus). This will redirect to the regular SetBGError interface, in which recovery_error_ will be set to the corresponding error. With the recovery_error_ set, the auto resume thread created in StartRecoverFromRetryableBGIOError will keep failing as long as user keeps trying to write.
To fix this issue. All the NoSpace error (no matter retryable flag is set or not) will be redirect to the regular SetBGError, and RecoverFromNoSpace() will do the recovery job which calls SstFileManager::StartErrorRecovery().
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8376
Test Plan: make check and added the new testing case
Reviewed By: anand1976
Differential Revision: D29071828
Pulled By: zhichao-cao
fbshipit-source-id: 7171d7e14cc4620fdab49b7eff7a2fe9a89942c2
2021-06-11 21:47:40 +00:00
|
|
|
// IO error as hard error. Note that, all the NoSpace error should be
|
|
|
|
// handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter
|
|
|
|
// it is retryable or file scope, this logic will be bypassed.
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
bool auto_recovery = false;
|
2020-12-08 04:09:55 +00:00
|
|
|
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
|
|
|
|
&new_bg_io_err, db_mutex_,
|
|
|
|
&auto_recovery);
|
2024-01-22 22:57:30 +00:00
|
|
|
|
2024-01-29 23:28:37 +00:00
|
|
|
RecordStats({ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT},
|
2024-01-22 22:57:30 +00:00
|
|
|
{} /* int_histograms */);
|
2021-03-18 05:36:42 +00:00
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"ErrorHandler: Set background retryable IO error\n");
|
2020-07-15 18:02:44 +00:00
|
|
|
if (BackgroundErrorReason::kCompaction == reason) {
|
2021-01-28 01:56:17 +00:00
|
|
|
// We map the retryable IO error during compaction to soft error. Since
|
|
|
|
// compaction can reschedule by itself. We will not set the BG error in
|
|
|
|
// this case
|
|
|
|
// TODO: a better way to set or clean the retryable IO error which
|
|
|
|
// happens during compaction SST file write.
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */);
|
2021-03-18 05:36:42 +00:00
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
db_options_.info_log,
|
|
|
|
"ErrorHandler: Compaction will schedule by itself to resume\n");
|
2023-09-06 17:23:41 +00:00
|
|
|
// Not used in this code path.
|
|
|
|
new_bg_io_err.PermitUncheckedError();
|
2020-07-15 18:02:44 +00:00
|
|
|
return bg_error_;
|
2024-01-22 22:57:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status::Severity severity;
|
|
|
|
if (BackgroundErrorReason::kFlushNoWAL == reason ||
|
|
|
|
BackgroundErrorReason::kManifestWriteNoWAL == reason) {
|
2020-09-18 03:22:35 +00:00
|
|
|
// When the BG Retryable IO error reason is flush without WAL,
|
|
|
|
// We map it to a soft error. At the same time, all the background work
|
|
|
|
// should be stopped except the BG work from recovery. Therefore, we
|
|
|
|
// set the soft_error_no_bg_work_ to true. At the same time, since DB
|
|
|
|
// continues to receive writes when BG error is soft error, to avoid
|
|
|
|
// to many small memtable being generated during auto resume, the flush
|
|
|
|
// reason is set to kErrorRecoveryRetryFlush.
|
2024-01-22 22:57:30 +00:00
|
|
|
severity = Status::Severity::kSoftError;
|
2020-09-18 03:22:35 +00:00
|
|
|
soft_error_no_bg_work_ = true;
|
|
|
|
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
|
2020-07-15 18:02:44 +00:00
|
|
|
} else {
|
2024-01-22 22:57:30 +00:00
|
|
|
severity = Status::Severity::kHardError;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
}
|
2024-01-22 22:57:30 +00:00
|
|
|
Status bg_err(new_bg_io_err, severity);
|
|
|
|
CheckAndSetRecoveryAndBGError(bg_err);
|
|
|
|
recover_context_ = context;
|
|
|
|
return StartRecoverFromRetryableBGIOError(bg_io_err);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
} else {
|
2022-03-15 21:45:34 +00:00
|
|
|
return HandleKnownErrors(new_bg_io_err, reason);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-11 16:11:11 +00:00
|
|
|
void ErrorHandler::AddFilesToQuarantine(
|
|
|
|
autovector<const autovector<uint64_t>*> files_to_quarantine) {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
std::ostringstream quarantine_files_oss;
|
|
|
|
bool is_first_one = true;
|
|
|
|
for (const auto* files : files_to_quarantine) {
|
|
|
|
assert(files);
|
|
|
|
for (uint64_t file_number : *files) {
|
|
|
|
files_to_quarantine_.push_back(file_number);
|
|
|
|
quarantine_files_oss << (is_first_one ? "" : ", ") << file_number;
|
|
|
|
is_first_one = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"ErrorHandler: added file numbers %s to quarantine.\n",
|
|
|
|
quarantine_files_oss.str().c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
void ErrorHandler::ClearFilesToQuarantine() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
files_to_quarantine_.clear();
|
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"ErrorHandler: cleared files in quarantine.\n");
|
|
|
|
}
|
|
|
|
|
2020-12-08 04:09:55 +00:00
|
|
|
Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
bool* auto_recovery) {
|
|
|
|
if (bg_error.severity() >= Status::Severity::kFatalError) {
|
|
|
|
return bg_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (db_options_.sst_file_manager.get() == nullptr) {
|
|
|
|
// We rely on SFM to poll for enough disk space and recover
|
|
|
|
*auto_recovery = false;
|
|
|
|
return bg_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (db_options_.allow_2pc &&
|
|
|
|
(bg_error.severity() <= Status::Severity::kSoftError)) {
|
|
|
|
// Don't know how to recover, as the contents of the current WAL file may
|
|
|
|
// be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
|
|
|
|
// we can just flush the memtable and discard the log
|
|
|
|
*auto_recovery = false;
|
|
|
|
return Status(bg_error, Status::Severity::kFatalError);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
uint64_t free_space;
|
|
|
|
if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
|
|
|
|
&free_space) == Status::NotSupported()) {
|
|
|
|
*auto_recovery = false;
|
2018-06-28 19:23:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
return bg_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ErrorHandler::RecoverFromNoSpace() {
|
|
|
|
SstFileManagerImpl* sfm =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
|
|
|
|
// Inform SFM of the error, so it can kick-off the recovery
|
|
|
|
if (sfm) {
|
|
|
|
sfm->StartErrorRecovery(this, bg_error_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ErrorHandler::ClearBGError() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
|
|
|
|
// Signal that recovery succeeded
|
|
|
|
if (recovery_error_.ok()) {
|
2023-11-11 16:11:11 +00:00
|
|
|
assert(files_to_quarantine_.empty());
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
Status old_bg_error = bg_error_;
|
2021-12-08 22:22:41 +00:00
|
|
|
// old_bg_error is only for notifying listeners, so may not be checked
|
|
|
|
old_bg_error.PermitUncheckedError();
|
2020-12-08 04:09:55 +00:00
|
|
|
// Clear and check the recovery IO and BG error
|
2023-10-31 23:13:36 +00:00
|
|
|
is_db_stopped_.store(false, std::memory_order_release);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
bg_error_ = Status::OK();
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = IOStatus::OK();
|
2020-12-08 04:09:55 +00:00
|
|
|
bg_error_.PermitUncheckedError();
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_.PermitUncheckedError();
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
recovery_in_prog_ = false;
|
2020-09-18 03:22:35 +00:00
|
|
|
soft_error_no_bg_work_ = false;
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
|
|
|
|
bg_error_, db_mutex_);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
}
|
|
|
|
return recovery_error_;
|
2018-06-28 19:23:57 +00:00
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
Status ErrorHandler::RecoverFromBGError(bool is_manual) {
|
|
|
|
InstrumentedMutexLock l(db_mutex_);
|
2020-09-18 03:22:35 +00:00
|
|
|
bool no_bg_work_original_flag = soft_error_no_bg_work_;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
if (is_manual) {
|
|
|
|
// If its a manual recovery and there's a background recovery in progress
|
|
|
|
// return busy status
|
|
|
|
if (recovery_in_prog_) {
|
2024-01-30 00:31:09 +00:00
|
|
|
return Status::Busy("Recovery already in progress");
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
}
|
|
|
|
recovery_in_prog_ = true;
|
2020-09-18 03:22:35 +00:00
|
|
|
|
|
|
|
// In manual resume, we allow the bg work to run. If it is a auto resume,
|
|
|
|
// the bg work should follow this tag.
|
|
|
|
soft_error_no_bg_work_ = false;
|
|
|
|
|
|
|
|
// In manual resume, if the bg error is a soft error and also requires
|
|
|
|
// no bg work, the error must be recovered by call the flush with
|
|
|
|
// flush reason: kErrorRecoveryRetryFlush. In other case, the flush
|
|
|
|
// reason is set to kErrorRecovery.
|
|
|
|
if (no_bg_work_original_flag) {
|
|
|
|
recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
|
|
|
|
} else {
|
|
|
|
recover_context_.flush_reason = FlushReason::kErrorRecovery;
|
|
|
|
}
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
}
|
|
|
|
|
2020-09-18 03:22:35 +00:00
|
|
|
if (bg_error_.severity() == Status::Severity::kSoftError &&
|
|
|
|
recover_context_.flush_reason == FlushReason::kErrorRecovery) {
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
// Simply clear the background error and return
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = IOStatus::OK();
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
return ClearBGError();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reset recovery_error_. We will use this to record any errors that happen
|
|
|
|
// during the recovery process. While recovering, the only operations that
|
|
|
|
// can generate background errors should be the flush operations
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = IOStatus::OK();
|
2020-12-08 04:09:55 +00:00
|
|
|
recovery_error_.PermitUncheckedError();
|
2020-09-18 03:22:35 +00:00
|
|
|
Status s = db_->ResumeImpl(recover_context_);
|
|
|
|
if (s.ok()) {
|
|
|
|
soft_error_no_bg_work_ = false;
|
|
|
|
} else {
|
|
|
|
soft_error_no_bg_work_ = no_bg_work_original_flag;
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
// For manual recover, shutdown, and fatal error cases, set
|
|
|
|
// recovery_in_prog_ to false. For automatic background recovery, leave it
|
|
|
|
// as is regardless of success or failure as it will be retried
|
|
|
|
if (is_manual || s.IsShutdownInProgress() ||
|
|
|
|
bg_error_.severity() >= Status::Severity::kFatalError) {
|
|
|
|
recovery_in_prog_ = false;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
2020-07-15 18:02:44 +00:00
|
|
|
|
2020-12-08 04:09:55 +00:00
|
|
|
const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
|
|
|
|
const IOStatus& io_error) {
|
2020-07-15 18:02:44 +00:00
|
|
|
db_mutex_->AssertHeld();
|
2020-12-08 04:09:55 +00:00
|
|
|
if (bg_error_.ok()) {
|
|
|
|
return bg_error_;
|
|
|
|
} else if (io_error.ok()) {
|
2022-02-08 02:20:51 +00:00
|
|
|
return kOkStatus;
|
2020-12-08 04:09:55 +00:00
|
|
|
} else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
|
2020-07-15 18:02:44 +00:00
|
|
|
// Auto resume BG error is not enabled, directly return bg_error_.
|
|
|
|
return bg_error_;
|
2023-10-25 18:59:09 +00:00
|
|
|
} else if (end_recovery_) {
|
|
|
|
// Can temporarily release db mutex
|
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
|
|
|
Status::ShutdownInProgress(),
|
|
|
|
db_mutex_);
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
return bg_error_;
|
2020-12-04 22:57:29 +00:00
|
|
|
}
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */);
|
2021-03-18 05:36:42 +00:00
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
db_options_.info_log,
|
|
|
|
"ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
|
2023-10-11 16:42:48 +00:00
|
|
|
// Needs to be set in the same lock hold as setting BG error, otherwise
|
|
|
|
// intervening writes could see a BG error without a recovery and bail out.
|
|
|
|
recovery_in_prog_ = true;
|
|
|
|
|
2020-12-04 22:57:29 +00:00
|
|
|
if (recovery_thread_) {
|
2023-10-11 16:42:48 +00:00
|
|
|
// Ensure only one thread can execute the join().
|
|
|
|
std::unique_ptr<port::Thread> old_recovery_thread(
|
|
|
|
std::move(recovery_thread_));
|
2020-12-04 22:57:29 +00:00
|
|
|
// In this case, if recovery_in_prog_ is false, current thread should
|
|
|
|
// wait the previous recover thread to finish and create a new thread
|
|
|
|
// to recover from the bg error.
|
|
|
|
db_mutex_->Unlock();
|
2023-10-31 23:13:36 +00:00
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"StartRecoverFromRetryableBGIOError:BeforeWaitingForOtherThread");
|
2023-10-11 16:42:48 +00:00
|
|
|
old_recovery_thread->join();
|
2023-10-31 23:13:36 +00:00
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"StartRecoverFromRetryableBGIOError:AfterWaitingForOtherThread");
|
2020-12-04 22:57:29 +00:00
|
|
|
db_mutex_->Lock();
|
2020-07-15 18:02:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
recovery_thread_.reset(
|
|
|
|
new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
|
|
|
|
|
2023-10-10 13:31:45 +00:00
|
|
|
if (recovery_error_.ok()) {
|
2020-12-08 04:09:55 +00:00
|
|
|
return recovery_error_;
|
2020-07-15 18:02:44 +00:00
|
|
|
} else {
|
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Automatic recover from Retryable BG IO error. Must be called after db
|
|
|
|
// mutex is released.
|
|
|
|
void ErrorHandler::RecoverFromRetryableBGIOError() {
|
2023-10-31 23:13:36 +00:00
|
|
|
assert(recovery_in_prog_);
|
2020-07-15 18:02:44 +00:00
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
|
2023-09-22 23:43:50 +00:00
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2");
|
2020-07-15 18:02:44 +00:00
|
|
|
InstrumentedMutexLock l(db_mutex_);
|
|
|
|
if (end_recovery_) {
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
|
|
|
Status::ShutdownInProgress(),
|
|
|
|
db_mutex_);
|
2023-09-26 03:15:40 +00:00
|
|
|
|
|
|
|
recovery_in_prog_ = false;
|
2020-07-15 18:02:44 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-09-18 03:22:35 +00:00
|
|
|
DBRecoverContext context = recover_context_;
|
Only flush after recovery for retryable IOError (#11880)
Summary:
https://github.com/facebook/rocksdb/issues/11872 causes a unit test to start failing with the error message below. The cause is that the additional call to `FlushAllColumnFamilies()` in `DBImpl::ResumeImpl()` can run while DB is closing. More detailed explanation: there are two places where we call `ResumeImpl()`:
1. in `ErrorHandler::RecoverFromBGError`, for manual resume or recovery from errors like OutOfSpace through sst file manager, and
2. in `Errorhandler::RecoverFromRetryableBGIOError`, for error recovery from errors like flush failure due to retryable IOError. This is tracked by `ErrorHandler::recovery_thread_`.
Here is how DB close waits for error recovery: https://github.com/facebook/rocksdb/blob/49da91ec097b4efcd8a8e4dc1b287e9f81eb4093/db/db_impl/db_impl.cc#L540-L543
`CancelErrorRecovery()` waits until `recovery_thread_` finishes and `IsRecoveryInProgress()` checks the `recovery_in_prog_` flag. The additional call to `FlushAllColumnFamilies()` in `ResumeImpl()` happens after it clears bg error and the `recovery_in_prog_` flag: https://github.com/facebook/rocksdb/blob/49da91ec097b4efcd8a8e4dc1b287e9f81eb4093/db/db_impl/db_impl.cc#L436-L463. So if `ResumeImpl()` is called in `RecoverFromBGError()`, we can have a thread running `FlushAllColumnFamilies()` while DB is closing and thought that recovery is done.
The fix is to only do the additional call to `FlushAllColumnFamilies()` when doing error recovery through `Errorhandler::RecoverFromRetryableBGIOError` by setting flags in `DBRecoverContext`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11880
Test Plan:
`gtest-parallel --repeat=100 --workers=4 ./error_handler_fs_test --gtest_filter="*AutoRecoverFlushError*"` reproduces the error pretty reliably.
```[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBErrorHandlingFSTest
[ RUN ] DBErrorHandlingFSTest.AutoRecoverFlushError
error_handler_fs_test: db/column_family.cc:1618: rocksdb::ColumnFamilySet::~ColumnFamilySet(): Assertion `last_ref' failed.
Received signal 6 (Aborted)
...
https://github.com/facebook/rocksdb/issues/10 0x00007fac4409efd6 in __GI___assert_fail (assertion=0x7fac452c0afa "last_ref", file=0x7fac452c9fb5 "db/column_family.cc", line=1618, function=0x7fac452cb950 "rocksdb::ColumnFamilySet::~ColumnFamilySet()") at assert.c:101
101 in assert.c
https://github.com/facebook/rocksdb/issues/11 0x00007fac44b5324f in rocksdb::ColumnFamilySet::~ColumnFamilySet (this=0x7b5400000000) at db/column_family.cc:1618
1618 assert(last_ref);
https://github.com/facebook/rocksdb/issues/12 0x00007fac44e0f047 in std::default_delete<rocksdb::ColumnFamilySet>::operator() (this=0x7b5800000940, __ptr=0x7b5400000000) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85 delete __ptr;
https://github.com/facebook/rocksdb/issues/13 std::__uniq_ptr_impl<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182 _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/14 std::unique_ptr<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset (this=0x7b5800000940, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456 _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/15 rocksdb::VersionSet::~VersionSet (this=this@entry=0x7b5800000900) at db/version_set.cc:5081
5081 column_family_set_.reset();
https://github.com/facebook/rocksdb/issues/16 0x00007fac44e0f97a in rocksdb::VersionSet::~VersionSet (this=0x7b5800000900) at db/version_set.cc:5078
5078 VersionSet::~VersionSet() {
https://github.com/facebook/rocksdb/issues/17 0x00007fac44bf0b2f in std::default_delete<rocksdb::VersionSet>::operator() (this=0x7b8c00000068, __ptr=0x7b5800000900) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:85
85 delete __ptr;
https://github.com/facebook/rocksdb/issues/18 std::__uniq_ptr_impl<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:182
182 _M_deleter()(__old_p);
https://github.com/facebook/rocksdb/issues/19 std::unique_ptr<rocksdb::VersionSet, std::default_delete<rocksdb::VersionSet> >::reset (this=0x7b8c00000068, __p=0x0) at /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:456
456 _M_t.reset(std::move(__p));
https://github.com/facebook/rocksdb/issues/20 rocksdb::DBImpl::CloseHelper (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:676
676 versions_.reset();
https://github.com/facebook/rocksdb/issues/21 0x00007fac44bf1346 in rocksdb::DBImpl::CloseImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:720
720 Status DBImpl::CloseImpl() { return CloseHelper(); }
https://github.com/facebook/rocksdb/issues/22 rocksdb::DBImpl::~DBImpl (this=this@entry=0x7b8c00000000) at db/db_impl/db_impl.cc:738
738 closing_status_ = CloseImpl();
https://github.com/facebook/rocksdb/issues/23 0x00007fac44bf2bba in rocksdb::DBImpl::~DBImpl (this=0x7b8c00000000) at db/db_impl/db_impl.cc:722
722 DBImpl::~DBImpl() {
https://github.com/facebook/rocksdb/issues/24 0x00007fac455444d4 in rocksdb::DBTestBase::Close (this=this@entry=0x7b6c00000000) at db/db_test_util.cc:678
678 delete db_;
https://github.com/facebook/rocksdb/issues/25 0x00007fac455455fb in rocksdb::DBTestBase::TryReopen (this=this@entry=0x7b6c00000000, options=...) at db/db_test_util.cc:707
707 Close();
https://github.com/facebook/rocksdb/issues/26 0x00007fac45543459 in rocksdb::DBTestBase::Reopen (this=0x7ffed74b79a0, options=...) at db/db_test_util.cc:670
670 ASSERT_OK(TryReopen(options));
https://github.com/facebook/rocksdb/issues/27 0x00000000004f2522 in rocksdb::DBErrorHandlingFSTest_AutoRecoverFlushError_Test::TestBody (this=this@entry=0x7b6c00000000) at db/error_handler_fs_test.cc:1224
1224 Reopen(options);
```
Reviewed By: jowlyzhang
Differential Revision: D49579701
Pulled By: cbi42
fbshipit-source-id: 3fc8325e6dde7e7faa8bcad95060cb4e26eda638
2023-09-25 16:34:39 +00:00
|
|
|
context.flush_after_recovery = true;
|
2020-07-15 18:02:44 +00:00
|
|
|
int resume_count = db_options_.max_bgerror_resume_count;
|
|
|
|
uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
|
2021-03-18 05:36:42 +00:00
|
|
|
uint64_t retry_count = 0;
|
2020-07-15 18:02:44 +00:00
|
|
|
// Recover from the retryable error. Create a separate thread to do it.
|
|
|
|
while (resume_count > 0) {
|
|
|
|
if (end_recovery_) {
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
|
|
|
Status::ShutdownInProgress(),
|
|
|
|
db_mutex_);
|
2023-09-26 03:15:40 +00:00
|
|
|
recovery_in_prog_ = false;
|
2020-07-15 18:02:44 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
|
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = IOStatus::OK();
|
2021-03-18 05:36:42 +00:00
|
|
|
retry_count++;
|
2020-09-18 03:22:35 +00:00
|
|
|
Status s = db_->ResumeImpl(context);
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT},
|
|
|
|
{} /* int_histograms */);
|
2020-07-15 18:02:44 +00:00
|
|
|
if (s.IsShutdownInProgress() ||
|
|
|
|
bg_error_.severity() >= Status::Severity::kFatalError) {
|
|
|
|
// If DB shutdown in progress or the error severity is higher than
|
|
|
|
// Hard Error, stop auto resume and returns.
|
|
|
|
recovery_in_prog_ = false;
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({} /* ticker_types */,
|
|
|
|
{{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
|
|
|
bg_error_, db_mutex_);
|
2020-07-15 18:02:44 +00:00
|
|
|
return;
|
|
|
|
}
|
2023-10-10 13:31:45 +00:00
|
|
|
if (!recovery_error_.ok() &&
|
2020-07-15 18:02:44 +00:00
|
|
|
recovery_error_.severity() <= Status::Severity::kHardError &&
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_.GetRetryable()) {
|
2020-07-15 18:02:44 +00:00
|
|
|
// If new BG IO error happens during auto recovery and it is retryable
|
|
|
|
// and its severity is Hard Error or lower, the auto resmue sleep for
|
|
|
|
// a period of time and redo auto resume if it is allowed.
|
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
|
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
|
2021-03-15 11:32:24 +00:00
|
|
|
int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
|
2020-07-15 18:02:44 +00:00
|
|
|
cv_.TimedWait(wait_until);
|
|
|
|
} else {
|
2023-10-10 13:31:45 +00:00
|
|
|
// There are three possibility: 1) recovery_error_ is set during resume
|
2020-07-15 18:02:44 +00:00
|
|
|
// and the error is not retryable, 2) recover is successful, 3) other
|
|
|
|
// error happens during resume and cannot be resumed here.
|
2023-10-10 13:31:45 +00:00
|
|
|
if (recovery_error_.ok() && s.ok()) {
|
2020-07-15 18:02:44 +00:00
|
|
|
// recover from the retryable IO error and no other BG errors. Clean
|
|
|
|
// the bg_error and notify user.
|
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT},
|
|
|
|
{{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
|
2020-07-15 18:02:44 +00:00
|
|
|
return;
|
|
|
|
} else {
|
2023-10-10 13:31:45 +00:00
|
|
|
// In this case: 1) recovery_error_ is more serious or not retryable
|
|
|
|
// 2) other error happens. The auto recovery stops.
|
2020-07-15 18:02:44 +00:00
|
|
|
recovery_in_prog_ = false;
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({} /* ticker_types */,
|
|
|
|
{{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
|
|
|
db_options_.listeners, bg_error_,
|
2023-10-10 13:31:45 +00:00
|
|
|
!recovery_error_.ok() ? recovery_error_ : s, db_mutex_);
|
2020-07-15 18:02:44 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
resume_count--;
|
|
|
|
}
|
|
|
|
recovery_in_prog_ = false;
|
2021-12-08 22:22:41 +00:00
|
|
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
|
|
|
db_options_.listeners, bg_error_,
|
|
|
|
Status::Aborted("Exceeded resume retry count"), db_mutex_);
|
2020-07-15 18:02:44 +00:00
|
|
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
|
2024-01-22 22:57:30 +00:00
|
|
|
RecordStats({} /* ticker_types */,
|
|
|
|
{{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
|
2020-07-15 18:02:44 +00:00
|
|
|
}
|
|
|
|
|
2021-11-08 23:47:53 +00:00
|
|
|
void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
|
|
|
|
if (recovery_in_prog_ && recovery_error_.ok()) {
|
2023-10-10 13:31:45 +00:00
|
|
|
recovery_error_ = status_to_io_status(Status(bg_err));
|
2021-11-08 23:47:53 +00:00
|
|
|
}
|
|
|
|
if (bg_err.severity() > bg_error_.severity()) {
|
|
|
|
bg_error_ = bg_err;
|
|
|
|
}
|
2022-07-21 20:35:36 +00:00
|
|
|
if (bg_error_.severity() >= Status::Severity::kHardError) {
|
|
|
|
is_db_stopped_.store(true, std::memory_order_release);
|
|
|
|
}
|
2021-11-08 23:47:53 +00:00
|
|
|
}
|
|
|
|
|
2020-07-15 18:02:44 +00:00
|
|
|
void ErrorHandler::EndAutoRecovery() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
if (!end_recovery_) {
|
|
|
|
end_recovery_ = true;
|
|
|
|
}
|
|
|
|
if (recovery_thread_) {
|
2023-10-11 16:42:48 +00:00
|
|
|
// Ensure only one thread can execute the join().
|
|
|
|
std::unique_ptr<port::Thread> old_recovery_thread(
|
|
|
|
std::move(recovery_thread_));
|
|
|
|
db_mutex_->Unlock();
|
|
|
|
cv_.SignalAll();
|
|
|
|
old_recovery_thread->join();
|
|
|
|
db_mutex_->Lock();
|
2020-07-15 18:02:44 +00:00
|
|
|
}
|
2023-10-25 18:59:09 +00:00
|
|
|
TEST_SYNC_POINT("PostEndAutoRecovery");
|
2020-07-15 18:02:44 +00:00
|
|
|
}
|
|
|
|
|
2024-01-22 22:57:30 +00:00
|
|
|
void ErrorHandler::RecordStats(
|
|
|
|
const std::vector<Tickers>& ticker_types,
|
|
|
|
const std::vector<std::tuple<Histograms, uint64_t>>& int_histograms) {
|
|
|
|
if (bg_error_stats_ == nullptr) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (const auto& ticker_type : ticker_types) {
|
|
|
|
RecordTick(bg_error_stats_.get(), ticker_type);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& hist : int_histograms) {
|
|
|
|
RecordInHistogram(bg_error_stats_.get(), std::get<0>(hist),
|
|
|
|
std::get<1>(hist));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|