diff --git a/HISTORY.md b/HISTORY.md
index 84f5d2a2df..479b3efc11 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -21,6 +21,7 @@
 * Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1.
 * Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage.
 * Enable backward iteration on keys with user-defined timestamps.
+* Add statistics and info log for the error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; a histogram for the number of retries in each auto resume recovery call. Note that each auto resume attempt consists of one or more retries.
 
 ## 6.18.0 (02/19/2021)
 ### Behavior Changes
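As context for the HISTORY entry above, here is a minimal sketch of how an application could read the new tickers once `options.statistics` is set. The database path is illustrative, and the printed counts stay at zero unless background errors have actually occurred:

```cpp
#include <cassert>
#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Enable statistics; the error handler records its tickers here.
  options.statistics = rocksdb::CreateDBStatistics();

  rocksdb::DB* db = nullptr;
  // The path is illustrative.
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/error_stats_demo", &db);
  assert(s.ok());

  // On a healthy filesystem all error handler counters remain zero.
  std::cout << "bg errors: "
            << options.statistics->getTickerCount(
                   rocksdb::ERROR_HANDLER_BG_ERROR_COUNT)
            << ", bg IO errors: "
            << options.statistics->getTickerCount(
                   rocksdb::ERROR_HANDLER_BG_IO_ERROR_COUNT)
            << ", auto resume attempts: "
            << options.statistics->getTickerCount(
                   rocksdb::ERROR_HANDLER_AUTORESUME_COUNT)
            << std::endl;

  delete db;
  return 0;
}
```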
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 80c503c4d0..9d8ae09705 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -4,9 +4,11 @@
 // (found in the LICENSE.Apache file in the root directory).
 //
 #include "db/error_handler.h"
+
 #include "db/db_impl/db_impl.h"
 #include "db/event_helpers.h"
 #include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -274,6 +276,12 @@ const Status& ErrorHandler::SetBGError(const Status& bg_err,
     return bg_err;
   }
 
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+  }
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "ErrorHandler: Set regular background error\n");
+
   bool paranoid = db_options_.paranoid_checks;
   Status::Severity sev = Status::Severity::kFatalError;
   Status new_bg_err;
@@ -399,6 +407,13 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
     if (recovery_in_prog_ && recovery_error_.ok()) {
       recovery_error_ = bg_err;
     }
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "ErrorHandler: Set background IO error as unrecoverable error\n");
     EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                           &bg_err, db_mutex_, &auto_recovery);
     recover_context_ = context;
@@ -416,12 +431,26 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
     EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                           &new_bg_io_err, db_mutex_,
                                           &auto_recovery);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "ErrorHandler: Set background retryable IO error\n");
    if (BackgroundErrorReason::kCompaction == reason) {
       // We map the retryable IO error during compaction to soft error. Since
       // compaction can reschedule by itself. We will not set the BG error in
       // this case
       // TODO: a better way to set or clean the retryable IO error which
       // happens during compaction SST file write.
+      if (bg_error_stats_ != nullptr) {
+        RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+      }
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "ErrorHandler: Compaction will reschedule itself to resume\n");
       return bg_error_;
     } else if (BackgroundErrorReason::kFlushNoWAL == reason ||
                BackgroundErrorReason::kManifestWriteNoWAL == reason) {
@@ -455,6 +484,9 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
         return StartRecoverFromRetryableBGIOError(bg_io_err);
       }
     } else {
+      if (bg_error_stats_ != nullptr) {
+        RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+      }
       return SetBGError(new_bg_io_err, reason);
     }
   }
@@ -603,7 +635,12 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
     // Auto resume BG error is not enabled, directly return bg_error_.
     return bg_error_;
   }
-
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
   if (recovery_thread_) {
     // In this case, if recovery_in_prog_ is false, current thread should
     // wait the previous recover thread to finish and create a new thread
@@ -642,6 +679,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   DBRecoverContext context = recover_context_;
   int resume_count = db_options_.max_bgerror_resume_count;
   uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+  uint64_t retry_count = 0;
   // Recover from the retryable error. Create a separate thread to do it.
   while (resume_count > 0) {
     if (end_recovery_) {
@@ -651,15 +689,24 @@
     TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
     recovery_io_error_ = IOStatus::OK();
     recovery_error_ = Status::OK();
+    retry_count++;
     Status s = db_->ResumeImpl(context);
     TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
     TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+    }
     if (s.IsShutdownInProgress() ||
         bg_error_.severity() >= Status::Severity::kFatalError) {
       // If DB shutdown in progress or the error severity is higher than
       // Hard Error, stop auto resume and returns.
       TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0");
       recovery_in_prog_ = false;
+      if (bg_error_stats_ != nullptr) {
+        RecordInHistogram(bg_error_stats_.get(),
+                          ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+      }
       return;
     }
     if (!recovery_io_error_.ok() &&
@@ -686,6 +733,12 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
         bg_error_.PermitUncheckedError();
         EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
                                                      old_bg_error, db_mutex_);
+        if (bg_error_stats_ != nullptr) {
+          RecordTick(bg_error_stats_.get(),
+                     ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
+          RecordInHistogram(bg_error_stats_.get(),
+                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+        }
         recovery_in_prog_ = false;
         if (soft_error_no_bg_work_) {
           soft_error_no_bg_work_ = false;
@@ -696,6 +749,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
       // In this case: 1) recovery_io_error is more serious or not retryable
       // 2) other Non IO recovery_error happens. The auto recovery stops.
       recovery_in_prog_ = false;
+      if (bg_error_stats_ != nullptr) {
+        RecordInHistogram(bg_error_stats_.get(),
+                          ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+      }
       return;
     }
   }
@@ -703,6 +760,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
   }
   recovery_in_prog_ = false;
   TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
+  if (bg_error_stats_ != nullptr) {
+    RecordInHistogram(bg_error_stats_.get(),
+                      ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+  }
   return;
 #else
   return;
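To make the accounting in `RecoverFromRetryableBGIOError()` concrete: `ERROR_HANDLER_AUTORESUME_COUNT` ticks once per recovery call, `ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT` ticks once per `ResumeImpl()` attempt, and the `ERROR_HANDLER_AUTORESUME_RETRY_COUNT` histogram records the per-call retry count on every exit path. A standalone model of that logic (not RocksDB code; `AttemptResume()` is a made-up stand-in for `DBImpl::ResumeImpl()`):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone model of the retry accounting in the error handler.
struct ErrorHandlerStats {
  uint64_t autoresume_count = 0;   // ERROR_HANDLER_AUTORESUME_COUNT
  uint64_t retry_total_count = 0;  // ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT
  uint64_t success_count = 0;      // ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT
  std::vector<uint64_t> retry_histogram;  // ERROR_HANDLER_AUTORESUME_RETRY_COUNT
};

// Hypothetical stand-in for DBImpl::ResumeImpl(); pretend the error
// clears on the second attempt.
bool AttemptResume(uint64_t attempt) { return attempt >= 2; }

void RecoverOnce(ErrorHandlerStats* stats, int max_bgerror_resume_count) {
  stats->autoresume_count++;  // one tick per recovery call
  uint64_t retry_count = 0;
  for (int i = 0; i < max_bgerror_resume_count; i++) {
    retry_count++;
    stats->retry_total_count++;  // one tick per resume attempt
    if (AttemptResume(retry_count)) {
      stats->success_count++;
      break;
    }
  }
  // One histogram sample per recovery call, whatever the outcome.
  stats->retry_histogram.push_back(retry_count);
}

int main() {
  ErrorHandlerStats stats;
  RecoverOnce(&stats, /*max_bgerror_resume_count=*/4);
  // Prints: autoresume=1 retries=2 success=1
  std::cout << "autoresume=" << stats.autoresume_count
            << " retries=" << stats.retry_total_count
            << " success=" << stats.success_count << std::endl;
  return 0;
}
```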
diff --git a/db/error_handler.h b/db/error_handler.h
index acd09514ec..6c7373b90f 100644
--- a/db/error_handler.h
+++ b/db/error_handler.h
@@ -37,7 +37,8 @@ class ErrorHandler {
         db_mutex_(db_mutex),
         auto_recovery_(false),
         recovery_in_prog_(false),
-        soft_error_no_bg_work_(false) {
+        soft_error_no_bg_work_(false),
+        bg_error_stats_(db_options.statistics) {
     // Clear the checked flag for uninitialized errors
     bg_error_.PermitUncheckedError();
     recovery_error_.PermitUncheckedError();
@@ -108,6 +109,9 @@
   // Used to store the context for recover, such as flush reason.
   DBRecoverContext recover_context_;
 
+  // Pointer to the DB statistics object; may be nullptr.
+  std::shared_ptr<Statistics> bg_error_stats_;
+
   Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
   void RecoverFromNoSpace();
   const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc
index 00247c50c9..9c99407406 100644
--- a/db/error_handler_fs_test.cc
+++ b/db/error_handler_fs_test.cc
@@ -158,6 +158,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
   options.env = fault_env_.get();
   options.create_if_missing = true;
   options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery(false);
@@ -174,13 +175,25 @@
   fault_fs_->SetFilesystemActive(true);
   s = dbfull()->Resume();
   ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
 
   Reopen(options);
   ASSERT_EQ("val", Get(Key(0)));
   Destroy(options);
 }
 
-TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
+TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
   std::shared_ptr<ErrorHandlerFSListener> listener(
       new ErrorHandlerFSListener());
   Options options = GetDefaultOptions();
@@ -188,6 +201,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
   options.create_if_missing = true;
   options.listeners.emplace_back(listener);
   options.max_bgerror_resume_count = 0;
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery(false);
@@ -207,6 +221,18 @@
   fault_fs_->SetFilesystemActive(true);
   s = dbfull()->Resume();
   ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
 
   Reopen(options);
   ASSERT_EQ("val1", Get(Key(1)));
@@ -241,7 +267,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
   Destroy(options);
 }
 
-TEST_F(DBErrorHandlingFSTest, FLushWritFileScopeError) {
+TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
   std::shared_ptr<ErrorHandlerFSListener> listener(
       new ErrorHandlerFSListener());
   Options options = GetDefaultOptions();
@@ -325,7 +351,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritFileScopeError) {
   Destroy(options);
 }
 
-TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError1) {
   std::shared_ptr<ErrorHandlerFSListener> listener(
       new ErrorHandlerFSListener());
   Options options = GetDefaultOptions();
@@ -333,6 +359,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
   options.create_if_missing = true;
   options.listeners.emplace_back(listener);
   options.max_bgerror_resume_count = 0;
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery(false);
@@ -363,11 +390,23 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
   s = Flush();
   ASSERT_OK(s);
   ASSERT_EQ("val3", Get(Key(3)));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
 
   Destroy(options);
 }
 
-TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) {
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
   std::shared_ptr<ErrorHandlerFSListener> listener(
       new ErrorHandlerFSListener());
   Options options = GetDefaultOptions();
@@ -410,7 +449,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
   Destroy(options);
 }
 
-TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) {
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
   std::shared_ptr<ErrorHandlerFSListener> listener(
       new ErrorHandlerFSListener());
   Options options = GetDefaultOptions();
@@ -1010,6 +1049,7 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
   options.env = fault_env_.get();
   options.create_if_missing = true;
   options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery();
@@ -1028,6 +1068,18 @@
 
   s = Put(Key(1), "val");
   ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
 
   Reopen(options);
   ASSERT_EQ("val", Get(Key(0)));
@@ -1567,6 +1619,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
   options.listeners.emplace_back(listener);
   options.max_bgerror_resume_count = 2;
   options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery(false);
@@ -1594,6 +1647,22 @@
   ASSERT_EQ("val1", Get(Key(1)));
   SyncPoint::GetInstance()->DisableProcessing();
   fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(2, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_EQ(autoresume_retry.max, 2);
   ASSERT_OK(Put(Key(2), "val2", wo));
   s = Flush();
   // Since auto resume fails, the bg error is not cleand, flush will
@@ -1620,6 +1689,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
   options.listeners.emplace_back(listener);
   options.max_bgerror_resume_count = 2;
   options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
   Status s;
 
   listener->EnableAutoRecovery(false);
@@ -1643,6 +1713,22 @@
   fault_fs_->SetFilesystemActive(true);
   ASSERT_EQ(listener->WaitForRecovery(5000000), true);
   ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_EQ(autoresume_retry.max, 1);
   ASSERT_OK(Put(Key(2), "val2", wo));
   s = Flush();
   // Since auto resume is successful, the bg error is cleaned, flush will
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 98b4fb970d..bec5e66368 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -374,6 +374,15 @@ enum Tickers : uint32_t {
   // # of files deleted immediately by sst file manger through delete scheduler.
   FILES_DELETED_IMMEDIATELY,
 
+  // Error handler counters. Note that bg_io_error is a subset of bg_error,
+  // and bg_retryable_io_error is a subset of bg_io_error.
+  ERROR_HANDLER_BG_ERROR_COUNT,
+  ERROR_HANDLER_BG_IO_ERROR_COUNT,
+  ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+  ERROR_HANDLER_AUTORESUME_COUNT,
+  ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+  ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+
   TICKER_ENUM_MAX
 };
 
@@ -472,6 +481,9 @@ enum Histograms : uint32_t {
   // Num of sst files read from file system per level.
   NUM_SST_READ_PER_LEVEL,
 
+  // Error handler statistics: number of retries per auto resume recovery call.
+  ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+
   HISTOGRAM_ENUM_MAX,
 };
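The subset relationship documented above implies an ordering invariant that holds at any point in time: retryable IO errors <= IO errors <= all background errors. A hedged sketch of a checker (the helper itself is hypothetical; the `getTickerCount()` calls are the public `Statistics` API):

```cpp
#include <cassert>
#include <memory>

#include "rocksdb/statistics.h"

// Hypothetical helper (not part of RocksDB): verify the documented
// containment between the error handler counters.
void CheckErrorCounterInvariants(
    const std::shared_ptr<rocksdb::Statistics>& stats) {
  const uint64_t bg =
      stats->getTickerCount(rocksdb::ERROR_HANDLER_BG_ERROR_COUNT);
  const uint64_t bg_io =
      stats->getTickerCount(rocksdb::ERROR_HANDLER_BG_IO_ERROR_COUNT);
  const uint64_t bg_retryable_io = stats->getTickerCount(
      rocksdb::ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
  // bg_retryable_io_error is a subset of bg_io_error, which is a subset of
  // bg_error, so the counts must be monotonically ordered.
  assert(bg_retryable_io <= bg_io);
  assert(bg_io <= bg);
  (void)bg;  // silence unused warnings when asserts are compiled out
  (void)bg_io;
  (void)bg_retryable_io;
}
```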
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index df955564fb..8f4ebcaf05 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -4982,7 +4982,20 @@ class TickerTypeJni {
         return -0x14;
       case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL:
         return -0x15;
-
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT:
+        return -0x16;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT:
+        return -0x17;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT:
+        return -0x18;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT:
+        return -0x19;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT:
+        return -0x1A;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT:
+        return -0x1B;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // 0x5F for backwards compatibility on current minor version.
         return 0x5F;
@@ -5294,6 +5307,21 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC;
       case -0x15:
         return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL;
+      case -0x16:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT;
+      case -0x17:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT;
+      case -0x18:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT;
+      case -0x19:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT;
+      case -0x1A:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT;
+      case -0x1B:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT;
       case 0x5F:
         // 0x5F for backwards compatibility on current minor version.
         return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX;
@@ -5413,6 +5441,8 @@ class HistogramTypeJni {
         return 0x30;
       case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL:
         return 0x31;
+      case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT:
+        return 0x32;
       case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
         // 0x1F for backwards compatibility on current minor version.
         return 0x1F;
@@ -5527,6 +5557,9 @@ class HistogramTypeJni {
         return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL;
       case 0x31:
         return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL;
+      case 0x32:
+        return ROCKSDB_NAMESPACE::Histograms::
+            ERROR_HANDLER_AUTORESUME_RETRY_COUNT;
       case 0x1F:
         // 0x1F for backwards compatibility on current minor version.
         return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;
diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java
index 80d7c600ed..5953a7d9bd 100644
--- a/java/src/main/java/org/rocksdb/HistogramType.java
+++ b/java/src/main/java/org/rocksdb/HistogramType.java
@@ -175,6 +175,11 @@ public enum HistogramType {
    */
   NUM_SST_READ_PER_LEVEL((byte) 0x31),
 
+  /**
+   * The number of retries in each auto resume recovery call.
+   */
+  ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32),
+
   // 0x1F for backwards compatibility on current minor version.
   HISTOGRAM_ENUM_MAX((byte) 0x1F);
 
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 7a37f35b9e..a2c7085884 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -742,6 +742,16 @@ public enum TickerType {
   COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14),
   COMPACT_WRITE_BYTES_TTL((byte) -0x15),
 
+  /**
+   * DB error handler statistics.
+   */
+  ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16),
+  ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17),
+  ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18),
+  ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19),
+  ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A),
+  ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B),
+
   TICKER_ENUM_MAX((byte) 0x5F);
 
   private final byte value;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 1723827cff..2b690dd50f 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -191,6 +191,16 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
      "rocksdb.block.cache.compression.dict.add.redundant"},
     {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"},
     {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"},
+    {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.error.count"},
+    {ERROR_HANDLER_BG_IO_ERROR_COUNT,
+     "rocksdb.error.handler.bg.io.error.count"},
+    {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+     "rocksdb.error.handler.bg.retryable.io.error.count"},
+    {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"},
+    {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+     "rocksdb.error.handler.autoresume.retry.total.count"},
+    {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+     "rocksdb.error.handler.autoresume.success.count"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -246,6 +256,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
      "rocksdb.num.index.and.filter.blocks.read.per.level"},
     {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"},
     {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"},
+    {ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+     "rocksdb.error.handler.autoresume.retry.count"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
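Finally, a sketch of consuming the new histogram through the public `Statistics` API; `histogramData()` and the `HistogramData` fields used here are part of `include/rocksdb/statistics.h`, while the surrounding program is illustrative:

```cpp
#include <iostream>
#include <memory>

#include "rocksdb/statistics.h"

int main() {
  // In a real application this would be the options.statistics object of an
  // open DB that has gone through auto resume; a freshly created statistics
  // object reports an empty histogram.
  std::shared_ptr<rocksdb::Statistics> stats = rocksdb::CreateDBStatistics();

  rocksdb::HistogramData retry_hist;
  stats->histogramData(rocksdb::ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
                       &retry_hist);
  std::cout << "auto resume retries per recovery: median=" << retry_hist.median
            << " average=" << retry_hist.average << " max=" << retry_hist.max
            << std::endl;
  return 0;
}
```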