mirror of https://github.com/facebook/rocksdb.git
Add a DB property to indicate number of background errors encountered
Summary: Add a DB property that reports the number of background errors encountered, to help users build their monitoring. Test Plan: Add a unit test. make all check. Reviewers: haobo, igor, dhruba. Reviewed By: igor. CC: ljin, nkg-, yhchiang, leveldb. Differential Revision: https://reviews.facebook.net/D16959
This commit is contained in:
parent
1ec72b37b1
commit
71e6a34271
|
@ -9,6 +9,8 @@
|
|||
|
||||
#include "db/db_impl.h"
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cstdio>
|
||||
|
@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() {
|
|||
return s;
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_FlushMemTable() {
|
||||
return FlushMemTable(FlushOptions());
|
||||
Status DBImpl::TEST_FlushMemTable(bool wait) {
|
||||
FlushOptions fo;
|
||||
fo.wait = wait;
|
||||
return FlushMemTable(fo);
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_WaitForFlushMemTable() {
|
||||
|
@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() {
|
|||
// case this is an environmental problem and we do not want to
|
||||
// chew up resources for failed compactions for the duration of
|
||||
// the problem.
|
||||
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
|
||||
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
||||
Log(options_.info_log, "Waiting after background flush error: %s",
|
||||
s.ToString().c_str());
|
||||
mutex_.Unlock();
|
||||
Log(options_.info_log,
|
||||
"Waiting after background flush error: %s"
|
||||
"Accumulated background error counts: %" PRIu64,
|
||||
s.ToString().c_str(), error_cnt);
|
||||
log_buffer.FlushBufferToLog();
|
||||
LogFlush(options_.info_log);
|
||||
env_->SleepForMicroseconds(1000000);
|
||||
|
@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() {
|
|||
// case this is an environmental problem and we do not want to
|
||||
// chew up resources for failed compactions for the duration of
|
||||
// the problem.
|
||||
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
|
||||
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
||||
mutex_.Unlock();
|
||||
log_buffer.FlushBufferToLog();
|
||||
Log(options_.info_log, "Waiting after background compaction error: %s",
|
||||
s.ToString().c_str());
|
||||
Log(options_.info_log,
|
||||
"Waiting after background compaction error: %s, "
|
||||
"Accumulated background error counts: %" PRIu64,
|
||||
s.ToString().c_str(), error_cnt);
|
||||
LogFlush(options_.info_log);
|
||||
env_->SleepForMicroseconds(1000000);
|
||||
mutex_.Lock();
|
||||
|
|
|
@ -109,7 +109,7 @@ class DBImpl : public DB {
|
|||
const Slice* end);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status TEST_FlushMemTable();
|
||||
Status TEST_FlushMemTable(bool wait = true);
|
||||
|
||||
// Wait for memtable compaction
|
||||
Status TEST_WaitForFlushMemTable();
|
||||
|
|
|
@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) {
|
|||
dbfull()->TEST_CompactRange(level, nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
std::string property_value;
|
||||
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
|
||||
ASSERT_EQ("5", property_value);
|
||||
|
||||
env_->no_space_.Release_Store(nullptr);
|
||||
ASSERT_LT(CountFiles(), num_files + 3);
|
||||
|
||||
|
@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) {
|
|||
} while (ChangeCompactOptions());
|
||||
}
|
||||
|
||||
// Check background error counter bumped on flush failures.
|
||||
TEST(DBTest, NoSpaceFlush) {
|
||||
do {
|
||||
Options options = CurrentOptions();
|
||||
options.env = env_;
|
||||
options.max_background_flushes = 1;
|
||||
Reopen(&options);
|
||||
|
||||
ASSERT_OK(Put("foo", "v1"));
|
||||
env_->no_space_.Release_Store(env_); // Force out-of-space errors
|
||||
|
||||
std::string property_value;
|
||||
// Background error count is 0 now.
|
||||
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
|
||||
ASSERT_EQ("0", property_value);
|
||||
|
||||
dbfull()->TEST_FlushMemTable(false);
|
||||
|
||||
// Wait 300 milliseconds or background-errors turned 1 from 0.
|
||||
int time_to_sleep_limit = 300000;
|
||||
while (time_to_sleep_limit > 0) {
|
||||
int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
|
||||
time_to_sleep_limit -= to_sleep;
|
||||
env_->SleepForMicroseconds(to_sleep);
|
||||
|
||||
ASSERT_TRUE(
|
||||
db_->GetProperty("rocksdb.background-errors", &property_value));
|
||||
if (property_value == "1") {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_EQ("1", property_value);
|
||||
|
||||
env_->no_space_.Release_Store(nullptr);
|
||||
} while (ChangeCompactOptions());
|
||||
}
|
||||
|
||||
TEST(DBTest, NonWritableFileSystem) {
|
||||
do {
|
||||
Options options = CurrentOptions();
|
||||
|
|
|
@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) {
|
|||
} else if (in == "num-immutable-mem-table") {
|
||||
return kNumImmutableMemTable;
|
||||
} else if (in == "mem-table-flush-pending") {
|
||||
return MemtableFlushPending;
|
||||
return kMemtableFlushPending;
|
||||
} else if (in == "compaction-pending") {
|
||||
return CompactionPending;
|
||||
return kCompactionPending;
|
||||
} else if (in == "background-errors") {
|
||||
return kBackgroundErrors;
|
||||
}
|
||||
return kUnknown;
|
||||
}
|
||||
|
@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
|||
case kNumImmutableMemTable:
|
||||
*value = std::to_string(imm.size());
|
||||
return true;
|
||||
case MemtableFlushPending:
|
||||
case kMemtableFlushPending:
|
||||
// Return number of mem tables that are ready to flush (made immutable)
|
||||
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
|
||||
return true;
|
||||
case CompactionPending:
|
||||
case kCompactionPending:
|
||||
// 1 if the system has already determined that at least one compaction is needed.
|
||||
// 0 otherwise.
|
||||
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
|
||||
return true;
|
||||
/////////////
|
||||
case kBackgroundErrors:
|
||||
// Accumulated number of errors in background flushes or compactions.
|
||||
*value = std::to_string(GetBackgroundErrorCount());
|
||||
return true;
|
||||
/////////
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -26,9 +26,11 @@ enum DBPropertyType {
|
|||
kStats, // Return general statistics of DB
|
||||
kSsTables, // Return a human readable string of current SST files
|
||||
kNumImmutableMemTable, // Return number of immutable mem tables
|
||||
MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise
|
||||
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
|
||||
// otherwise
|
||||
// 0.
|
||||
CompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
|
||||
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
|
||||
kBackgroundErrors, // Return accumulated background errors encountered.
|
||||
kUnknown,
|
||||
};
|
||||
|
||||
|
@ -49,6 +51,7 @@ class InternalStats {
|
|||
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
|
||||
stall_leveln_slowdown_(num_levels, 0),
|
||||
stall_leveln_slowdown_count_(num_levels, 0),
|
||||
bg_error_count_(0),
|
||||
number_levels_(num_levels),
|
||||
statistics_(statistics),
|
||||
env_(env),
|
||||
|
@ -116,6 +119,10 @@ class InternalStats {
|
|||
stall_leveln_slowdown_count_[level] += micros;
|
||||
}
|
||||
|
||||
// Returns the current value of bg_error_count_ without modifying it.
uint64_t GetBackgroundErrorCount() const {
  return bg_error_count_;
}
|
||||
|
||||
// Increments bg_error_count_ by one and returns the updated value.
// NOTE(review): the counter is a plain uint64_t; the call sites visible in
// this change invoke this before mutex_.Unlock() -- confirm that every caller
// holds the DB mutex.
uint64_t BumpAndGetBackgroundErrorCount() {
  bg_error_count_ += 1;
  return bg_error_count_;
}
|
||||
|
||||
bool GetProperty(DBPropertyType property_type, const Slice& property,
|
||||
std::string* value, VersionSet* version_set,
|
||||
const MemTableList& imm);
|
||||
|
@ -158,6 +165,13 @@ class InternalStats {
|
|||
std::vector<uint64_t> stall_leveln_slowdown_;
|
||||
std::vector<uint64_t> stall_leveln_slowdown_count_;
|
||||
|
||||
// Total number of background errors encountered. Every time a flush task
|
||||
// or compaction task fails, this counter is incremented. The failure can
|
||||
// be caused by any possible reason, including file system errors, out of
|
||||
// resources, or input file corruption. Failing when retrying the same flush
|
||||
// or compaction will cause the counter to increase too.
|
||||
uint64_t bg_error_count_;
|
||||
|
||||
int number_levels_;
|
||||
Statistics* statistics_;
|
||||
Env* env_;
|
||||
|
|
Loading…
Reference in New Issue