Add a DB property to indicate number of background errors encountered

Summary: Add a property to calculate number of background errors encountered to help users build their monitoring

Test Plan: Add a unit test. make all check

Reviewers: haobo, igor, dhruba

Reviewed By: igor

CC: ljin, nkg-, yhchiang, leveldb

Differential Revision: https://reviews.facebook.net/D16959
This commit is contained in:
sdong 2014-03-18 12:25:08 -07:00
parent 1ec72b37b1
commit 71e6a34271
5 changed files with 88 additions and 14 deletions

View File

@ -9,6 +9,8 @@
#include "db/db_impl.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm>
#include <climits>
#include <cstdio>
@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() {
return s;
}
Status DBImpl::TEST_FlushMemTable() {
return FlushMemTable(FlushOptions());
Status DBImpl::TEST_FlushMemTable(bool wait) {
FlushOptions fo;
fo.wait = wait;
return FlushMemTable(fo);
}
Status DBImpl::TEST_WaitForFlushMemTable() {
@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() {
// case this is an environmental problem and we do not want to
// chew up resources for failed compactions for the duration of
// the problem.
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
Log(options_.info_log, "Waiting after background flush error: %s",
s.ToString().c_str());
mutex_.Unlock();
Log(options_.info_log,
"Waiting after background flush error: %s"
"Accumulated background error counts: %" PRIu64,
s.ToString().c_str(), error_cnt);
log_buffer.FlushBufferToLog();
LogFlush(options_.info_log);
env_->SleepForMicroseconds(1000000);
@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() {
// case this is an environmental problem and we do not want to
// chew up resources for failed compactions for the duration of
// the problem.
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
mutex_.Unlock();
log_buffer.FlushBufferToLog();
Log(options_.info_log, "Waiting after background compaction error: %s",
s.ToString().c_str());
Log(options_.info_log,
"Waiting after background compaction error: %s, "
"Accumulated background error counts: %" PRIu64,
s.ToString().c_str(), error_cnt);
LogFlush(options_.info_log);
env_->SleepForMicroseconds(1000000);
mutex_.Lock();

View File

@ -109,7 +109,7 @@ class DBImpl : public DB {
const Slice* end);
// Force current memtable contents to be flushed.
Status TEST_FlushMemTable();
Status TEST_FlushMemTable(bool wait = true);
// Wait for memtable compaction
Status TEST_WaitForFlushMemTable();

View File

@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) {
dbfull()->TEST_CompactRange(level, nullptr, nullptr);
}
}
std::string property_value;
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
ASSERT_EQ("5", property_value);
env_->no_space_.Release_Store(nullptr);
ASSERT_LT(CountFiles(), num_files + 3);
@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) {
} while (ChangeCompactOptions());
}
// Check background error counter bumped on flush failures.
TEST(DBTest, NoSpaceFlush) {
do {
Options options = CurrentOptions();
options.env = env_;
options.max_background_flushes = 1;
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
env_->no_space_.Release_Store(env_); // Force out-of-space errors
std::string property_value;
// Background error count is 0 now.
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
ASSERT_EQ("0", property_value);
dbfull()->TEST_FlushMemTable(false);
// Wait 300 milliseconds or background-errors turned 1 from 0.
int time_to_sleep_limit = 300000;
while (time_to_sleep_limit > 0) {
int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
time_to_sleep_limit -= to_sleep;
env_->SleepForMicroseconds(to_sleep);
ASSERT_TRUE(
db_->GetProperty("rocksdb.background-errors", &property_value));
if (property_value == "1") {
break;
}
}
ASSERT_EQ("1", property_value);
env_->no_space_.Release_Store(nullptr);
} while (ChangeCompactOptions());
}
TEST(DBTest, NonWritableFileSystem) {
do {
Options options = CurrentOptions();

View File

@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) {
} else if (in == "num-immutable-mem-table") {
return kNumImmutableMemTable;
} else if (in == "mem-table-flush-pending") {
return MemtableFlushPending;
return kMemtableFlushPending;
} else if (in == "compaction-pending") {
return CompactionPending;
return kCompactionPending;
} else if (in == "background-errors") {
return kBackgroundErrors;
}
return kUnknown;
}
@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
case kNumImmutableMemTable:
*value = std::to_string(imm.size());
return true;
case MemtableFlushPending:
case kMemtableFlushPending:
// Return number of mem tables that are ready to flush (made immutable)
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
return true;
case CompactionPending:
case kCompactionPending:
// 1 if the system already determines at least one compacdtion is needed.
// 0 otherwise,
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
return true;
/////////////
case kBackgroundErrors:
// Accumulated number of errors in background flushes or compactions.
*value = std::to_string(GetBackgroundErrorCount());
return true;
/////////
default:
return false;
}

View File

@ -26,9 +26,11 @@ enum DBPropertyType {
kStats, // Return general statitistics of DB
kSsTables, // Return a human readable string of current SST files
kNumImmutableMemTable, // Return number of immutable mem tables
MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
// otherwise
// 0.
CompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
kBackgroundErrors, // Return accumulated background errors encountered.
kUnknown,
};
@ -49,6 +51,7 @@ class InternalStats {
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
stall_leveln_slowdown_(num_levels, 0),
stall_leveln_slowdown_count_(num_levels, 0),
bg_error_count_(0),
number_levels_(num_levels),
statistics_(statistics),
env_(env),
@ -116,6 +119,10 @@ class InternalStats {
stall_leveln_slowdown_count_[level] += micros;
}
uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
bool GetProperty(DBPropertyType property_type, const Slice& property,
std::string* value, VersionSet* version_set,
const MemTableList& imm);
@ -158,6 +165,13 @@ class InternalStats {
std::vector<uint64_t> stall_leveln_slowdown_;
std::vector<uint64_t> stall_leveln_slowdown_count_;
// Total number of background errors encountered. Every time a flush task
// or compaction task fails, this counter is incremented. The failure can
// be caused by any possible reason, including file system errors, out of
// resources, or input file corruption. Failing when retrying the same flush
// or compaction will cause the counter to increase too.
uint64_t bg_error_count_;
int number_levels_;
Statistics* statistics_;
Env* env_;