mirror of https://github.com/facebook/rocksdb.git
Add a perf level for measuring user thread block time (#12368)
Summary: Enabling time PerfCounter stats in RocksDB is currently very expensive, as it turns on a host of relatively uninteresting stats, such as iteration and point lookup breakdowns. This PR adds a new perf level between `kEnableCount` and `kEnableTimeExceptForMutex` that enables stats for time spent by user (i.e., RocksDB application) threads blocked on other RocksDB threads or events, such as a write group leader, a write delay, or a stall. It does not include time spent waiting to acquire mutexes or waiting for IO.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12368

Test Plan: Add a unit test for write_thread_wait_nanos

Reviewed By: ajkr

Differential Revision: D54021583

Pulled By: anand1976

fbshipit-source-id: 3f6fcf71010132ffffca0391a5565f3b59fddd48
Parent: cb4f4381f6
Commit: d9c0d44dab
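Before the diff, a minimal usage sketch of the new level. It relies only on the public perf-context API (`SetPerfLevel`, `get_perf_context`, `PerfLevel::kEnableWait`, `write_thread_wait_nanos`, `write_delay_time`); the database path, key names, and thread count are illustrative, not part of the patch.

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  options.create_if_missing = true;
  Status s = DB::Open(options, "/tmp/rocksdb_wait_demo", &db);  // illustrative path
  assert(s.ok());

  auto writer = [db](int id) {
    // Perf levels are per-thread. kEnableWait enables only the counters that
    // measure time this thread spends blocked on other RocksDB threads or
    // events (write group, write delay/stall), not mutex or IO waits.
    SetPerfLevel(PerfLevel::kEnableWait);
    get_perf_context()->Reset();
    for (int i = 0; i < 1000; ++i) {
      Status ps = db->Put(WriteOptions(),
                          "key" + std::to_string(id) + "_" + std::to_string(i),
                          "value");
      assert(ps.ok());
    }
    std::cout << "write_thread_wait_nanos: "
              << get_perf_context()->write_thread_wait_nanos
              << " write_delay_time: "
              << get_perf_context()->write_delay_time << "\n";
    SetPerfLevel(PerfLevel::kDisable);
  };

  // Several concurrent writers, so some threads join write groups led by
  // others and accumulate write_thread_wait_nanos.
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) threads.emplace_back(writer, t);
  for (auto& t : threads) t.join();

  delete db;
  return 0;
}
```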
@@ -954,7 +954,7 @@ Status DBImpl::WriteImplWALOnly(
       }
     } else {
       PERF_TIMER_STOP(write_pre_and_post_process_time);
-      PERF_TIMER_GUARD(write_delay_time);
+      PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
       InstrumentedMutexLock lock(&mutex_);
       Status status =
           DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options);
@@ -1228,7 +1228,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
   if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
                                write_controller_.NeedsDelay()))) {
     PERF_TIMER_STOP(write_pre_and_post_process_time);
-    PERF_TIMER_GUARD(write_delay_time);
+    PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
     // We don't know size of curent batch so that we always use the size
     // for previous one. It might create a fairness issue that expiration
     // might happen for smaller writes but larger writes can go through.
@@ -2000,7 +2000,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
     // is that in case the write is heavy, low pri writes may never have
     // a chance to run. Now we guarantee we are still slowly making
     // progress.
-    PERF_TIMER_GUARD(write_delay_time);
+    PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
     auto data_size = my_batch->GetDataSize();
     while (data_size > 0) {
       size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken(
@@ -269,6 +269,47 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_P(DBWriteTest, WriteThreadWaitNanosCounter) {
+  Options options = GetOptions();
+  std::vector<port::Thread> threads;
+
+  Reopen(options);
+
+  std::function<void()> write_func = [&]() {
+    PerfContext* perf_ctx = get_perf_context();
+    SetPerfLevel(PerfLevel::kEnableWait);
+    perf_ctx->Reset();
+    TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc");
+    ASSERT_OK(dbfull()->Put(WriteOptions(), "bar", "val2"));
+    ASSERT_GT(perf_ctx->write_thread_wait_nanos, 1000000000U);
+  };
+
+  std::function<void()> sleep_func = [&]() {
+    TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1");
+    sleep(2);
+    TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2");
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"WriteThread::EnterAsBatchGroupLeader:End",
+        "DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"},
+       {"WriteThread::AwaitState:BlockingWaiting",
+        "DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"},
+       {"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2",
+        "WriteThread::ExitAsBatchGroupLeader:Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  threads.emplace_back(sleep_func);
+  threads.emplace_back(write_func);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo", "val1"));
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
   constexpr int kNumThreads = 5;
   std::unique_ptr<FaultInjectionTestEnv> mock_env(
@@ -83,7 +83,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
 
   // This is below the fast path, so that the stat is zero when all writes are
   // from the same thread.
-  PERF_TIMER_GUARD(write_thread_wait_nanos);
+  PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos);
 
   // If we're only going to end up waiting a short period of time,
   // it can be a lot more efficient to call std::this_thread::yield()
@@ -18,13 +18,16 @@ enum PerfLevel : unsigned char {
   kUninitialized = 0,             // unknown setting
   kDisable = 1,                   // disable perf stats
   kEnableCount = 2,               // enable only count stats
-  kEnableTimeExceptForMutex = 3,  // Other than count stats, also enable time
+  kEnableWait = 3,                // measure time spent by user threads
+                                  // blocked in RocksDB, and not external
+                                  // resources such as mutexes and IO
+  kEnableTimeExceptForMutex = 4,  // Other than count stats, also enable time
                                   // stats except for mutexes
   // Other than time, also measure CPU time counters. Still don't measure
   // time (neither wall time nor CPU time) for mutexes.
-  kEnableTimeAndCPUTimeExceptForMutex = 4,
-  kEnableTime = 5,  // enable count and time stats
-  kOutOfBounds = 6  // N.B. Must always be the last value!
+  kEnableTimeAndCPUTimeExceptForMutex = 5,
+  kEnableTime = 6,  // enable count and time stats
+  kOutOfBounds = 7  // N.B. Must always be the last value!
 };
 
 // set the perf stats level for current thread
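Since `kEnableWait` lands in the middle of the enum, the numeric values of `kEnableTimeExceptForMutex` and everything above it shift up by one. The sketch below (standalone, not from the patch) illustrates the practical consequence: comparisons against the named enumerators keep working, while anything that persisted the old raw integers needs updating.

```cpp
#include <cassert>

#include "rocksdb/perf_level.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  // Named comparisons are unaffected: every level that enables time stats
  // still compares >= kEnableWait, because the new level sits directly
  // below kEnableTimeExceptForMutex.
  assert(PerfLevel::kEnableTimeExceptForMutex >= PerfLevel::kEnableWait);
  assert(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex >= PerfLevel::kEnableWait);
  assert(PerfLevel::kEnableCount < PerfLevel::kEnableWait);

  // Raw values did change (e.g. kEnableTimeExceptForMutex is now 4, not 3),
  // so tools or configs that stored the integers must be adjusted.
  assert(static_cast<int>(PerfLevel::kEnableTimeExceptForMutex) == 4);
  return 0;
}
```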
@@ -29,6 +29,7 @@ extern thread_local PerfContext perf_context;
 #define PERF_CPU_TIMER_GUARD(metric, clock)
 #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \
                                                ticker_type)
+#define PERF_TIMER_FOR_WAIT_GUARD(metric)
 #define PERF_TIMER_MEASURE(metric)
 #define PERF_COUNTER_ADD(metric, value)
 #define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)
@@ -66,6 +67,11 @@ extern thread_local PerfContext perf_context;
     perf_step_timer_##metric.Start();                                       \
   }
 
+#define PERF_TIMER_FOR_WAIT_GUARD(metric)                                   \
+  PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr,   \
+                                         false, PerfLevel::kEnableWait);    \
+  perf_step_timer_##metric.Start();
+
 // Update metric with time elapsed since last START. start time is reset
 // to current timestamp.
 #define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();
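For context on the macro above: it differs from `PERF_TIMER_GUARD` only in the enable level handed to `PerfStepTimer`, so the timer runs already at `kEnableWait` instead of requiring the more expensive time levels. The following is a simplified, self-contained model of that gating; `ScopedWaitTimer`, `Level`, and `tls_perf_level` are illustrative names, not the real RocksDB classes.

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>

// Stand-in for the subset of PerfLevel values relevant here.
enum class Level : unsigned char { kEnableCount = 2, kEnableWait = 3 };

thread_local Level tls_perf_level = Level::kEnableCount;

// Minimal model of a scoped step timer: it adds elapsed wall-clock
// nanoseconds to *metric on destruction, but only if the thread's perf
// level is at or above the level the timer was armed with.
class ScopedWaitTimer {
 public:
  ScopedWaitTimer(uint64_t* metric, Level enable_level)
      : metric_(metric), enabled_(tls_perf_level >= enable_level) {
    if (enabled_) start_ = std::chrono::steady_clock::now();
  }
  ~ScopedWaitTimer() {
    if (enabled_) {
      *metric_ += std::chrono::duration_cast<std::chrono::nanoseconds>(
                      std::chrono::steady_clock::now() - start_)
                      .count();
    }
  }

 private:
  uint64_t* metric_;
  bool enabled_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  uint64_t write_thread_wait_nanos = 0;
  tls_perf_level = Level::kEnableWait;
  {
    // In RocksDB this scope would be the blocking wait, e.g. a writer
    // waiting to be added to a write group.
    ScopedWaitTimer timer(&write_thread_wait_nanos, Level::kEnableWait);
  }
  std::cout << "waited " << write_thread_wait_nanos << " ns\n";
  return 0;
}
```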
@@ -0,0 +1 @@
+Defined a new PerfLevel `kEnableWait` to measure time spent by user threads blocked in RocksDB other than mutex, such as a write thread waiting to be added to a write group, a write thread delayed or stalled etc.