Add a perf level for measuring user thread block time (#12368)

Summary:
Enabling time PerfCounter stats in RocksDB is currently very expensive, as it enables all sorts of relatively uninteresting stats, such as iteration, point lookup breakdown etc. This PR adds a new perf level between `kEnableCount` and `kEnableTimeExceptForMutex` to enable stats for time spent by user (i.e a RocksDB user) threads blocked by other RocksDB threads or events, such as a write group leader, write delay or stalls etc. It does not include time spent waiting to acquire mutexes, or waiting for IO.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12368

Test Plan: Add a unit test for write_thread_wait_nanos

Reviewed By: ajkr

Differential Revision: D54021583

Pulled By: anand1976

fbshipit-source-id: 3f6fcf71010132ffffca0391a5565f3b59fddd48
This commit is contained in:
anand76 2024-02-22 12:14:53 -08:00 committed by Facebook GitHub Bot
parent cb4f4381f6
commit d9c0d44dab
6 changed files with 59 additions and 8 deletions

View File

@ -954,7 +954,7 @@ Status DBImpl::WriteImplWALOnly(
} }
} else { } else {
PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time); PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
InstrumentedMutexLock lock(&mutex_); InstrumentedMutexLock lock(&mutex_);
Status status = Status status =
DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options); DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options);
@ -1228,7 +1228,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
write_controller_.NeedsDelay()))) { write_controller_.NeedsDelay()))) {
PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time); PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
// We don't know size of curent batch so that we always use the size // We don't know size of curent batch so that we always use the size
// for previous one. It might create a fairness issue that expiration // for previous one. It might create a fairness issue that expiration
// might happen for smaller writes but larger writes can go through. // might happen for smaller writes but larger writes can go through.
@ -2000,7 +2000,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
// is that in case the write is heavy, low pri writes may never have // is that in case the write is heavy, low pri writes may never have
// a chance to run. Now we guarantee we are still slowly making // a chance to run. Now we guarantee we are still slowly making
// progress. // progress.
PERF_TIMER_GUARD(write_delay_time); PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
auto data_size = my_batch->GetDataSize(); auto data_size = my_batch->GetDataSize();
while (data_size > 0) { while (data_size > 0) {
size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken( size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken(

View File

@ -269,6 +269,47 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
} }
TEST_P(DBWriteTest, WriteThreadWaitNanosCounter) {
Options options = GetOptions();
std::vector<port::Thread> threads;
Reopen(options);
std::function<void()> write_func = [&]() {
PerfContext* perf_ctx = get_perf_context();
SetPerfLevel(PerfLevel::kEnableWait);
perf_ctx->Reset();
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc");
ASSERT_OK(dbfull()->Put(WriteOptions(), "bar", "val2"));
ASSERT_GT(perf_ctx->write_thread_wait_nanos, 1000000000U);
};
std::function<void()> sleep_func = [&]() {
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1");
sleep(2);
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2");
};
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"WriteThread::EnterAsBatchGroupLeader:End",
"DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"},
{"WriteThread::AwaitState:BlockingWaiting",
"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"},
{"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2",
"WriteThread::ExitAsBatchGroupLeader:Start"}});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
threads.emplace_back(sleep_func);
threads.emplace_back(write_func);
ASSERT_OK(dbfull()->Put(WriteOptions(), "foo", "val1"));
for (auto& t : threads) {
t.join();
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
constexpr int kNumThreads = 5; constexpr int kNumThreads = 5;
std::unique_ptr<FaultInjectionTestEnv> mock_env( std::unique_ptr<FaultInjectionTestEnv> mock_env(

View File

@ -83,7 +83,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
// This is below the fast path, so that the stat is zero when all writes are // This is below the fast path, so that the stat is zero when all writes are
// from the same thread. // from the same thread.
PERF_TIMER_GUARD(write_thread_wait_nanos); PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos);
// If we're only going to end up waiting a short period of time, // If we're only going to end up waiting a short period of time,
// it can be a lot more efficient to call std::this_thread::yield() // it can be a lot more efficient to call std::this_thread::yield()

View File

@ -18,13 +18,16 @@ enum PerfLevel : unsigned char {
kUninitialized = 0, // unknown setting kUninitialized = 0, // unknown setting
kDisable = 1, // disable perf stats kDisable = 1, // disable perf stats
kEnableCount = 2, // enable only count stats kEnableCount = 2, // enable only count stats
kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time kEnableWait = 3, // measure time spent by user threads
// blocked in RocksDB, and not external
// resources such as mutexes and IO
kEnableTimeExceptForMutex = 4, // Other than count stats, also enable time
// stats except for mutexes // stats except for mutexes
// Other than time, also measure CPU time counters. Still don't measure // Other than time, also measure CPU time counters. Still don't measure
// time (neither wall time nor CPU time) for mutexes. // time (neither wall time nor CPU time) for mutexes.
kEnableTimeAndCPUTimeExceptForMutex = 4, kEnableTimeAndCPUTimeExceptForMutex = 5,
kEnableTime = 5, // enable count and time stats kEnableTime = 6, // enable count and time stats
kOutOfBounds = 6 // N.B. Must always be the last value! kOutOfBounds = 7 // N.B. Must always be the last value!
}; };
// set the perf stats level for current thread // set the perf stats level for current thread

View File

@ -29,6 +29,7 @@ extern thread_local PerfContext perf_context;
#define PERF_CPU_TIMER_GUARD(metric, clock) #define PERF_CPU_TIMER_GUARD(metric, clock)
#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \
ticker_type) ticker_type)
#define PERF_TIMER_FOR_WAIT_GUARD(metric)
#define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_MEASURE(metric)
#define PERF_COUNTER_ADD(metric, value) #define PERF_COUNTER_ADD(metric, value)
#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) #define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)
@ -66,6 +67,11 @@ extern thread_local PerfContext perf_context;
perf_step_timer_##metric.Start(); \ perf_step_timer_##metric.Start(); \
} }
#define PERF_TIMER_FOR_WAIT_GUARD(metric) \
PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \
false, PerfLevel::kEnableWait); \
perf_step_timer_##metric.Start();
// Update metric with time elapsed since last START. start time is reset // Update metric with time elapsed since last START. start time is reset
// to current timestamp. // to current timestamp.
#define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure(); #define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();

View File

@ -0,0 +1 @@
Defined a new PerfLevel `kEnableWait` to measure time spent by user threads blocked in RocksDB other than mutex, such as a write thread waiting to be added to a write group, a write thread delayed or stalled etc.