mirror of https://github.com/facebook/rocksdb.git
Add a perf level for measuring user thread block time (#12368)
Summary: Enabling time PerfCounter stats in RocksDB is currently very expensive, as it enables all sorts of relatively uninteresting stats, such as iteration, point lookup breakdown etc. This PR adds a new perf level between `kEnableCount` and `kEnableTimeExceptForMutex` to enable stats for time spent by user (i.e a RocksDB user) threads blocked by other RocksDB threads or events, such as a write group leader, write delay or stalls etc. It does not include time spent waiting to acquire mutexes, or waiting for IO. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12368 Test Plan: Add a unit test for write_thread_wait_nanos Reviewed By: ajkr Differential Revision: D54021583 Pulled By: anand1976 fbshipit-source-id: 3f6fcf71010132ffffca0391a5565f3b59fddd48
This commit is contained in:
parent
cb4f4381f6
commit
d9c0d44dab
|
@ -954,7 +954,7 @@ Status DBImpl::WriteImplWALOnly(
|
|||
}
|
||||
} else {
|
||||
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
||||
PERF_TIMER_GUARD(write_delay_time);
|
||||
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
|
||||
InstrumentedMutexLock lock(&mutex_);
|
||||
Status status =
|
||||
DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options);
|
||||
|
@ -1228,7 +1228,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|||
if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
|
||||
write_controller_.NeedsDelay()))) {
|
||||
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
||||
PERF_TIMER_GUARD(write_delay_time);
|
||||
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
|
||||
// We don't know size of curent batch so that we always use the size
|
||||
// for previous one. It might create a fairness issue that expiration
|
||||
// might happen for smaller writes but larger writes can go through.
|
||||
|
@ -2000,7 +2000,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
|
|||
// is that in case the write is heavy, low pri writes may never have
|
||||
// a chance to run. Now we guarantee we are still slowly making
|
||||
// progress.
|
||||
PERF_TIMER_GUARD(write_delay_time);
|
||||
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
|
||||
auto data_size = my_batch->GetDataSize();
|
||||
while (data_size > 0) {
|
||||
size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken(
|
||||
|
|
|
@ -269,6 +269,47 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
|
|||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
}
|
||||
|
||||
TEST_P(DBWriteTest, WriteThreadWaitNanosCounter) {
|
||||
Options options = GetOptions();
|
||||
std::vector<port::Thread> threads;
|
||||
|
||||
Reopen(options);
|
||||
|
||||
std::function<void()> write_func = [&]() {
|
||||
PerfContext* perf_ctx = get_perf_context();
|
||||
SetPerfLevel(PerfLevel::kEnableWait);
|
||||
perf_ctx->Reset();
|
||||
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc");
|
||||
ASSERT_OK(dbfull()->Put(WriteOptions(), "bar", "val2"));
|
||||
ASSERT_GT(perf_ctx->write_thread_wait_nanos, 1000000000U);
|
||||
};
|
||||
|
||||
std::function<void()> sleep_func = [&]() {
|
||||
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1");
|
||||
sleep(2);
|
||||
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2");
|
||||
};
|
||||
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
||||
{{"WriteThread::EnterAsBatchGroupLeader:End",
|
||||
"DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"},
|
||||
{"WriteThread::AwaitState:BlockingWaiting",
|
||||
"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"},
|
||||
{"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2",
|
||||
"WriteThread::ExitAsBatchGroupLeader:Start"}});
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
threads.emplace_back(sleep_func);
|
||||
threads.emplace_back(write_func);
|
||||
|
||||
ASSERT_OK(dbfull()->Put(WriteOptions(), "foo", "val1"));
|
||||
|
||||
for (auto& t : threads) {
|
||||
t.join();
|
||||
}
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
|
||||
TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
|
||||
constexpr int kNumThreads = 5;
|
||||
std::unique_ptr<FaultInjectionTestEnv> mock_env(
|
||||
|
|
|
@ -83,7 +83,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
|
|||
|
||||
// This is below the fast path, so that the stat is zero when all writes are
|
||||
// from the same thread.
|
||||
PERF_TIMER_GUARD(write_thread_wait_nanos);
|
||||
PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos);
|
||||
|
||||
// If we're only going to end up waiting a short period of time,
|
||||
// it can be a lot more efficient to call std::this_thread::yield()
|
||||
|
|
|
@ -18,13 +18,16 @@ enum PerfLevel : unsigned char {
|
|||
kUninitialized = 0, // unknown setting
|
||||
kDisable = 1, // disable perf stats
|
||||
kEnableCount = 2, // enable only count stats
|
||||
kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time
|
||||
kEnableWait = 3, // measure time spent by user threads
|
||||
// blocked in RocksDB, and not external
|
||||
// resources such as mutexes and IO
|
||||
kEnableTimeExceptForMutex = 4, // Other than count stats, also enable time
|
||||
// stats except for mutexes
|
||||
// Other than time, also measure CPU time counters. Still don't measure
|
||||
// time (neither wall time nor CPU time) for mutexes.
|
||||
kEnableTimeAndCPUTimeExceptForMutex = 4,
|
||||
kEnableTime = 5, // enable count and time stats
|
||||
kOutOfBounds = 6 // N.B. Must always be the last value!
|
||||
kEnableTimeAndCPUTimeExceptForMutex = 5,
|
||||
kEnableTime = 6, // enable count and time stats
|
||||
kOutOfBounds = 7 // N.B. Must always be the last value!
|
||||
};
|
||||
|
||||
// set the perf stats level for current thread
|
||||
|
|
|
@ -29,6 +29,7 @@ extern thread_local PerfContext perf_context;
|
|||
#define PERF_CPU_TIMER_GUARD(metric, clock)
|
||||
#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \
|
||||
ticker_type)
|
||||
#define PERF_TIMER_FOR_WAIT_GUARD(metric)
|
||||
#define PERF_TIMER_MEASURE(metric)
|
||||
#define PERF_COUNTER_ADD(metric, value)
|
||||
#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)
|
||||
|
@ -66,6 +67,11 @@ extern thread_local PerfContext perf_context;
|
|||
perf_step_timer_##metric.Start(); \
|
||||
}
|
||||
|
||||
#define PERF_TIMER_FOR_WAIT_GUARD(metric) \
|
||||
PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \
|
||||
false, PerfLevel::kEnableWait); \
|
||||
perf_step_timer_##metric.Start();
|
||||
|
||||
// Update metric with time elapsed since last START. start time is reset
|
||||
// to current timestamp.
|
||||
#define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Defined a new PerfLevel `kEnableWait` to measure time spent by user threads blocked in RocksDB other than mutex, such as a write thread waiting to be added to a write group, a write thread delayed or stalled etc.
|
Loading…
Reference in New Issue