From affe01b0d53303ea8869b17a8ab85d77931810a2 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 23 Apr 2018 17:53:27 -0700 Subject: [PATCH] Improve write time breakdown stats Summary: There's a group of stats in PerfContext for profiling the write path. They break down the write time into WAL write, memtable insert, throttling, and everything else. We use these stats a lot for figuring out the cause of slow writes. These stats got a bit out of date and are now categorizing some interesting things as "everything else", and also do some double counting. This PR fixes it and adds two new stats: time spent waiting for other threads of the batch group, and time spent waiting for scheduling flushes/compactions. Probably these will be enough to explain all the occasional abnormally slow (multiple seconds) writes that we're seeing. Closes https://github.com/facebook/rocksdb/pull/3602 Differential Revision: D7251562 Pulled By: al13n321 fbshipit-source-id: 0a2d0f5a4fa5677455e1f566da931cb46efe2a0d --- HISTORY.md | 1 + db/db_impl_write.cc | 25 +++++++++++++++++++++++-- db/perf_context_test.cc | 24 +++++++++++++++++++++++- db/write_thread.cc | 5 +++++ include/rocksdb/perf_context.h | 17 ++++++++++++++--- monitoring/perf_context.cc | 4 ++++ 6 files changed, 70 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5dbf7b83df..5163b0222d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * The background thread naming convention changed (on supporting platforms) to "rocksdb:", e.g., "rocksdb:low0". * Add a new ticker stat rocksdb.number.multiget.keys.found to count number of keys successfully read in MultiGet calls +* Touch-up to write-related counters in PerfContext. New counters added: write_scheduling_flushes_compactions_time, write_thread_wait_nanos. Counters whose behavior was fixed or modified: write_memtable_time, write_pre_and_post_process_time, write_delay_time. ### New Features * Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data. diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 0e249cb406..c0416fdf92 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -122,15 +122,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_thread_.JoinBatchGroup(&w); if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { // we are a non-leader in a parallel group - PERF_TIMER_GUARD(write_memtable_time); if (w.ShouldWriteToMemtable()) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_memtable_time); + ColumnFamilyMemTablesImpl column_family_memtables( versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + + PERF_TIMER_START(write_pre_and_post_process_time); } if (write_thread_.CompleteParallelMemTableWriter(&w)) { @@ -190,7 +194,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // With concurrent writes we do preprocess only in the write thread that // also does write to memtable to avoid sync issue on shared data structure // with the other thread + + // PreprocessWrite does its own perf timing. 
+ PERF_TIMER_STOP(write_pre_and_post_process_time); + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + + PERF_TIMER_START(write_pre_and_post_process_time); } log::Writer* log_writer = logs_.back().writer; @@ -353,7 +363,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, MarkLogsSynced(logfile_number_, need_log_dir_sync, status); mutex_.Unlock(); // Requesting sync with two_write_queues_ is expected to be very rare. We - // hance provide a simple implementation that is not necessarily efficient. + // hence provide a simple implementation that is not necessarily efficient. if (two_write_queues_) { if (manual_wal_flush_) { status = FlushWAL(true); @@ -414,7 +424,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, mutex_.Lock(); bool need_log_sync = !write_options.disableWAL && write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); w.status = PreprocessWrite(write_options, &need_log_sync, &write_context); + PERF_TIMER_START(write_pre_and_post_process_time); log::Writer* log_writer = logs_.back().writer; mutex_.Unlock(); @@ -704,6 +717,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, assert(write_context != nullptr && need_log_sync != nullptr); Status status; + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); + assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && @@ -728,14 +743,19 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, status = ScheduleFlushes(write_context); } + PERF_TIMER_STOP(write_scheduling_flushes_compactions_time); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || write_controller_.NeedsDelay()))) { + PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_delay_time); // We don't know size of curent batch so that we always use the size // for previous one. It might create a fairness issue that expiration // might happen for smaller writes but larger writes can go through. // Can optimize it if it is an issue. status = DelayWrite(last_batch_group_size_, write_options); + PERF_TIMER_START(write_pre_and_post_process_time); } if (status.ok() && *need_log_sync) { @@ -1189,6 +1209,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, // is that in case the write is heavy, low pri writes may never have // a chance to run. Now we guarantee we are still slowly making // progress. 
+ PERF_TIMER_GUARD(write_delay_time); write_controller_.low_pri_rate_limiter()->Request( my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kWrite); diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index d06843a830..d8ee80a795 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -227,6 +227,9 @@ void ProfileQueries(bool enabled_time = false) { HistogramImpl hist_write_pre_post; HistogramImpl hist_write_wal_time; HistogramImpl hist_write_memtable_time; + HistogramImpl hist_write_delay_time; + HistogramImpl hist_write_thread_wait_nanos; + HistogramImpl hist_write_scheduling_time; uint64_t total_db_mutex_nanos = 0; @@ -270,9 +273,15 @@ void ProfileQueries(bool enabled_time = false) { ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); #endif } - hist_write_pre_post.Add(get_perf_context()->write_pre_and_post_process_time); + hist_write_pre_post.Add( + get_perf_context()->write_pre_and_post_process_time); hist_write_wal_time.Add(get_perf_context()->write_wal_time); hist_write_memtable_time.Add(get_perf_context()->write_memtable_time); + hist_write_delay_time.Add(get_perf_context()->write_delay_time); + hist_write_thread_wait_nanos.Add( + get_perf_context()->write_thread_wait_nanos); + hist_write_scheduling_time.Add( + get_perf_context()->write_scheduling_flushes_compactions_time); hist_put.Add(get_perf_context()->user_key_comparison_count); total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos; } @@ -320,6 +329,11 @@ void ProfileQueries(bool enabled_time = false) { << hist_write_wal_time.ToString() << "\n" << " Writing Mem Table time: \n" << hist_write_memtable_time.ToString() << "\n" + << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n" + << " Waiting for Batch time: \n" + << hist_write_thread_wait_nanos.ToString() << "\n" + << " Scheduling Flushes and Compactions Time: \n" + << hist_write_scheduling_time.ToString() << "\n" << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n"; std::cout << "Get(): Time to get snapshot: \n" @@ -359,6 +373,14 @@ void ProfileQueries(bool enabled_time = false) { ASSERT_GT(hist_mget_files.Average(), 0); ASSERT_GT(hist_mget_post_process.Average(), 0); ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + + EXPECT_GT(hist_write_pre_post.Average(), 0); + EXPECT_GT(hist_write_wal_time.Average(), 0); + EXPECT_GT(hist_write_memtable_time.Average(), 0); + EXPECT_EQ(hist_write_delay_time.Average(), 0); + EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0); + EXPECT_GT(hist_write_scheduling_time.Average(), 0); + #ifndef NDEBUG ASSERT_GT(total_db_mutex_nanos, 2000U); #endif diff --git a/db/write_thread.cc b/db/write_thread.cc index a44f028cec..f35e0dd21a 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -7,6 +7,7 @@ #include #include #include "db/column_family.h" +#include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/random.h" #include "util/sync_point.h" @@ -73,6 +74,10 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, port::AsmVolatilePause(); } + // This is below the fast path, so that the stat is zero when all writes are + // from the same thread. + PERF_TIMER_GUARD(write_thread_wait_nanos); + // If we're only going to end up waiting a short period of time, // it can be a lot more efficient to call std::this_thread::yield() // in a loop than to block in StateMutex(). 
For reference, on my 4.0 diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3bb7f5c0b3..c3d61a3f62 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -95,16 +95,27 @@ struct PerfContext { // total nanos spent on iterating internal entries to find the next user entry uint64_t find_next_user_entry_time; + // This group of stats provide a breakdown of time spent by Write(). + // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write + // are enabled. + // // total nanos spent on writing to WAL uint64_t write_wal_time; // total nanos spent on writing to mem tables uint64_t write_memtable_time; - // total nanos spent on delaying write + // total nanos spent on delaying or throttling write uint64_t write_delay_time; - // total nanos spent on writing a record, excluding the above three times + // total nanos spent on switching memtable/wal and scheduling + // flushes/compactions. + uint64_t write_scheduling_flushes_compactions_time; + // total nanos spent on writing a record, excluding the above four things uint64_t write_pre_and_post_process_time; - uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex. + // time spent waiting for other threads of the batch group + uint64_t write_thread_wait_nanos; + + // time spent on acquiring DB mutex. + uint64_t db_mutex_lock_nanos; // Time spent on waiting with a condition variable created with DB mutex. uint64_t db_condition_wait_nanos; // Time spent on merge operator. diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index a80b9e470a..9bba841f8f 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -66,6 +66,8 @@ void PerfContext::Reset() { write_pre_and_post_process_time = 0; write_memtable_time = 0; write_delay_time = 0; + write_thread_wait_nanos = 0; + write_scheduling_flushes_compactions_time = 0; db_mutex_lock_nanos = 0; db_condition_wait_nanos = 0; merge_operator_time_nanos = 0; @@ -146,6 +148,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(find_next_user_entry_time); PERF_CONTEXT_OUTPUT(write_pre_and_post_process_time); PERF_CONTEXT_OUTPUT(write_memtable_time); + PERF_CONTEXT_OUTPUT(write_thread_wait_nanos); + PERF_CONTEXT_OUTPUT(write_scheduling_flushes_compactions_time); PERF_CONTEXT_OUTPUT(db_mutex_lock_nanos); PERF_CONTEXT_OUTPUT(db_condition_wait_nanos); PERF_CONTEXT_OUTPUT(merge_operator_time_nanos);
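
Usage sketch (not part of the patch): the snippet below shows how the write-path counters touched here can be read from application code. It is a minimal illustration that assumes a RocksDB build containing this change; the database path, options, and key/value are made up for the example.

    // Minimal sketch: read the write-path breakdown counters after a Put().
    // Assumes a RocksDB build that includes this patch; the path and options
    // below are illustrative only.
    #include <iostream>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/perf_context.h"
    #include "rocksdb/perf_level.h"

    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/perf_context_demo", &db);
      if (!s.ok()) {
        std::cerr << s.ToString() << std::endl;
        return 1;
      }

      // Timing counters are only populated at kEnableTime or higher.
      rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
      rocksdb::get_perf_context()->Reset();

      s = db->Put(rocksdb::WriteOptions(), "key", "value");

      const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
      std::cout << "write_wal_time: " << ctx->write_wal_time << "\n"
                << "write_memtable_time: " << ctx->write_memtable_time << "\n"
                << "write_delay_time: " << ctx->write_delay_time << "\n"
                << "write_scheduling_flushes_compactions_time: "
                << ctx->write_scheduling_flushes_compactions_time << "\n"
                << "write_pre_and_post_process_time: "
                << ctx->write_pre_and_post_process_time << "\n"
                << "write_thread_wait_nanos: " << ctx->write_thread_wait_nanos
                << std::endl;

      rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
      delete db;
      return 0;
    }

All of these counters are in nanoseconds. With a single writer thread, write_thread_wait_nanos is expected to stay at zero (the timer sits below the write-group fast path), and write_delay_time stays at zero unless the write controller is stopping or delaying writes, or low-priority writes are being rate limited.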