diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f7238e69e..598c728154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -748,6 +748,7 @@ set(SOURCES db/write_batch.cc db/write_batch_base.cc db/write_controller.cc + db/write_stall_stats.cc db/write_thread.cc env/composite_env.cc env/env.cc diff --git a/HISTORY.md b/HISTORY.md index 589085a376..7f2c425cbb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,7 @@ * Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries. * Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} * Add support for SecondaryCache with HyperClockCache (`HyperClockCacheOptions` inherits `secondary_cache` option from `ShardedCacheOptions`) +* Add new db properties `rocksdb.cf-write-stall-stats`, `rocksdb.db-write-stall-stats`and APIs to examine them in a structured way. In particular, users of `GetMapProperty()` with property `kCFWriteStallStats`/`kDBWriteStallStats` can now use the functions in `WriteStallStatsMapKeys` to find stats in the map. ### Public API Changes * Changed various functions and features in `Cache` that are mostly relevant to custom implementations or wrappers. Especially, asychronous lookup functionality is moved from `Lookup()` to a new `StartAsyncLookup()` function. diff --git a/TARGETS b/TARGETS index b6b4eee786..2514e09a7c 100644 --- a/TARGETS +++ b/TARGETS @@ -106,6 +106,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/write_batch.cc", "db/write_batch_base.cc", "db/write_controller.cc", + "db/write_stall_stats.cc", "db/write_thread.cc", "env/composite_env.cc", "env/env.cc", @@ -451,6 +452,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "db/write_batch.cc", "db/write_batch_base.cc", "db/write_controller.cc", + "db/write_stall_stats.cc", "db/write_thread.cc", "env/composite_env.cc", "env/env.cc", diff --git a/db/column_family.cc b/db/column_family.cc index a17ab665aa..b3d04dc6a1 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -869,7 +869,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, } } // anonymous namespace -std::pair +std::pair ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, @@ -942,7 +942,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); + InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION, + 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", @@ -963,7 +964,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " @@ -981,11 +982,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS, - 1); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); + InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION, + 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " @@ -1011,7 +1012,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats( - InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " diff --git a/db/column_family.h b/db/column_family.h index 44c04cc9aa..9ec093010d 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -462,12 +462,6 @@ class ColumnFamilyData { bool queued_for_flush() { return queued_for_flush_; } bool queued_for_compaction() { return queued_for_compaction_; } - enum class WriteStallCause { - kNone, - kMemtableLimit, - kL0FileCountLimit, - kPendingCompactionBytes, - }; static std::pair GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 25dcec7272..89a054e4c0 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1211,6 +1211,9 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // exceeded at this point so no new write (including current one) will go // through until memory usage is decreased. if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, 1, + true /* concurrent */); if (write_options.no_slowdown) { status = Status::Incomplete("Write stall"); } else { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index de63def829..2c843a9749 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -13,6 +13,7 @@ #include #include "db/db_test_util.h" +#include "db/write_stall_stats.h" #include "options/cf_options.h" #include "port/stack_trace.h" #include "rocksdb/listener.h" @@ -2106,6 +2107,180 @@ TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) { ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size()); } +TEST_F(DBPropertiesTest, WriteStallStatsSanityCheck) { + for (uint32_t i = 0; i < static_cast(WriteStallCause::kNone); ++i) { + std::string str = kWriteStallCauseToHyphenString[i]; + ASSERT_TRUE(!str.empty()) + << "Please ensure mapping from `WriteStallCause` to " + "`kWriteStallCauseToHyphenString` is complete"; + WriteStallCause cause = static_cast(i); + if (cause == WriteStallCause::kCFScopeWriteStallCauseEnumMax || + cause == WriteStallCause::kDBScopeWriteStallCauseEnumMax) { + ASSERT_EQ(str, kInvalidWriteStallCauseHyphenString) + << "Please ensure order in `kWriteStallCauseToHyphenString` is " + "consistent with `WriteStallCause`"; + } + } + + for (uint32_t i = 0; i < static_cast(WriteStallCondition::kNormal); + ++i) { + std::string str = kWriteStallConditionToHyphenString[i]; + ASSERT_TRUE(!str.empty()) + << "Please ensure mapping from `WriteStallCondition` to " + "`kWriteStallConditionToHyphenString` is complete"; + } + + for (uint32_t i = 0; i < static_cast(WriteStallCause::kNone); ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + + if (isCFScopeWriteStallCause(cause)) { + ASSERT_TRUE(InternalCFStat(cause, condition) != + InternalStats::INTERNAL_CF_STATS_ENUM_MAX) + << "Please ensure the combination of WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") + WriteStallCondition(" + + std::to_string(static_cast(condition)) + + ") is correctly mapped to a valid `InternalStats` or bypass " + "its check in this test"; + } else if (isDBScopeWriteStallCause(cause)) { + InternalStats::InternalDBStatsType internal_db_stat = + InternalDBStat(cause, condition); + if (internal_db_stat == InternalStats::kIntStatsNumMax) { + ASSERT_TRUE(cause == WriteStallCause::kWriteBufferManagerLimit && + condition == WriteStallCondition::kDelayed) + << "Please ensure the combination of WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") + WriteStallCondition(" + + std::to_string(static_cast(condition)) + + ") is correctly mapped to a valid `InternalStats` or " + "bypass its check in this test"; + } + } else if (cause != WriteStallCause::kCFScopeWriteStallCauseEnumMax && + cause != WriteStallCause::kDBScopeWriteStallCauseEnumMax) { + ASSERT_TRUE(false) << "Please ensure the WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") is either CF-scope or DB-scope write " + "stall cause in enum `WriteStallCause`"; + } + } + } +} +TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"heavy_write_cf"}, options); + + for (auto test_cause : {WriteStallCause::kWriteBufferManagerLimit, + WriteStallCause::kMemtableLimit}) { + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } else if (test_cause == WriteStallCause::kMemtableLimit) { + options.max_write_buffer_number = 2; + options.disable_auto_compactions = true; + } + ReopenWithColumnFamilies({"default", "heavy_write_cf"}, options); + + // Assert initial write stall stats are all 0 + std::map db_values; + ASSERT_TRUE(dbfull()->GetMapProperty(DB::Properties::kDBWriteStallStats, + &db_values)); + ASSERT_EQ(std::stoi(db_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)]), + 0); + + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + ASSERT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + ASSERT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), 0); + ASSERT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), 0); + } + + // Pause flush thread to help coerce write stall + std::unique_ptr sleeping_task( + new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::HIGH); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task.get(), Env::Priority::HIGH); + sleeping_task->WaitUntilSleeping(); + + // Coerce write stall + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + ASSERT_OK(dbfull()->Put( + WriteOptions(), handles_[1], Key(1), + DummyString(options.write_buffer_manager->buffer_size()))); + + WriteOptions wo; + wo.no_slowdown = true; + Status s = dbfull()->Put( + wo, handles_[1], Key(2), + DummyString(options.write_buffer_manager->buffer_size())); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + } else if (test_cause == WriteStallCause::kMemtableLimit) { + FlushOptions fo; + fo.allow_write_stall = true; + fo.wait = false; + + ASSERT_OK( + dbfull()->Put(WriteOptions(), handles_[1], Key(1), DummyString(1))); + ASSERT_OK(dbfull()->Flush(fo, handles_[1])); + + ASSERT_OK( + dbfull()->Put(WriteOptions(), handles_[1], Key(2), DummyString(1))); + ASSERT_OK(dbfull()->Flush(fo, handles_[1])); + } + + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + db_values.clear(); + EXPECT_TRUE(dbfull()->GetMapProperty(DB::Properties::kDBWriteStallStats, + &db_values)); + EXPECT_EQ(std::stoi(db_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)]), + 1); + // `WriteStallCause::kWriteBufferManagerLimit` should not result in any + // CF-scope write stall stats changes + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + EXPECT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), + 0); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), + 0); + } + } else if (test_cause == WriteStallCause::kMemtableLimit) { + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + EXPECT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), + cf == 1 ? 1 : 0); + EXPECT_EQ( + std::stoi(cf_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kMemtableLimit, + WriteStallCondition::kStopped)]), + cf == 1 ? 1 : 0); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), + 0); + EXPECT_EQ( + std::stoi(cf_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kMemtableLimit, + WriteStallCondition::kDelayed)]), + 0); + } + } + + sleeping_task->WakeUp(); + sleeping_task->WaitUntilDone(); + } +} + namespace { std::string PopMetaIndexKey(InternalIterator* meta_iter) { Status s = meta_iter->status(); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 0fa565ee04..5b76a7883a 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -23,6 +23,7 @@ #include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" +#include "db/write_stall_stats.h" #include "port/port.h" #include "rocksdb/system_clock.h" #include "rocksdb/table.h" @@ -77,6 +78,10 @@ const std::map DBStatInfo{"db.user_writes_with_wal"}}, {InternalStats::kIntStatsWriteStallMicros, DBStatInfo{"db.user_write_stall_micros"}}, + {InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, + DBStatInfo{WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)}}, }; namespace { @@ -243,7 +248,9 @@ static const std::string cfstats = "cfstats"; static const std::string cfstats_no_file_histogram = "cfstats-no-file-histogram"; static const std::string cf_file_histogram = "cf-file-histogram"; +static const std::string cf_write_stall_stats = "cf-write-stall-stats"; static const std::string dbstats = "dbstats"; +static const std::string db_write_stall_stats = "db-write-stall-stats"; static const std::string levelstats = "levelstats"; static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string fast_block_cache_entry_stats = @@ -323,6 +330,10 @@ const std::string DB::Properties::kCFStatsNoFileHistogram = rocksdb_prefix + cfstats_no_file_histogram; const std::string DB::Properties::kCFFileHistogram = rocksdb_prefix + cf_file_histogram; +const std::string DB::Properties::kCFWriteStallStats = + rocksdb_prefix + cf_write_stall_stats; +const std::string DB::Properties::kDBWriteStallStats = + rocksdb_prefix + db_write_stall_stats; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; const std::string DB::Properties::kBlockCacheEntryStats = @@ -450,9 +461,15 @@ const UnorderedMap {DB::Properties::kCFFileHistogram, {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, nullptr}}, + {DB::Properties::kCFWriteStallStats, + {false, &InternalStats::HandleCFWriteStallStats, nullptr, + &InternalStats::HandleCFWriteStallStatsMap, nullptr}}, {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr, &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kDBWriteStallStats, + {false, &InternalStats::HandleDBWriteStallStats, nullptr, + &InternalStats::HandleDBWriteStallStatsMap, nullptr}}, {DB::Properties::kBlockCacheEntryStats, {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, @@ -1087,6 +1104,18 @@ bool InternalStats::HandleCFFileHistogram(std::string* value, return true; } +bool InternalStats::HandleCFWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpCFStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleCFWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpCFMapStatsWriteStall(value); + return true; +} + bool InternalStats::HandleDBMapStats( std::map* db_stats, Slice /*suffix*/) { DumpDBMapStats(db_stats); @@ -1098,6 +1127,18 @@ bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { return true; } +bool InternalStats::HandleDBWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpDBStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleDBWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpDBMapStatsWriteStall(value); + return true; +} + bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { auto* current = cfd_->current(); *value = current->DebugString(true, true); @@ -1580,6 +1621,10 @@ void InternalStats::DumpDBStats(std::string* value) { 10000.0 / std::max(interval_seconds_up, 0.001)); value->append(buf); + std::string write_stall_stats; + DumpDBStatsWriteStall(&write_stall_stats); + value->append(write_stall_stats); + db_stats_snapshot_.seconds_up = seconds_up; db_stats_snapshot_.ingest_bytes = user_bytes_written; db_stats_snapshot_.write_other = write_other; @@ -1591,6 +1636,50 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.write_stall_micros = write_stall_micros; } +void InternalStats::DumpDBMapStatsWriteStall( + std::map* value) { + constexpr uint32_t max_db_scope_write_stall_cause = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax); + + for (uint32_t i = + max_db_scope_write_stall_cause - kNumDBScopeWriteStallCauses; + i < max_db_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalDBStatsType internal_db_stat = + InternalDBStat(cause, condition); + + if (internal_db_stat == InternalStats::kIntStatsNumMax) { + continue; + } + + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + db_stats_[static_cast(internal_db_stat)].load( + std::memory_order_relaxed); + (*value)[name] = std::to_string(stat); + } + } +} + +void InternalStats::DumpDBStatsWriteStall(std::string* value) { + assert(value); + + std::map write_stall_stats_map; + DumpDBMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for (const auto& name_and_stat : write_stall_stats_map) { + str << name_and_stat.first << ": " << name_and_stat.second << ", "; + } + *value = str.str(); +} + /** * Dump Compaction Level stats to a map of stat name with "compaction." prefix * to value in double as string. The level in stat name is represented with @@ -1617,7 +1706,7 @@ void InternalStats::DumpCFMapStats( } } - DumpCFMapStatsIOStalls(cf_stats); + DumpCFMapStatsWriteStall(cf_stats); } void InternalStats::DumpCFMapStats( @@ -1709,36 +1798,81 @@ void InternalStats::DumpCFMapStatsByPriority( } } -void InternalStats::DumpCFMapStatsIOStalls( - std::map* cf_stats) { - (*cf_stats)["io_stalls.level0_slowdown"] = - std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] = - std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.level0_numfiles"] = - std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]); - (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] = - std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]); - (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] = - std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]); - (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] = - std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.memtable_compaction"] = - std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]); - (*cf_stats)["io_stalls.memtable_slowdown"] = - std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]); +void InternalStats::DumpCFMapStatsWriteStall( + std::map* value) { + uint64_t total_delays = 0; + uint64_t total_stops = 0; + constexpr uint32_t max_cf_scope_write_stall_cause = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax); - uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_STOPS]; + for (uint32_t i = + max_cf_scope_write_stall_cause - kNumCFScopeWriteStallCauses; + i < max_cf_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalCFStatsType internal_cf_stat = + InternalCFStat(cause, condition); - uint64_t total_slowdown = - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; + if (internal_cf_stat == InternalStats::INTERNAL_CF_STATS_ENUM_MAX) { + continue; + } - (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop); - (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown); + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + cf_stats_count_[static_cast(internal_cf_stat)]; + (*value)[name] = std::to_string(stat); + + if (condition == WriteStallCondition::kDelayed) { + total_delays += stat; + } else if (condition == WriteStallCondition::kStopped) { + total_stops += stat; + } + } + } + + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitDelaysWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION]); + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitStopsWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION]); + + (*value)[WriteStallStatsMapKeys::TotalStops()] = std::to_string(total_stops); + (*value)[WriteStallStatsMapKeys::TotalDelays()] = + std::to_string(total_delays); +} + +void InternalStats::DumpCFStatsWriteStall(std::string* value, + uint64_t* total_stall_count) { + assert(value); + + std::map write_stall_stats_map; + DumpCFMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for (const auto& name_and_stat : write_stall_stats_map) { + str << name_and_stat.first << ": " << name_and_stat.second << ", "; + } + + if (total_stall_count) { + *total_stall_count = + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalStops()]) + + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalDelays()]); + if (*total_stall_count > 0) { + str << "interval: " << *total_stall_count - cf_stats_snapshot_.stall_count + << " total count\n"; + } + } + *value = str.str(); } void InternalStats::DumpCFStats(std::string* value) { @@ -1776,14 +1910,6 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, uint64_t ingest_l0_files_addfile = cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL]; uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL]; - // Cumulative summary - uint64_t total_stall_count = - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + - cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; // Interval summary uint64_t interval_flush_ingest = flush_ingest - cf_stats_snapshot_.ingest_bytes_flush; @@ -1905,34 +2031,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, cf_stats_snapshot_.compact_micros = compact_micros; } - snprintf(buf, sizeof(buf), - "Stalls(count): %" PRIu64 - " level0_slowdown, " - "%" PRIu64 - " level0_slowdown_with_compaction, " - "%" PRIu64 - " level0_numfiles, " - "%" PRIu64 - " level0_numfiles_with_compaction, " - "%" PRIu64 - " stop for pending_compaction_bytes, " - "%" PRIu64 - " slowdown for pending_compaction_bytes, " - "%" PRIu64 - " memtable_compaction, " - "%" PRIu64 - " memtable_slowdown, " - "interval %" PRIu64 " total count\n", - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS], - cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS], - cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS], - cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS], - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS], - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS], - cf_stats_count_[MEMTABLE_LIMIT_STOPS], - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS], - total_stall_count - cf_stats_snapshot_.stall_count); - value->append(buf); + std::string write_stall_stats; + uint64_t total_stall_count; + DumpCFStatsWriteStall(&write_stall_stats, &total_stall_count); + value->append(write_stall_stats); if (is_periodic) { cf_stats_snapshot_.seconds_up = seconds_up; diff --git a/db/internal_stats.h b/db/internal_stats.h index feb72dd740..7a600384a7 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -104,15 +104,20 @@ class InternalStats { static const std::map compaction_level_stats; enum InternalCFStatsType { - L0_FILE_COUNT_LIMIT_SLOWDOWNS, - LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, + MEMTABLE_LIMIT_DELAYS, MEMTABLE_LIMIT_STOPS, - MEMTABLE_LIMIT_SLOWDOWNS, + L0_FILE_COUNT_LIMIT_DELAYS, L0_FILE_COUNT_LIMIT_STOPS, - LOCKED_L0_FILE_COUNT_LIMIT_STOPS, - PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, + PENDING_COMPACTION_BYTES_LIMIT_DELAYS, PENDING_COMPACTION_BYTES_LIMIT_STOPS, + // Write slowdown caused by l0 file count limit while there is ongoing L0 + // compaction + L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION, + // Write stop caused by l0 file count limit while there is ongoing L0 + // compaction + L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION, WRITE_STALLS_ENUM_MAX, + // End of all write stall stats BYTES_FLUSHED, BYTES_INGESTED_ADD_FILE, INGESTED_NUM_FILES_TOTAL, @@ -129,7 +134,18 @@ class InternalStats { kIntStatsWriteDoneByOther, kIntStatsWriteDoneBySelf, kIntStatsWriteWithWal, + // TODO(hx235): Currently `kIntStatsWriteStallMicros` only measures + // "delayed" time of CF-scope write stalls, not including the "stopped" time + // nor any DB-scope write stalls (e.g, ones triggered by + // `WriteBufferManager`). + // + // However, the word "write stall" includes both "delayed" and "stopped" + // (see `WriteStallCondition`) and DB-scope writes stalls (see + // `WriteStallCause`). + // + // So we should improve, rename or clarify it kIntStatsWriteStallMicros, + kIntStatsWriteBufferManagerLimitStopsCounts, kIntStatsNumMax, }; @@ -599,6 +615,10 @@ class InternalStats { private: void DumpDBMapStats(std::map* db_stats); void DumpDBStats(std::string* value); + + void DumpDBMapStatsWriteStall(std::map* value); + void DumpDBStatsWriteStall(std::string* value); + void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( const VersionStorageInfo* vstorage, @@ -606,7 +626,6 @@ class InternalStats { CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( std::map>* priorities_stats); - void DumpCFMapStatsIOStalls(std::map* cf_stats); void DumpCFStats(std::string* value); // if is_periodic = true, it is an internal call by RocksDB periodically to // dump the status. @@ -615,6 +634,10 @@ class InternalStats { // dump the status. void DumpCFFileHistogram(std::string* value); + void DumpCFMapStatsWriteStall(std::map* value); + void DumpCFStatsWriteStall(std::string* value, + uint64_t* total_stall_count = nullptr); + Cache* GetBlockCacheForStats(); Cache* GetBlobCacheForStats(); @@ -648,7 +671,7 @@ class InternalStats { // ColumnFamily-level stats CompactionStats comp_stats; uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) - uint64_t stall_count; // Stall count + uint64_t stall_count; // Total counts of CF-scope write stalls // Stats from compaction jobs - bytes written, bytes read, duration. uint64_t compact_bytes_write; uint64_t compact_bytes_read; @@ -743,9 +766,15 @@ class InternalStats { bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); bool HandleCFStatsPeriodic(std::string* value, Slice suffix); + bool HandleCFWriteStallStats(std::string* value, Slice suffix); + bool HandleCFWriteStallStatsMap(std::map* values, + Slice suffix); bool HandleDBMapStats(std::map* compaction_stats, Slice suffix); bool HandleDBStats(std::string* value, Slice suffix); + bool HandleDBWriteStallStats(std::string* value, Slice suffix); + bool HandleDBWriteStallStatsMap(std::map* values, + Slice suffix); bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); diff --git a/db/write_stall_stats.cc b/db/write_stall_stats.cc new file mode 100644 index 0000000000..3143531e72 --- /dev/null +++ b/db/write_stall_stats.cc @@ -0,0 +1,159 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/write_stall_stats.h" + +namespace ROCKSDB_NAMESPACE { +const std::string kInvalidWriteStallCauseHyphenString = "invalid"; + +const std::array(WriteStallCause::kNone)> + kWriteStallCauseToHyphenString{{ + "memtable-limit", + "l0-file-count-limit", + "pending-compaction-bytes", + // WriteStallCause::kCFScopeWriteStallCauseEnumMax + kInvalidWriteStallCauseHyphenString, + "write-buffer-manager-limit", + // WriteStallCause::kDBScopeWriteStallCauseEnumMax + kInvalidWriteStallCauseHyphenString, + }}; + +const std::array(WriteStallCondition::kNormal)> + kWriteStallConditionToHyphenString{{ + "delays", + "stops", + }}; + +InternalStats::InternalCFStatsType InternalCFStat( + WriteStallCause cause, WriteStallCondition condition) { + switch (cause) { + case WriteStallCause::kMemtableLimit: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::MEMTABLE_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::MEMTABLE_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + case WriteStallCause::kL0FileCountLimit: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::L0_FILE_COUNT_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::L0_FILE_COUNT_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + case WriteStallCause::kPendingCompactionBytes: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + default: + break; + } + return InternalStats::INTERNAL_CF_STATS_ENUM_MAX; +} + +InternalStats::InternalDBStatsType InternalDBStat( + WriteStallCause cause, WriteStallCondition condition) { + switch (cause) { + case WriteStallCause::kWriteBufferManagerLimit: { + switch (condition) { + case WriteStallCondition::kStopped: + return InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts; + default: + break; + } + break; + } + default: + break; + } + return InternalStats::kIntStatsNumMax; +} + +bool isCFScopeWriteStallCause(WriteStallCause cause) { + uint32_t int_cause = static_cast(cause); + uint32_t lower_bound = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + kNumCFScopeWriteStallCauses; + uint32_t upper_bound = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + 1; + return lower_bound <= int_cause && int_cause <= upper_bound; +} + +bool isDBScopeWriteStallCause(WriteStallCause cause) { + uint32_t int_cause = static_cast(cause); + uint32_t lower_bound = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + kNumDBScopeWriteStallCauses; + uint32_t upper_bound = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + 1; + return lower_bound <= int_cause && int_cause <= upper_bound; +} + +const std::string& WriteStallStatsMapKeys::TotalStops() { + static const std::string kTotalStops = "total-stops"; + return kTotalStops; +} + +const std::string& WriteStallStatsMapKeys::TotalDelays() { + static const std::string kTotalDelays = "total-delays"; + return kTotalDelays; +} + +const std::string& +WriteStallStatsMapKeys::CFL0FileCountLimitDelaysWithOngoingCompaction() { + static const std::string ret = + "cf-l0-file-count-limit-delays-with-ongoing-compaction"; + return ret; +} + +const std::string& +WriteStallStatsMapKeys::CFL0FileCountLimitStopsWithOngoingCompaction() { + static const std::string ret = + "cf-l0-file-count-limit-stops-with-ongoing-compaction"; + return ret; +} + +std::string WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause cause, WriteStallCondition condition) { + std::string cause_condition_count_name; + + std::string cause_name; + if (isCFScopeWriteStallCause(cause) || isDBScopeWriteStallCause(cause)) { + cause_name = kWriteStallCauseToHyphenString[static_cast(cause)]; + } else { + assert(false); + return ""; + } + + const std::string& condition_name = + kWriteStallConditionToHyphenString[static_cast(condition)]; + + cause_condition_count_name.reserve(cause_name.size() + 1 + + condition_name.size()); + cause_condition_count_name.append(cause_name); + cause_condition_count_name.append("-"); + cause_condition_count_name.append(condition_name); + + return cause_condition_count_name; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/db/write_stall_stats.h b/db/write_stall_stats.h new file mode 100644 index 0000000000..9ae518a079 --- /dev/null +++ b/db/write_stall_stats.h @@ -0,0 +1,50 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/internal_stats.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { +extern const std::string kInvalidWriteStallCauseHyphenString; + +extern const std::array(WriteStallCause::kNone)> + kWriteStallCauseToHyphenString; + +extern const std::array(WriteStallCondition::kNormal)> + kWriteStallConditionToHyphenString; + +// REQUIRES: +// cause` is CF-scope `WriteStallCause`, see `WriteStallCause` for more +// +// REQUIRES: +// `condition` != `WriteStallCondition::kNormal` +extern InternalStats::InternalCFStatsType InternalCFStat( + WriteStallCause cause, WriteStallCondition condition); + +// REQUIRES: +// cause` is DB-scope `WriteStallCause`, see `WriteStallCause` for more +// +// REQUIRES: +// `condition` != `WriteStallCondition::kNormal` +extern InternalStats::InternalDBStatsType InternalDBStat( + WriteStallCause cause, WriteStallCondition condition); + +extern bool isCFScopeWriteStallCause(WriteStallCause cause); +extern bool isDBScopeWriteStallCause(WriteStallCause cause); + +constexpr uint32_t kNumCFScopeWriteStallCauses = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + static_cast(WriteStallCause::kMemtableLimit); + +constexpr uint32_t kNumDBScopeWriteStallCauses = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + static_cast(WriteStallCause::kWriteBufferManagerLimit); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 6949fadcfd..6539eb8aeb 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -953,6 +953,18 @@ class DB { // level, as well as the histogram of latency of single requests. static const std::string kCFFileHistogram; + // "rocksdb.cf-write-stall-stats" - returns a multi-line string or + // map with statistics on CF-scope write stalls for a given CF + // See`WriteStallStatsMapKeys` for structured representation of keys + // available in the map form. + static const std::string kCFWriteStallStats; + + // "rocksdb.db-write-stall-stats" - returns a multi-line string or + // map with statistics on DB-scope write stalls + // See`WriteStallStatsMapKeys` for structured representation of keys + // available in the map form. + static const std::string kDBWriteStallStats; + // "rocksdb.dbstats" - As a string property, returns a multi-line string // with general database stats, both cumulative (over the db's // lifetime) and interval (since the last retrieval of kDBStats). @@ -1873,6 +1885,24 @@ class DB { } }; +struct WriteStallStatsMapKeys { + static const std::string& TotalStops(); + static const std::string& TotalDelays(); + + static const std::string& CFL0FileCountLimitDelaysWithOngoingCompaction(); + static const std::string& CFL0FileCountLimitStopsWithOngoingCompaction(); + + // REQUIRES: + // `cause` isn't any of these: `WriteStallCause::kNone`, + // `WriteStallCause::kCFScopeWriteStallCauseEnumMax`, + // `WriteStallCause::kDBScopeWriteStallCauseEnumMax` + // + // REQUIRES: + // `condition` isn't any of these: `WriteStallCondition::kNormal` + static std::string CauseConditionCount(WriteStallCause cause, + WriteStallCondition condition); +}; + // Overloaded operators for enum class SizeApproximationFlags. inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs, DB::SizeApproximationFlags rhs) { diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index e1f97fa1bb..87bc678693 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -194,12 +194,6 @@ enum class BackgroundErrorReason { kManifestWriteNoWAL, }; -enum class WriteStallCondition { - kNormal, - kDelayed, - kStopped, -}; - struct WriteStallInfo { // the name of the column family std::string cf_name; diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index 6fb53d8466..3f8ce97959 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -63,4 +63,32 @@ enum EntryType { kEntryOther, }; +enum class WriteStallCause { + // Beginning of CF-scope write stall causes + // + // Always keep `kMemtableLimit` as the first stat in this section + kMemtableLimit, + kL0FileCountLimit, + kPendingCompactionBytes, + kCFScopeWriteStallCauseEnumMax, + // End of CF-scope write stall causes + + // Beginning of DB-scope write stall causes + // + // Always keep `kWriteBufferManagerLimit` as the first stat in this section + kWriteBufferManagerLimit, + kDBScopeWriteStallCauseEnumMax, + // End of DB-scope write stall causes + + // Always add new WriteStallCause before `kNone` + kNone, +}; + +enum class WriteStallCondition { + kDelayed, + kStopped, + // Always add new WriteStallCondition before `kNormal` + kNormal, +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/java/src/main/java/org/rocksdb/WriteStallCondition.java b/java/src/main/java/org/rocksdb/WriteStallCondition.java index 3bc9d41043..98d9e2ce4a 100644 --- a/java/src/main/java/org/rocksdb/WriteStallCondition.java +++ b/java/src/main/java/org/rocksdb/WriteStallCondition.java @@ -6,9 +6,9 @@ package org.rocksdb; public enum WriteStallCondition { - NORMAL((byte) 0x0), - DELAYED((byte) 0x1), - STOPPED((byte) 0x2); + DELAYED((byte) 0x0), + STOPPED((byte) 0x1), + NORMAL((byte) 0x2); private final byte value; diff --git a/java/src/test/java/org/rocksdb/EventListenerTest.java b/java/src/test/java/org/rocksdb/EventListenerTest.java index aec0af617e..93ea19c2f9 100644 --- a/java/src/test/java/org/rocksdb/EventListenerTest.java +++ b/java/src/test/java/org/rocksdb/EventListenerTest.java @@ -266,7 +266,7 @@ public class EventListenerTest { final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path", TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); final WriteStallInfo writeStallInfoTestData = - new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + new WriteStallInfo("columnFamilyName", (byte) 0x0, (byte) 0x1); final ExternalFileIngestionInfo externalFileIngestionInfoTestData = new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); diff --git a/src.mk b/src.mk index df1d39475f..e1ab947a06 100644 --- a/src.mk +++ b/src.mk @@ -97,6 +97,7 @@ LIB_SOURCES = \ db/write_batch.cc \ db/write_batch_base.cc \ db/write_controller.cc \ + db/write_stall_stats.cc \ db/write_thread.cc \ env/composite_env.cc \ env/env.cc \