mirror of https://github.com/facebook/rocksdb.git
Pin CacheEntryStatsCollector to fix performance bug (#8385)
Summary: If the block Cache is full with strict_capacity_limit=false, then our CacheEntryStatsCollector could be immediately evicted on release, so iterating through column families with shared block cache could trigger re-scan for each CF. This change fixes that problem by pinning the CacheEntryStatsCollector from InternalStats so that it's not evicted. I had originally thought that this object could participate in LRU like everything else, but even though a re-load+re-scan only touches memory, it can be orders of magnitude more expensive than other cache misses. One service in Facebook has scans that take ~20s over 100GB block cache that is mostly 4KB entries. (The up-side of this bug and https://github.com/facebook/rocksdb/issues/8369 is that we had a natural experiment on the effect on some service metrics even with block cache scans running continuously in the background--a kind of worst case scenario. Metrics like latency were not affected enough to trigger warnings.) Other smaller fixes: 20s is already a sizable portion of 600s stats dump period, or 180s default max age to force re-scan, so added logic to ensure that (for each block cache) we don't spend more than 0.2% of our background thread time scanning it. Nevertheless, "foreground" requests for cache entry stats (calls to `db->GetMapProperty(DB::Properties::kBlockCacheEntryStats)`) are permitted to consume more CPU. Renamed field to cache_entry_stats_ to match code style. This change is intended for patching in 6.21 release. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8385 Test Plan: unit test expanded to cover new logic (detect regression), some manual testing with db_bench Reviewed By: ajkr Differential Revision: D29042759 Pulled By: pdillinger fbshipit-source-id: 236faa902397f50038c618f50fbc8cf3f277308c
This commit is contained in:
parent
d60ae5b1c7
commit
d5a46c40e5
|
@ -52,15 +52,29 @@ template <class Stats>
|
|||
class CacheEntryStatsCollector {
|
||||
public:
|
||||
// Gathers stats and saves results into `stats`
|
||||
void GetStats(Stats *stats, int maximum_age_in_seconds = 180) {
|
||||
//
|
||||
// Maximum allowed age for a "hit" on saved results is determined by the
|
||||
// two interval parameters. Both set to 0 forces a re-scan. For example
|
||||
// with min_interval_seconds=300 and min_interval_factor=100, if the last
|
||||
// scan took 10s, we would only rescan ("miss") if the age in seconds of
|
||||
// the saved results is > max(300, 100*10).
|
||||
// Justification: scans can vary wildly in duration, e.g. from 0.02 sec
|
||||
// to as much as 20 seconds, so we want to be able to cap the absolute
|
||||
// and relative frequency of scans.
|
||||
void GetStats(Stats *stats, int min_interval_seconds,
|
||||
int min_interval_factor) {
|
||||
// Waits for any pending reader or writer (collector)
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
// Maximum allowed age is nominally given by the parameter, but
|
||||
// to limit the possibility of accidental repeated scans, impose
|
||||
// a minimum TTL of 1 second.
|
||||
uint64_t max_age_micros =
|
||||
static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;
|
||||
static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
|
||||
|
||||
if (last_end_time_micros_ > last_start_time_micros_ &&
|
||||
min_interval_factor > 0) {
|
||||
max_age_micros = std::max(
|
||||
max_age_micros, min_interval_factor * (last_end_time_micros_ -
|
||||
last_start_time_micros_));
|
||||
}
|
||||
|
||||
uint64_t start_time_micros = clock_->NowMicros();
|
||||
if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
|
||||
|
@ -68,6 +82,8 @@ class CacheEntryStatsCollector {
|
|||
saved_stats_.BeginCollection(cache_, clock_, start_time_micros);
|
||||
|
||||
cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {});
|
||||
TEST_SYNC_POINT_CALLBACK(
|
||||
"CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
|
||||
|
||||
uint64_t end_time_micros = clock_->NowMicros();
|
||||
last_end_time_micros_ = end_time_micros;
|
||||
|
@ -109,7 +125,8 @@ class CacheEntryStatsCollector {
|
|||
// usage to go flaky. Fix the problem somehow so we can use an
|
||||
// accurate charge.
|
||||
size_t charge = 0;
|
||||
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h);
|
||||
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
|
||||
Cache::Priority::HIGH);
|
||||
if (!s.ok()) {
|
||||
assert(h == nullptr);
|
||||
return s;
|
||||
|
|
|
@ -152,12 +152,13 @@ class DBBlockCacheTest : public DBTestBase {
|
|||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCounts() {
|
||||
const std::array<size_t, kNumCacheEntryRoles>& GetCacheEntryRoleCountsBg() {
|
||||
// Verify in cache entry role stats
|
||||
ColumnFamilyHandleImpl* cfh =
|
||||
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
|
||||
InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats();
|
||||
return internal_stats_ptr->TEST_GetCacheEntryRoleStats().entry_counts;
|
||||
return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false)
|
||||
.entry_counts;
|
||||
}
|
||||
#endif // ROCKSDB_LITE
|
||||
};
|
||||
|
@ -935,6 +936,8 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
||||
options.max_open_files = 13;
|
||||
options.table_cache_numshardbits = 0;
|
||||
// If this wakes up, it could interfere with test
|
||||
options.stats_dump_period_sec = 0;
|
||||
|
||||
BlockBasedTableOptions table_options;
|
||||
table_options.block_cache = cache;
|
||||
|
@ -970,7 +973,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
std::array<size_t, kNumCacheEntryRoles> expected{};
|
||||
// For CacheEntryStatsCollector
|
||||
expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
|
||||
|
||||
std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
|
||||
|
||||
|
@ -981,13 +984,13 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
|
||||
}
|
||||
// Within some time window, we will get cached entry stats
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
|
||||
// Not enough to force a miss
|
||||
env_->MockSleepForSeconds(10);
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
|
||||
env_->MockSleepForSeconds(45);
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
|
||||
// Enough to force a miss
|
||||
env_->MockSleepForSeconds(1000);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
|
||||
env_->MockSleepForSeconds(601);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
|
||||
|
||||
// Now access index and data block
|
||||
ASSERT_EQ("value", Get("foo"));
|
||||
|
@ -997,8 +1000,22 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
|
||||
}
|
||||
expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
|
||||
env_->MockSleepForSeconds(1000);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
|
||||
// Enough to force a miss
|
||||
env_->MockSleepForSeconds(601);
|
||||
// But inject a simulated long scan so that we need a longer
|
||||
// interval to force a miss next time.
|
||||
SyncPoint::GetInstance()->SetCallBack(
|
||||
"CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
|
||||
[this](void*) {
|
||||
// To spend no more than 0.2% of time scanning, we would need
|
||||
// interval of at least 10000s
|
||||
env_->MockSleepForSeconds(20);
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
|
||||
prev_expected = expected;
|
||||
SyncPoint::GetInstance()->DisableProcessing();
|
||||
SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
|
||||
// The same for other file
|
||||
ASSERT_EQ("value", Get("zfoo"));
|
||||
|
@ -1008,8 +1025,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
|
||||
}
|
||||
expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
|
||||
env_->MockSleepForSeconds(1000);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCounts());
|
||||
// Because of the simulated long scan, this is not enough to force
|
||||
// a miss
|
||||
env_->MockSleepForSeconds(601);
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
|
||||
// But this is enough
|
||||
env_->MockSleepForSeconds(10000);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
|
||||
prev_expected = expected;
|
||||
|
||||
// Also check the GetProperty interface
|
||||
std::map<std::string, std::string> values;
|
||||
|
@ -1025,8 +1048,54 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
|
|||
EXPECT_EQ(
|
||||
ToString(expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)]),
|
||||
values["count.filter-block"]);
|
||||
EXPECT_EQ(
|
||||
ToString(
|
||||
prev_expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
|
||||
values["count.write-buffer"]);
|
||||
EXPECT_EQ(ToString(expected[static_cast<size_t>(CacheEntryRole::kMisc)]),
|
||||
values["count.misc"]);
|
||||
|
||||
// Add one for kWriteBuffer
|
||||
{
|
||||
WriteBufferManager wbm(size_t{1} << 20, cache);
|
||||
wbm.ReserveMem(1024);
|
||||
expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
|
||||
// Now we check that the GetProperty interface is more agressive about
|
||||
// re-scanning stats, but not totally aggressive.
|
||||
// Within some time window, we will get cached entry stats
|
||||
env_->MockSleepForSeconds(1);
|
||||
EXPECT_EQ(ToString(prev_expected[static_cast<size_t>(
|
||||
CacheEntryRole::kWriteBuffer)]),
|
||||
values["count.write-buffer"]);
|
||||
// Not enough for a "background" miss but enough for a "foreground" miss
|
||||
env_->MockSleepForSeconds(45);
|
||||
|
||||
ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
|
||||
&values));
|
||||
EXPECT_EQ(
|
||||
ToString(
|
||||
expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
|
||||
values["count.write-buffer"]);
|
||||
}
|
||||
prev_expected = expected;
|
||||
|
||||
// With collector pinned in cache, we should be able to hit
|
||||
// even if the cache is full
|
||||
ClearCache(cache.get());
|
||||
Cache::Handle* h = nullptr;
|
||||
ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
|
||||
GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
|
||||
&h, Cache::Priority::HIGH));
|
||||
ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
|
||||
expected = {};
|
||||
expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
|
||||
// Still able to hit on saved stats
|
||||
EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
|
||||
// Enough to force a miss
|
||||
env_->MockSleepForSeconds(1000);
|
||||
EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
|
||||
|
||||
cache->Release(h);
|
||||
}
|
||||
EXPECT_GE(iterations_tested, 1);
|
||||
}
|
||||
|
|
|
@ -528,27 +528,38 @@ InternalStats::InternalStats(int num_levels, SystemClock* clock,
|
|||
cfd_(cfd),
|
||||
started_at_(clock->NowMicros()) {}
|
||||
|
||||
Status InternalStats::CollectCacheEntryStats() {
|
||||
using Collector = CacheEntryStatsCollector<CacheEntryRoleStats>;
|
||||
Status InternalStats::CollectCacheEntryStats(bool foreground) {
|
||||
// Lazy initialize/reference the collector. It is pinned in cache (through
|
||||
// a shared_ptr) so that it does not get immediately ejected from a full
|
||||
// cache, which would force a re-scan on the next GetStats.
|
||||
if (!cache_entry_stats_collector_) {
|
||||
Cache* block_cache;
|
||||
bool ok = HandleBlockCacheStat(&block_cache);
|
||||
if (ok) {
|
||||
// Extract or create stats collector.
|
||||
std::shared_ptr<Collector> collector;
|
||||
Status s = Collector::GetShared(block_cache, clock_, &collector);
|
||||
if (s.ok()) {
|
||||
// TODO: use a max age like stats_dump_period_sec / 2, but it's
|
||||
// difficult to access that setting from here with just cfd_
|
||||
collector->GetStats(&cache_entry_stats);
|
||||
} else {
|
||||
Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
|
||||
block_cache, clock_, &cache_entry_stats_collector_);
|
||||
if (!s.ok()) {
|
||||
// Block cache likely under pressure. Scanning could make it worse,
|
||||
// so skip.
|
||||
}
|
||||
return s;
|
||||
}
|
||||
} else {
|
||||
return Status::NotFound("block cache not configured");
|
||||
}
|
||||
}
|
||||
assert(cache_entry_stats_collector_);
|
||||
|
||||
// For "background" collections, strictly cap the collection time by
|
||||
// expanding effective cache TTL. For foreground, be more aggressive about
|
||||
// getting latest data.
|
||||
int min_interval_seconds = foreground ? 10 : 180;
|
||||
// 1/500 = max of 0.2% of one CPU thread
|
||||
int min_interval_factor = foreground ? 10 : 500;
|
||||
cache_entry_stats_collector_->GetStats(
|
||||
&cache_entry_stats_, min_interval_seconds, min_interval_factor);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
|
||||
InternalStats::CacheEntryRoleStats::GetEntryCallback() {
|
||||
|
@ -638,21 +649,21 @@ void InternalStats::CacheEntryRoleStats::ToMap(
|
|||
|
||||
bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
|
||||
Slice /*suffix*/) {
|
||||
Status s = CollectCacheEntryStats();
|
||||
Status s = CollectCacheEntryStats(/*foreground*/ true);
|
||||
if (!s.ok()) {
|
||||
return false;
|
||||
}
|
||||
*value = cache_entry_stats.ToString(clock_);
|
||||
*value = cache_entry_stats_.ToString(clock_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InternalStats::HandleBlockCacheEntryStatsMap(
|
||||
std::map<std::string, std::string>* values, Slice /*suffix*/) {
|
||||
Status s = CollectCacheEntryStats();
|
||||
Status s = CollectCacheEntryStats(/*foreground*/ true);
|
||||
if (!s.ok()) {
|
||||
return false;
|
||||
}
|
||||
cache_entry_stats.ToMap(values, clock_);
|
||||
cache_entry_stats_.ToMap(values, clock_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1608,9 +1619,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
|
|||
cf_stats_snapshot_.comp_stats = compaction_stats_sum;
|
||||
cf_stats_snapshot_.stall_count = total_stall_count;
|
||||
|
||||
Status s = CollectCacheEntryStats();
|
||||
// Always treat CFStats context as "background"
|
||||
Status s = CollectCacheEntryStats(/*foreground=*/false);
|
||||
if (s.ok()) {
|
||||
value->append(cache_entry_stats.ToString(clock_));
|
||||
value->append(cache_entry_stats_.ToString(clock_));
|
||||
} else {
|
||||
value->append("Block cache: ");
|
||||
value->append(s.ToString());
|
||||
|
|
|
@ -23,6 +23,8 @@ class ColumnFamilyData;
|
|||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
template <class Stats>
|
||||
class CacheEntryStatsCollector;
|
||||
class DBImpl;
|
||||
class MemTableList;
|
||||
|
||||
|
@ -390,7 +392,7 @@ class InternalStats {
|
|||
cf_stats_count_[i] = 0;
|
||||
cf_stats_value_[i] = 0;
|
||||
}
|
||||
cache_entry_stats.Clear();
|
||||
cache_entry_stats_.Clear();
|
||||
for (auto& comp_stat : comp_stats_) {
|
||||
comp_stat.Clear();
|
||||
}
|
||||
|
@ -463,13 +465,13 @@ class InternalStats {
|
|||
return comp_stats_;
|
||||
}
|
||||
|
||||
const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats() {
|
||||
Status s = CollectCacheEntryStats();
|
||||
const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) {
|
||||
Status s = CollectCacheEntryStats(foreground);
|
||||
if (!s.ok()) {
|
||||
assert(false);
|
||||
cache_entry_stats.Clear();
|
||||
cache_entry_stats_.Clear();
|
||||
}
|
||||
return cache_entry_stats;
|
||||
return cache_entry_stats_;
|
||||
}
|
||||
|
||||
// Store a mapping from the user-facing DB::Properties string to our
|
||||
|
@ -492,14 +494,16 @@ class InternalStats {
|
|||
|
||||
bool HandleBlockCacheStat(Cache** block_cache);
|
||||
|
||||
Status CollectCacheEntryStats();
|
||||
Status CollectCacheEntryStats(bool foreground);
|
||||
|
||||
// Per-DB stats
|
||||
std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
|
||||
// Per-ColumnFamily stats
|
||||
uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
|
||||
uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
|
||||
CacheEntryRoleStats cache_entry_stats;
|
||||
CacheEntryRoleStats cache_entry_stats_;
|
||||
std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
|
||||
cache_entry_stats_collector_;
|
||||
// Per-ColumnFamily/level compaction stats
|
||||
std::vector<CompactionStats> comp_stats_;
|
||||
std::vector<CompactionStats> comp_stats_by_pri_;
|
||||
|
|
Loading…
Reference in New Issue