From cdb11f5ce6cb1e334e98bb672c5b121581a3af39 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Mon, 7 Aug 2023 12:20:23 -0700 Subject: [PATCH] More minor HCC refactoring + typed mmap (#11670) Summary: More code leading up to dynamic HCC. * Small enhancements to cache_bench * Extra assertion in Unref * Improve a CAS loop in ChargeUsageMaybeEvictStrict * Put load factor constants in appropriate class * Move `standalone` field to HyperClockTable::HandleImpl because it can be encoded differently in the upcoming dynamic HCC. * Add a typed version of MemMapping to simplify some future code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11670 Test Plan: existing tests, unit test added for TypedMemMapping Reviewed By: jowlyzhang Differential Revision: D48056464 Pulled By: pdillinger fbshipit-source-id: 186b7d3105c5d6d2eb6a592369bc10a97ee14a15 --- cache/cache_bench_tool.cc | 5 +++ cache/clock_cache.cc | 35 +++++++++++--------- cache/clock_cache.h | 68 +++++++++++++++++++-------------------- cache/lru_cache_test.cc | 10 +++--- memory/arena_test.cc | 29 ++++++++++------- port/mmap.h | 20 ++++++++++++ 6 files changed, 102 insertions(+), 65 deletions(-) diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index fd1b44d167..6a0ef2a735 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -436,6 +436,10 @@ class CacheBench { printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio()); + size_t occ = cache_->GetOccupancyCount(); + size_t slot = cache_->GetTableAddressCount(); + printf("Final load factor: %g (%zu / %zu)\n", 1.0 * occ / slot, occ, slot); + if (FLAGS_histograms) { printf("\nOperation latency (ns):\n"); HistogramImpl combined; @@ -676,6 +680,7 @@ class CacheBench { #endif printf("----------------------------\n"); printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Cache impl name : %s\n", cache_->Name()); printf("DMutex impl name : %s\n", DMutex::kName()); printf("Number of threads : %u\n", FLAGS_threads); printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 16955004cf..ac9c5f8370 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -79,8 +79,10 @@ inline void Unref(const ClockHandle& h, uint64_t count = 1) { // Pretend we never took the reference // WART: there's a tiny chance we release last ref to invisible // entry here. If that happens, we let eviction take care of it. - h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count, - std::memory_order_release); + uint64_t old_meta = h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count, + std::memory_order_release); + assert(GetRefcount(old_meta) != 0); + (void)old_meta; } inline bool ClockUpdate(ClockHandle& h) { @@ -406,14 +408,14 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict( // Grab any available capacity, and free up any more required. size_t old_usage = usage_.load(std::memory_order_relaxed); size_t new_usage; - if (LIKELY(old_usage != capacity)) { - do { - new_usage = std::min(capacity, old_usage + total_charge); - } while (!usage_.compare_exchange_weak(old_usage, new_usage, - std::memory_order_relaxed)); - } else { - new_usage = old_usage; - } + do { + new_usage = std::min(capacity, old_usage + total_charge); + if (new_usage == old_usage) { + // No change needed + break; + } + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); // How much do we need to evict then? 
size_t need_evict_charge = old_usage + total_charge - new_usage; size_t request_evict_charge = need_evict_charge; @@ -1418,7 +1420,7 @@ void AddShardEvaluation(const HyperClockCache::Shard& shard, // If filled to capacity, what would the occupancy ratio be? double ratio = occ_ratio / usage_ratio; // Given max load factor, what that load factor be? - double lf = ratio * kStrictLoadFactor; + double lf = ratio * HyperClockTable::kStrictLoadFactor; predicted_load_factors.push_back(lf); // Update min_recommendation also @@ -1457,17 +1459,18 @@ void HyperClockCache::ReportProblems( predicted_load_factors.end(), 0.0) / shard_count; - constexpr double kLowSpecLoadFactor = kLoadFactor / 2; - constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414; - if (average_load_factor > kLoadFactor) { + constexpr double kLowSpecLoadFactor = HyperClockTable::kLoadFactor / 2; + constexpr double kMidSpecLoadFactor = HyperClockTable::kLoadFactor / 1.414; + if (average_load_factor > HyperClockTable::kLoadFactor) { // Out of spec => Consider reporting load factor too high // Estimate effective overall capacity loss due to enforcing occupancy limit double lost_portion = 0.0; int over_count = 0; for (double lf : predicted_load_factors) { - if (lf > kStrictLoadFactor) { + if (lf > HyperClockTable::kStrictLoadFactor) { ++over_count; - lost_portion += (lf - kStrictLoadFactor) / lf / shard_count; + lost_portion += + (lf - HyperClockTable::kStrictLoadFactor) / lf / shard_count; } } // >= 20% loss -> error diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 7a1caa023f..7df65ed1bc 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -282,29 +282,6 @@ class ClockCacheTest; // ----------------------------------------------------------------------- // -// The load factor p is a real number in (0, 1) such that at all -// times at most a fraction p of all slots, without counting tombstones, -// are occupied by elements. This means that the probability that a random -// probe hits an occupied slot is at most p, and thus at most 1/p probes -// are required on average. For example, p = 70% implies that between 1 and 2 -// probes are needed on average (bear in mind that this reasoning doesn't -// consider the effects of clustering over time, which should be negligible -// with double hashing). -// Because the size of the hash table is always rounded up to the next -// power of 2, p is really an upper bound on the actual load factor---the -// actual load factor is anywhere between p/2 and p. This is a bit wasteful, -// but bear in mind that slots only hold metadata, not actual values. -// Since space cost is dominated by the values (the LSM blocks), -// overprovisioning the table with metadata only increases the total cache space -// usage by a tiny fraction. -constexpr double kLoadFactor = 0.7; - -// The user can exceed kLoadFactor if the sizes of the inserted values don't -// match estimated_value_size, or in some rare cases with -// strict_capacity_limit == false. To avoid degenerate performance, we set a -// strict upper bound on the load factor. -constexpr double kStrictLoadFactor = 0.84; - struct ClockHandleBasicData { Cache::ObjectPtr value = nullptr; const Cache::CacheItemHelper* helper = nullptr; @@ -374,17 +351,6 @@ struct ClockHandle : public ClockHandleBasicData { // See above. Mutable for read reference counting. mutable std::atomic meta{}; - - // Whether this is a "deteched" handle that is independently allocated - // with `new` (so must be deleted with `delete`). 
-  // TODO: ideally this would be packed into some other data field, such
-  // as upper bits of total_charge, but that incurs a measurable performance
-  // regression.
-  bool standalone = false;
-
-  inline bool IsStandalone() const { return standalone; }
-
-  inline void SetStandalone() { standalone = true; }
 };  // struct ClockHandle

 class BaseClockTable {
@@ -476,6 +442,7 @@ class BaseClockTable {
   // Clock algorithm sweep pointer.
   std::atomic clock_pointer_{};

+  // TODO: is this separation needed if we don't do background evictions?
   ALIGN_AS(CACHE_LINE_SIZE)
   // Number of elements in the table.
   std::atomic occupancy_{};
@@ -508,6 +475,16 @@ class HyperClockTable : public BaseClockTable {
     // up in this slot or a higher one.
     std::atomic displacements{};

+    // Whether this is a "detached" handle that is independently allocated
+    // with `new` (so must be deleted with `delete`).
+    // TODO: ideally this would be packed into some other data field, such
+    // as upper bits of total_charge, but that incurs a measurable performance
+    // regression.
+    bool standalone = false;
+
+    inline bool IsStandalone() const { return standalone; }
+
+    inline void SetStandalone() { standalone = true; }
   };  // struct HandleImpl

   struct Opts {
@@ -561,6 +538,29 @@ class HyperClockTable : public BaseClockTable {
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif

+  // The load factor p is a real number in (0, 1) such that at all
+  // times at most a fraction p of all slots, without counting tombstones,
+  // are occupied by elements. This means that the probability that a random
+  // probe hits an occupied slot is at most p, and thus at most 1/p probes
+  // are required on average. For example, p = 70% implies that between 1 and 2
+  // probes are needed on average (bear in mind that this reasoning doesn't
+  // consider the effects of clustering over time, which should be negligible
+  // with double hashing).
+  // Because the size of the hash table is always rounded up to the next
+  // power of 2, p is really an upper bound on the actual load factor---the
+  // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+  // but bear in mind that slots only hold metadata, not actual values.
+  // Since space cost is dominated by the values (the LSM blocks),
+  // overprovisioning the table with metadata only increases the total cache
+  // space usage by a tiny fraction.
+  static constexpr double kLoadFactor = 0.7;
+
+  // The user can exceed kLoadFactor if the sizes of the inserted values don't
+  // match estimated_value_size, or in some rare cases with
+  // strict_capacity_limit == false. To avoid degenerate performance, we set a
+  // strict upper bound on the load factor.
+  static constexpr double kStrictLoadFactor = 0.84;
+
  private:  // functions
   // Returns x mod 2^{length_bits_}.
  inline size_t ModTableSize(uint64_t x) {
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index cb7beb7b1f..720a1b2c06 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -915,8 +915,10 @@ TEST_F(ClockCacheTest, TableSizesTest) {
                        /*memory_allocator*/ nullptr, kDontChargeCacheMetadata)
                        .MakeSharedCache();
     // Table sizes are currently only powers of two
-    EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor);
-    EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0);
+    EXPECT_GE(cache->GetTableAddressCount(),
+              est_count / HyperClockTable::kLoadFactor);
+    EXPECT_LE(cache->GetTableAddressCount(),
+              est_count / HyperClockTable::kLoadFactor * 2.0);
     EXPECT_EQ(cache->GetUsage(), 0);

     // kFullChargeMetaData
@@ -933,9 +935,9 @@ TEST_F(ClockCacheTest, TableSizesTest) {
       double est_count_after_meta =
          (capacity - cache->GetUsage()) * 1.0 / est_val_size;
       EXPECT_GE(cache->GetTableAddressCount(),
-                est_count_after_meta / kLoadFactor);
+                est_count_after_meta / HyperClockTable::kLoadFactor);
       EXPECT_LE(cache->GetTableAddressCount(),
-                est_count_after_meta / kLoadFactor * 2.0);
+                est_count_after_meta / HyperClockTable::kLoadFactor * 2.0);
     }
   }
 }
diff --git a/memory/arena_test.cc b/memory/arena_test.cc
index 21bf7ed628..592bbd723f 100644
--- a/memory/arena_test.cc
+++ b/memory/arena_test.cc
@@ -219,21 +219,28 @@ size_t PopMinorPageFaultCount() {

 TEST(MmapTest, AllocateLazyZeroed) {
   // Doesn't have to be page aligned
-  constexpr size_t len = 1234567;
-  MemMapping m = MemMapping::AllocateLazyZeroed(len);
-  auto arr = static_cast<char*>(m.Get());
+  constexpr size_t len = 1234567;    // in bytes
+  constexpr size_t count = len / 8;  // in uint64_t objects
+  // Implicit conversion move
+  TypedMemMapping<uint64_t> pre_arr = MemMapping::AllocateLazyZeroed(len);
+  // Move from same type
+  TypedMemMapping<uint64_t> arr = std::move(pre_arr);

-  // Should generally work
-  ASSERT_NE(arr, nullptr);
+  ASSERT_NE(arr.Get(), nullptr);
+  ASSERT_EQ(arr.Get(), &arr[0]);
+  ASSERT_EQ(arr.Get(), arr.MemMapping::Get());
+
+  ASSERT_EQ(arr.Length(), len);
+  ASSERT_EQ(arr.Count(), count);

   // Start counting page faults
   PopMinorPageFaultCount();

   // Access half of the allocation
   size_t i = 0;
-  for (; i < len / 2; ++i) {
+  for (; i < count / 2; ++i) {
     ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
   }

   // Appropriate page faults (maybe more)
@@ -241,9 +248,9 @@ TEST(MmapTest, AllocateLazyZeroed) {
   ASSERT_GE(faults, len / 2 / port::kPageSize);

   // Access rest of the allocation
-  for (; i < len; ++i) {
+  for (; i < count; ++i) {
     ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
   }

   // Appropriate page faults (maybe more)
@@ -251,8 +258,8 @@ TEST(MmapTest, AllocateLazyZeroed) {
   ASSERT_GE(faults, len / 2 / port::kPageSize);

   // Verify data
-  for (i = 0; i < len; ++i) {
-    ASSERT_EQ(arr[i], static_cast<char>(i & 255));
+  for (i = 0; i < count; ++i) {
+    ASSERT_EQ(arr[i], i);
   }
 }

diff --git a/port/mmap.h b/port/mmap.h
index 7342a13f96..0f385522fc 100644
--- a/port/mmap.h
+++ b/port/mmap.h
@@ -14,6 +14,7 @@
 #endif  // OS_WIN

 #include
+#include

 #include "rocksdb/rocksdb_namespace.h"

@@ -67,4 +68,23 @@ class MemMapping {
   static MemMapping AllocateAnonymous(size_t length, bool huge);
 };

+// Simple MemMapping wrapper that presents the memory as an array of T.
+// For example,
+//   TypedMemMapping<uint64_t> arr = MemMapping::AllocateLazyZeroed(num_bytes);
+template <typename T>
+class TypedMemMapping : public MemMapping {
+ public:
+  /*implicit*/ TypedMemMapping(MemMapping&& v) noexcept
+      : MemMapping(std::move(v)) {}
+  TypedMemMapping& operator=(MemMapping&& v) noexcept {
+    MemMapping& base = *this;
+    base = std::move(v);
+    return *this;
+  }
+
+  inline T* Get() const { return static_cast<T*>(MemMapping::Get()); }
+  inline size_t Count() const { return MemMapping::Length() / sizeof(T); }
+
+  inline T& operator[](size_t index) const { return Get()[index]; }
+};
+
 }  // namespace ROCKSDB_NAMESPACE
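
Reviewer note (not part of the patch): a minimal usage sketch of the new TypedMemMapping wrapper, mirroring the element type and lazily-faulted access pattern of the updated MmapTest.AllocateLazyZeroed test above. It assumes compilation inside the RocksDB source tree so that "port/mmap.h" and the ROCKSDB_NAMESPACE macro resolve, and it uses a hypothetical standalone main() rather than the gtest harness.

// Usage sketch only, under the assumptions stated above.
#include <cinttypes>
#include <cstddef>
#include <cstdio>

#include "port/mmap.h"

using ROCKSDB_NAMESPACE::MemMapping;
using ROCKSDB_NAMESPACE::TypedMemMapping;

int main() {
  constexpr size_t kBytes = 1234567;  // need not be page aligned
  // Implicit conversion from the MemMapping&& returned by the factory.
  TypedMemMapping<uint64_t> arr = MemMapping::AllocateLazyZeroed(kBytes);
  if (arr.Get() == nullptr) {
    std::fprintf(stderr, "allocation failed\n");
    return 1;
  }
  // Count() == Length() / sizeof(T), so trailing bytes that do not fill a
  // whole element are simply not addressable through operator[].
  for (size_t i = 0; i < arr.Count(); ++i) {
    arr[i] = i;  // pages are faulted in lazily and arrive zeroed
  }
  std::printf("%zu bytes mapped as %zu uint64_t elements; arr[10]=%" PRIu64 "\n",
              arr.Length(), arr.Count(), arr[10]);
  return 0;
}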