More minor HCC refactoring + typed mmap (#11670)

Summary: More code leading up to dynamic HCC. * Small enhancements to cache_bench * Extra assertion in Unref * Improve a CAS loop in ChargeUsageMaybeEvictStrict * Put load factor constants in appropriate class * Move `standalone` field to HyperClockTable::HandleImpl because it can be encoded differently in the upcoming dynamic HCC. * Add a typed version of MemMapping to simplify some future code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11670 Test Plan: existing tests, unit test added for TypedMemMapping Reviewed By: jowlyzhang Differential Revision: D48056464 Pulled By: pdillinger fbshipit-source-id: 186b7d3105c5d6d2eb6a592369bc10a97ee14a15
2023-08-07 12:20:23 -07:00 · 2023-08-07 12:20:23 -07:00 · cdb11f5ce6
parent 4500a0d6ec
commit cdb11f5ce6
6 changed files with 102 additions and 65 deletions
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@ -436,6 +436,10 @@ class CacheBench {
    printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());
    size_t occ = cache_->GetOccupancyCount();
    size_t slot = cache_->GetTableAddressCount();
    printf("Final load factor: %g (%zu / %zu)\n", 1.0 * occ / slot, occ, slot);
    if (FLAGS_histograms) {
      printf("\nOperation latency (ns):\n");
      HistogramImpl combined;
@ -676,6 +680,7 @@ class CacheBench {
 #endif
    printf("----------------------------\n");
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("Cache impl name     : %s\n", cache_->Name());
    printf("DMutex impl name    : %s\n", DMutex::kName());
    printf("Number of threads   : %u\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@ -79,8 +79,10 @@ inline void Unref(const ClockHandle& h, uint64_t count = 1) {
  // Pretend we never took the reference
  // WART: there's a tiny chance we release last ref to invisible
  // entry here. If that happens, we let eviction take care of it.
-  h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count,
+  uint64_t old_meta = h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count,
-                   std::memory_order_release);
+                                       std::memory_order_release);
  assert(GetRefcount(old_meta) != 0);
  (void)old_meta;
 }
 inline bool ClockUpdate(ClockHandle& h) {
@ -406,14 +408,14 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
  // Grab any available capacity, and free up any more required.
  size_t old_usage = usage_.load(std::memory_order_relaxed);
  size_t new_usage;
-  if (LIKELY(old_usage != capacity)) {
+  do {
-    do {
+    new_usage = std::min(capacity, old_usage + total_charge);
-      new_usage = std::min(capacity, old_usage + total_charge);
+    if (new_usage == old_usage) {
-    } while (!usage_.compare_exchange_weak(old_usage, new_usage,
+      // No change needed
-                                           std::memory_order_relaxed));
+      break;
-  } else {
+    }
-    new_usage = old_usage;
+  } while (!usage_.compare_exchange_weak(old_usage, new_usage,
-  }
+                                         std::memory_order_relaxed));
  // How much do we need to evict then?
  size_t need_evict_charge = old_usage + total_charge - new_usage;
  size_t request_evict_charge = need_evict_charge;
@ -1418,7 +1420,7 @@ void AddShardEvaluation(const HyperClockCache::Shard& shard,
  // If filled to capacity, what would the occupancy ratio be?
  double ratio = occ_ratio / usage_ratio;
  // Given max load factor, what would that load factor be?
-  double lf = ratio * kStrictLoadFactor;
+  double lf = ratio * HyperClockTable::kStrictLoadFactor;
  predicted_load_factors.push_back(lf);
  // Update min_recommendation also
@ -1457,17 +1459,18 @@ void HyperClockCache::ReportProblems(
                      predicted_load_factors.end(), 0.0) /
      shard_count;
-  constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
+  constexpr double kLowSpecLoadFactor = HyperClockTable::kLoadFactor / 2;
-  constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
+  constexpr double kMidSpecLoadFactor = HyperClockTable::kLoadFactor / 1.414;
-  if (average_load_factor > kLoadFactor) {
+  if (average_load_factor > HyperClockTable::kLoadFactor) {
    // Out of spec => Consider reporting load factor too high
    // Estimate effective overall capacity loss due to enforcing occupancy limit
    double lost_portion = 0.0;
    int over_count = 0;
    for (double lf : predicted_load_factors) {
-      if (lf > kStrictLoadFactor) {
+      if (lf > HyperClockTable::kStrictLoadFactor) {
        ++over_count;
-        lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+        lost_portion +=
            (lf - HyperClockTable::kStrictLoadFactor) / lf / shard_count;
      }
    }
    // >= 20% loss -> error
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@ -282,29 +282,6 @@ class ClockCacheTest;
 // ----------------------------------------------------------------------- //
 // The load factor p is a real number in (0, 1) such that at all
 // times at most a fraction p of all slots, without counting tombstones,
 // are occupied by elements. This means that the probability that a random
 // probe hits an occupied slot is at most p, and thus at most 1/p probes
 // are required on average. For example, p = 70% implies that between 1 and 2
 // probes are needed on average (bear in mind that this reasoning doesn't
 // consider the effects of clustering over time, which should be negligible
 // with double hashing).
 // Because the size of the hash table is always rounded up to the next
 // power of 2, p is really an upper bound on the actual load factor---the
 // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
 // but bear in mind that slots only hold metadata, not actual values.
 // Since space cost is dominated by the values (the LSM blocks),
 // overprovisioning the table with metadata only increases the total cache space
 // usage by a tiny fraction.
 constexpr double kLoadFactor = 0.7;
 // The user can exceed kLoadFactor if the sizes of the inserted values don't
 // match estimated_value_size, or in some rare cases with
 // strict_capacity_limit == false. To avoid degenerate performance, we set a
 // strict upper bound on the load factor.
 constexpr double kStrictLoadFactor = 0.84;
 struct ClockHandleBasicData {
  Cache::ObjectPtr value = nullptr;
  const Cache::CacheItemHelper* helper = nullptr;
@ -374,17 +351,6 @@ struct ClockHandle : public ClockHandleBasicData {
  // See above. Mutable for read reference counting.
  mutable std::atomic<uint64_t> meta{};
  // Whether this is a "detached" handle that is independently allocated
  // with `new` (so must be deleted with `delete`).
  // TODO: ideally this would be packed into some other data field, such
  // as upper bits of total_charge, but that incurs a measurable performance
  // regression.
  bool standalone = false;
  inline bool IsStandalone() const { return standalone; }
  inline void SetStandalone() { standalone = true; }
 };  // struct ClockHandle
 class BaseClockTable {
@ -476,6 +442,7 @@ class BaseClockTable {
  // Clock algorithm sweep pointer.
  std::atomic<uint64_t> clock_pointer_{};
  // TODO: is this separation needed if we don't do background evictions?
  ALIGN_AS(CACHE_LINE_SIZE)
  // Number of elements in the table.
  std::atomic<size_t> occupancy_{};
@ -508,6 +475,16 @@ class HyperClockTable : public BaseClockTable {
    // up in this slot or a higher one.
    std::atomic<uint32_t> displacements{};
    // Whether this is a "detached" handle that is independently allocated
    // with `new` (so must be deleted with `delete`).
    // TODO: ideally this would be packed into some other data field, such
    // as upper bits of total_charge, but that incurs a measurable performance
    // regression.
    bool standalone = false;
    inline bool IsStandalone() const { return standalone; }
    inline void SetStandalone() { standalone = true; }
  };  // struct HandleImpl
  struct Opts {
@ -561,6 +538,29 @@ class HyperClockTable : public BaseClockTable {
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
  // The load factor p is a real number in (0, 1) such that at all
  // times at most a fraction p of all slots, without counting tombstones,
  // are occupied by elements. This means that the probability that a random
  // probe hits an occupied slot is at most p, and thus at most 1/p probes
  // are required on average. For example, p = 70% implies that between 1 and 2
  // probes are needed on average (bear in mind that this reasoning doesn't
  // consider the effects of clustering over time, which should be negligible
  // with double hashing).
  // Because the size of the hash table is always rounded up to the next
  // power of 2, p is really an upper bound on the actual load factor---the
  // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
  // but bear in mind that slots only hold metadata, not actual values.
  // Since space cost is dominated by the values (the LSM blocks),
  // overprovisioning the table with metadata only increases the total cache
  // space usage by a tiny fraction.
  static constexpr double kLoadFactor = 0.7;
  // The user can exceed kLoadFactor if the sizes of the inserted values don't
  // match estimated_value_size, or in some rare cases with
  // strict_capacity_limit == false. To avoid degenerate performance, we set a
  // strict upper bound on the load factor.
  static constexpr double kStrictLoadFactor = 0.84;
 private:  // functions
  // Returns x mod 2^{length_bits_}.
  inline size_t ModTableSize(uint64_t x) {
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@ -915,8 +915,10 @@ TEST_F(ClockCacheTest, TableSizesTest) {
                       /*memory_allocator*/ nullptr, kDontChargeCacheMetadata)
                       .MakeSharedCache();
      // Table sizes are currently only powers of two
-      EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor);
+      EXPECT_GE(cache->GetTableAddressCount(),
-      EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0);
+                est_count / HyperClockTable::kLoadFactor);
      EXPECT_LE(cache->GetTableAddressCount(),
                est_count / HyperClockTable::kLoadFactor * 2.0);
      EXPECT_EQ(cache->GetUsage(), 0);
      // kFullChargeMetaData
@ -933,9 +935,9 @@ TEST_F(ClockCacheTest, TableSizesTest) {
        double est_count_after_meta =
            (capacity - cache->GetUsage()) * 1.0 / est_val_size;
        EXPECT_GE(cache->GetTableAddressCount(),
-                  est_count_after_meta / kLoadFactor);
+                  est_count_after_meta / HyperClockTable::kLoadFactor);
        EXPECT_LE(cache->GetTableAddressCount(),
-                  est_count_after_meta / kLoadFactor * 2.0);
+                  est_count_after_meta / HyperClockTable::kLoadFactor * 2.0);
      }
    }
  }
--- a/memory/arena_test.cc
+++ b/memory/arena_test.cc
@ -219,21 +219,28 @@ size_t PopMinorPageFaultCount() {
 TEST(MmapTest, AllocateLazyZeroed) {
  // Doesn't have to be page aligned
-  constexpr size_t len = 1234567;
+  constexpr size_t len = 1234567;    // in bytes
-  MemMapping m = MemMapping::AllocateLazyZeroed(len);
+  constexpr size_t count = len / 8;  // in uint64_t objects
-  auto arr = static_cast<char*>(m.Get());
+  // Implicit conversion move
  TypedMemMapping<uint64_t> pre_arr = MemMapping::AllocateLazyZeroed(len);
  // Move from same type
  TypedMemMapping<uint64_t> arr = std::move(pre_arr);
-  // Should generally work
+  ASSERT_NE(arr.Get(), nullptr);
-  ASSERT_NE(arr, nullptr);
+  ASSERT_EQ(arr.Get(), &arr[0]);
  ASSERT_EQ(arr.Get(), arr.MemMapping::Get());
  ASSERT_EQ(arr.Length(), len);
  ASSERT_EQ(arr.Count(), count);
  // Start counting page faults
  PopMinorPageFaultCount();
  // Access half of the allocation
  size_t i = 0;
-  for (; i < len / 2; ++i) {
+  for (; i < count / 2; ++i) {
    ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
  }
  // Appropriate page faults (maybe more)
@ -241,9 +248,9 @@ TEST(MmapTest, AllocateLazyZeroed) {
  ASSERT_GE(faults, len / 2 / port::kPageSize);
  // Access rest of the allocation
-  for (; i < len; ++i) {
+  for (; i < count; ++i) {
    ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
  }
  // Appropriate page faults (maybe more)
@ -251,8 +258,8 @@ TEST(MmapTest, AllocateLazyZeroed) {
  ASSERT_GE(faults, len / 2 / port::kPageSize);
  // Verify data
-  for (i = 0; i < len; ++i) {
+  for (i = 0; i < count; ++i) {
-    ASSERT_EQ(arr[i], static_cast<char>(i & 255));
+    ASSERT_EQ(arr[i], i);
  }
 }
--- a/port/mmap.h
+++ b/port/mmap.h
@ -14,6 +14,7 @@
 #endif  // OS_WIN
 #include <cstdint>
 #include <utility>
 #include "rocksdb/rocksdb_namespace.h"
@ -67,4 +68,23 @@ class MemMapping {
  static MemMapping AllocateAnonymous(size_t length, bool huge);
 };
 // Simple MemMapping wrapper that presents the memory as an array of T.
 // For example,
 //  TypedMemMapping<uint64_t> arr = MemMapping::AllocateLazyZeroed(num_bytes);
 // The wrapper declares no data members of its own; it only layers typed
 // accessors over the MemMapping base class.
 template <typename T>
 class TypedMemMapping : public MemMapping {
 public:
  // Move-constructs from an untyped mapping. Intentionally implicit so the
  // result of a MemMapping factory function can directly initialize a typed
  // mapping, as in the example above.
  /*implicit*/ TypedMemMapping(MemMapping&& v) noexcept
      : MemMapping(std::move(v)) {}
  // Move-assigns from an untyped mapping by delegating to the base class
  // move assignment. Returns *this so assignments can be chained.
  TypedMemMapping& operator=(MemMapping&& v) noexcept {
    MemMapping& base = *this;
    base = std::move(v);
    // Required: flowing off the end of a non-void function is undefined
    // behavior.
    return *this;
  }

  // Start of the mapped memory viewed as an array of T.
  // NOTE(review): presumably nullptr when the underlying mapping is empty or
  // failed -- confirm against MemMapping::Get().
  inline T* Get() const { return static_cast<T*>(MemMapping::Get()); }
  // Number of whole T objects that fit in the mapping; any trailing bytes
  // beyond a multiple of sizeof(T) are not counted (integer division).
  inline size_t Count() const { return MemMapping::Length() / sizeof(T); }

  // Unchecked element access; `index` is not validated against Count().
  inline T& operator[](size_t index) const { return Get()[index]; }
 };
 }  // namespace ROCKSDB_NAMESPACE