Safer wrapper for std::atomic, use in HCC (#12051)

Summary:
See the comments in the new util/atomic.h file for motivation.

I have updated HyperClockCache to use the new atomic wrapper, fixing a few cases where an implicit conversion was accidentally used, thereby mixing std::memory_order_seq_cst where release/acquire (or relaxed) ordering was intended. There probably wasn't a real bug, because I think all of these cases happened to be in single-threaded contexts such as constructors/destructors, or in statistical ops like `GetCapacity()` that don't need any particular ordering constraints.
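
For illustration, here is a minimal sketch (simplified, not taken verbatim from the diff) of the kind of accidental ordering mix the wrapper prevents:

```
#include <atomic>
#include <cstdint>

// A field intended for release/acquire ordering only.
std::atomic<uint64_t> meta{};

uint64_t AccidentalSeqCstRead() {
  // Compiles fine, but the implicit conversion performs a
  // std::memory_order_seq_cst load, not the intended acquire load.
  uint64_t v = meta;
  return v;
}

// With the wrapper, the ordering is part of the method name, so a plain
// read must be spelled out as either Load() (acquire) or LoadRelaxed():
//   AcqRelAtomic<uint64_t> meta{};
//   uint64_t v = meta.Load();
```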

Recommended follow-up:
* Replace other uses of std::atomic to help keep them safe from bugs; a typical migration is sketched below.
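
For example, migrating a relaxed stat counter would look roughly like this (the class and member names here are hypothetical, but the pattern matches the HCC changes in this PR):

```
#include <cstdint>

#include "util/atomic.h"

// Before:
//   std::atomic<uint64_t> op_count_{};
//   uint64_t GetOpCount() const { return op_count_.load(); }  // seq_cst!
//
// After: the wrapper makes the relaxed intent explicit and keeps it
// consistent at every call site.
class ExampleStats {
 public:
  void Record() { op_count_.FetchAddRelaxed(1); }
  uint64_t GetOpCount() const { return op_count_.LoadRelaxed(); }

 private:
  ROCKSDB_NAMESPACE::RelaxedAtomic<uint64_t> op_count_{};
};
```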

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12051

Test Plan:
Did some local correctness stress testing with cache_bench. Also triggered 15 runs of fbcode_blackbox_crash_test and saw no related failures (just 3 failures in ~CacheWithSecondaryAdapter(), already known).

No performance difference seen when running the before and after builds simultaneously:
```
(while ./cache_bench -cache_type=fixed_hyper_clock_cache -populate_cache=0 -cache_size=3000000000 -ops_per_thread=500000 -threads=12 -histograms=0 2>&1 | grep parallel; do :; done) | awk '{ s += $3; c++; print "Avg time: " (s/c);}'
```

... for both fixed_hcc and auto_hcc.

Reviewed By: jowlyzhang

Differential Revision: D51090518

Pulled By: pdillinger

fbshipit-source-id: eeb324facb3185584603f9ea0c4de6f32919a2d7
author Peter Dillinger, 2023-11-08 13:28:43 -08:00 (committed by Facebook GitHub Bot)
parent e406c26c4e
commit 65cde19f40
3 changed files with 359 additions and 302 deletions

cache/clock_cache.cc: 491 changes (file diff suppressed because it is too large)

cache/clock_cache.h: 59 changes

@@ -24,6 +24,7 @@
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/secondary_cache.h"
+#include "util/atomic.h"
 #include "util/autovector.h"
 #include "util/math.h"
@@ -368,7 +369,7 @@ struct ClockHandle : public ClockHandleBasicData {
 // TODO: make these coundown values tuning parameters for eviction?
 // See above. Mutable for read reference counting.
-mutable std::atomic<uint64_t> meta{};
+mutable AcqRelAtomic<uint64_t> meta{};
 }; // struct ClockHandle
 class BaseClockTable {
@@ -395,19 +396,15 @@ class BaseClockTable {
 void Ref(ClockHandle& handle);
-size_t GetOccupancy() const {
-return occupancy_.load(std::memory_order_relaxed);
-}
+size_t GetOccupancy() const { return occupancy_.LoadRelaxed(); }
-size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
+size_t GetUsage() const { return usage_.LoadRelaxed(); }
-size_t GetStandaloneUsage() const {
-return standalone_usage_.load(std::memory_order_relaxed);
-}
+size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); }
 uint32_t GetHashSeed() const { return hash_seed_; }
-uint64_t GetYieldCount() const { return yield_count_.load(); }
+uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); }
 struct EvictionData {
 size_t freed_charge = 0;
@@ -460,21 +457,23 @@
 // operations in ClockCacheShard.
 // Clock algorithm sweep pointer.
-std::atomic<uint64_t> clock_pointer_{};
+// (Relaxed: only needs to be consistent with itself.)
+RelaxedAtomic<uint64_t> clock_pointer_{};
 // Counter for number of times we yield to wait on another thread.
-std::atomic<uint64_t> yield_count_{};
+// (Relaxed: a simple stat counter.)
+RelaxedAtomic<uint64_t> yield_count_{};
 // TODO: is this separation needed if we don't do background evictions?
 ALIGN_AS(CACHE_LINE_SIZE)
 // Number of elements in the table.
-std::atomic<size_t> occupancy_{};
+AcqRelAtomic<size_t> occupancy_{};
 // Memory usage by entries tracked by the cache (including standalone)
-std::atomic<size_t> usage_{};
+AcqRelAtomic<size_t> usage_{};
 // Part of usage by standalone entries (not in table)
-std::atomic<size_t> standalone_usage_{};
+AcqRelAtomic<size_t> standalone_usage_{};
 ALIGN_AS(CACHE_LINE_SIZE)
 const CacheMetadataChargePolicy metadata_charge_policy_;
@@ -500,7 +499,11 @@ class FixedHyperClockTable : public BaseClockTable {
 struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
 // The number of elements that hash to this slot or a lower one, but wind
 // up in this slot or a higher one.
-std::atomic<uint32_t> displacements{};
+// (Relaxed: within a Cache op, does not need consistency with entries
+// inserted/removed during that op. For example, a Lookup() that
+// happens-after an Insert() will see an appropriate displacements value
+// for the entry to be in a published state.)
+RelaxedAtomic<uint32_t> displacements{};
 // Whether this is a "deteched" handle that is independently allocated
 // with `new` (so must be deleted with `delete`).
@@ -787,17 +790,16 @@ class AutoHyperClockTable : public BaseClockTable {
 // See above. The head pointer is logically independent of the rest of
 // the entry, including the chain next pointer.
-std::atomic<uint64_t> head_next_with_shift{kUnusedMarker};
-std::atomic<uint64_t> chain_next_with_shift{kUnusedMarker};
+AcqRelAtomic<uint64_t> head_next_with_shift{kUnusedMarker};
+AcqRelAtomic<uint64_t> chain_next_with_shift{kUnusedMarker};
 // For supporting CreateStandalone and some fallback cases.
 inline bool IsStandalone() const {
-return head_next_with_shift.load(std::memory_order_acquire) ==
-kStandaloneMarker;
+return head_next_with_shift.Load() == kStandaloneMarker;
 }
 inline void SetStandalone() {
-head_next_with_shift.store(kStandaloneMarker, std::memory_order_release);
+head_next_with_shift.Store(kStandaloneMarker);
 }
 }; // struct HandleImpl
@@ -942,19 +944,22 @@
 // To maximize parallelization of Grow() operations, this field is only
 // updated opportunistically after Grow() operations and in DoInsert() where
 // it is found to be out-of-date. See CatchUpLengthInfoNoWait().
-std::atomic<uint64_t> length_info_;
+AcqRelAtomic<uint64_t> length_info_;
 // An already-computed version of the usable length times the max load
 // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle
 // that internally.
-std::atomic<size_t> occupancy_limit_;
+// (Relaxed: allowed to lag behind length_info_ by a little)
+RelaxedAtomic<size_t> occupancy_limit_;
 // The next index to use from array_ upon the next Grow(). Might be ahead of
 // length_info_.
-std::atomic<size_t> grow_frontier_;
+// (Relaxed: self-contained source of truth for next grow home)
+RelaxedAtomic<size_t> grow_frontier_;
 // See explanation in AutoHyperClockTable::Evict
-std::atomic<size_t> clock_pointer_mask_;
+// (Relaxed: allowed to lag behind clock_pointer_ and length_info_ state)
+RelaxedAtomic<size_t> clock_pointer_mask_;
 }; // class AutoHyperClockTable
 // A single shard of sharded cache.
@@ -1070,10 +1075,12 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 Table table_;
 // Maximum total charge of all elements stored in the table.
-std::atomic<size_t> capacity_;
+// (Relaxed: eventual consistency/update is OK)
+RelaxedAtomic<size_t> capacity_;
 // Whether to reject insertion if cache reaches its full capacity.
-std::atomic<bool> strict_capacity_limit_;
+// (Relaxed: eventual consistency/update is OK)
+RelaxedAtomic<bool> strict_capacity_limit_;
 }; // class ClockCacheShard
 template <class Table>

util/atomic.h: 111 additions (new file)

@@ -0,0 +1,111 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <atomic>
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
// Background:
// std::atomic is somewhat easy to misuse:
// * Implicit conversion to T using std::memory_order_seq_cst, along with
// memory order parameter defaults, make it easy to accidentally mix sequential
// consistency ordering with acquire/release memory ordering. See
// "The single total order might not be consistent with happens-before" at
// https://en.cppreference.com/w/cpp/atomic/memory_order
// * It's easy to use nonsensical (UB) combinations like store with
// std::memory_order_acquire.
// For such reasons, we provide wrappers below to make safe usage easier.
// Wrapper around std::atomic to avoid certain bugs (see Background above).
//
// This relaxed-only wrapper is intended for atomics that do not need
// ordering constraints with other data reads/writes aside from those
// necessary for computing data values or given by other happens-before
// relationships. For example, a cross-thread counter that never returns
// the same result can be a RelaxedAtomic.
template <typename T>
class RelaxedAtomic {
public:
explicit RelaxedAtomic(T initial = {}) : v_(initial) {}
void StoreRelaxed(T desired) { v_.store(desired, std::memory_order_relaxed); }
T LoadRelaxed() const { return v_.load(std::memory_order_relaxed); }
bool CasWeakRelaxed(T& expected, T desired) {
return v_.compare_exchange_weak(expected, desired,
std::memory_order_relaxed);
}
bool CasStrongRelaxed(T& expected, T desired) {
return v_.compare_exchange_strong(expected, desired,
std::memory_order_relaxed);
}
T ExchangeRelaxed(T desired) {
return v_.exchange(desired, std::memory_order_relaxed);
}
T FetchAddRelaxed(T operand) {
return v_.fetch_add(operand, std::memory_order_relaxed);
}
T FetchSubRelaxed(T operand) {
return v_.fetch_sub(operand, std::memory_order_relaxed);
}
T FetchAndRelaxed(T operand) {
return v_.fetch_and(operand, std::memory_order_relaxed);
}
T FetchOrRelaxed(T operand) {
return v_.fetch_or(operand, std::memory_order_relaxed);
}
T FetchXorRelaxed(T operand) {
return v_.fetch_xor(operand, std::memory_order_relaxed);
}
protected:
std::atomic<T> v_;
};
// Wrapper around std::atomic to avoid certain bugs (see Background above).
//
// Except for some unusual cases requiring sequential consistency, this is
// a general-purpose atomic. Relaxed operations can be mixed in as appropriate.
template <typename T>
class AcqRelAtomic : public RelaxedAtomic<T> {
public:
explicit AcqRelAtomic(T initial = {}) : RelaxedAtomic<T>(initial) {}
void Store(T desired) {
RelaxedAtomic<T>::v_.store(desired, std::memory_order_release);
}
T Load() const {
return RelaxedAtomic<T>::v_.load(std::memory_order_acquire);
}
bool CasWeak(T& expected, T desired) {
return RelaxedAtomic<T>::v_.compare_exchange_weak(
expected, desired, std::memory_order_acq_rel);
}
bool CasStrong(T& expected, T desired) {
return RelaxedAtomic<T>::v_.compare_exchange_strong(
expected, desired, std::memory_order_acq_rel);
}
T Exchange(T desired) {
return RelaxedAtomic<T>::v_.exchange(desired, std::memory_order_acq_rel);
}
T FetchAdd(T operand) {
return RelaxedAtomic<T>::v_.fetch_add(operand, std::memory_order_acq_rel);
}
T FetchSub(T operand) {
return RelaxedAtomic<T>::v_.fetch_sub(operand, std::memory_order_acq_rel);
}
T FetchAnd(T operand) {
return RelaxedAtomic<T>::v_.fetch_and(operand, std::memory_order_acq_rel);
}
T FetchOr(T operand) {
return RelaxedAtomic<T>::v_.fetch_or(operand, std::memory_order_acq_rel);
}
T FetchXor(T operand) {
return RelaxedAtomic<T>::v_.fetch_xor(operand, std::memory_order_acq_rel);
}
};
} // namespace ROCKSDB_NAMESPACE
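
To round out the picture, here is a minimal usage sketch of the two wrappers added above, showing the intended division of labor (the counter/field and function names are hypothetical, for illustration only):

```
#include <cstdint>

#include "util/atomic.h"

using ROCKSDB_NAMESPACE::AcqRelAtomic;
using ROCKSDB_NAMESPACE::RelaxedAtomic;

// A plain stat counter: no ordering needed beyond the value itself.
RelaxedAtomic<uint64_t> hit_count{};

// A field that publishes state to other threads: acquire/release by
// default, with relaxed operations available where appropriate.
AcqRelAtomic<uint64_t> published_state{};

void RecordHitAndPublish(uint64_t state) {
  hit_count.FetchAddRelaxed(1);  // relaxed read-modify-write
  published_state.Store(state);  // release store
}

uint64_t ReadPublished() {
  return published_state.Load();  // acquire load
}
```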