// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "cache/clock_cache.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cache/cache_key.h" #include "cache/secondary_cache_adapter.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics_impl.h" #include "port/lang.h" #include "rocksdb/env.h" #include "util/hash.h" #include "util/math.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { namespace clock_cache { namespace { inline uint64_t GetRefcount(uint64_t meta) { return ((meta >> ClockHandle::kAcquireCounterShift) - (meta >> ClockHandle::kReleaseCounterShift)) & ClockHandle::kCounterMask; } inline uint64_t GetInitialCountdown(Cache::Priority priority) { // Set initial clock data from priority // TODO: configuration parameters for priority handling and clock cycle // count? switch (priority) { case Cache::Priority::HIGH: return ClockHandle::kHighCountdown; case Cache::Priority::LOW: return ClockHandle::kLowCountdown; case Cache::Priority::BOTTOM: return ClockHandle::kBottomCountdown; } // Switch should have been exhaustive. assert(false); // For release build, fall back on something reasonable. return ClockHandle::kLowCountdown; } inline void MarkEmpty(ClockHandle& h) { #ifndef NDEBUG // Mark slot as empty, with assertion uint64_t meta = h.meta.Exchange(0); assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); #else // Mark slot as empty h.meta.Store(0); #endif } inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) { // NOTE: in theory there's more room for parallelism if we copy the handle // data and delay actions like this until after marking the entry as empty, // but performance tests only show a regression by copying the few words // of data. h.FreeData(allocator); MarkEmpty(h); } // Called to undo the effect of referencing an entry for internal purposes, // so it should not be marked as having been used. inline void Unref(const ClockHandle& h, uint64_t count = 1) { // Pretend we never took the reference // WART: there's a tiny chance we release last ref to invisible // entry here. If that happens, we let eviction take care of it. uint64_t old_meta = h.meta.FetchSub(ClockHandle::kAcquireIncrement * count); assert(GetRefcount(old_meta) != 0); (void)old_meta; } inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data, bool* purgeable = nullptr) { uint64_t meta; if (purgeable) { assert(*purgeable == false); // In AutoHCC, our eviction process follows the chain structure, so we // should ensure that we see the latest state of each entry, at least for // assertion checking. meta = h.meta.Load(); } else { // In FixedHCC, our eviction process is a simple iteration without regard // to probing order, displacements, etc., so it doesn't matter if we see // somewhat stale data. 
meta = h.meta.LoadRelaxed(); } if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) == 0) { // Only clock update Shareable entries if (purgeable) { *purgeable = true; // AutoHCC only: make sure we only attempt to update non-empty slots assert((meta >> ClockHandle::kStateShift) & ClockHandle::kStateOccupiedBit); } return false; } uint64_t acquire_count = (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; uint64_t release_count = (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; if (acquire_count != release_count) { // Only clock update entries with no outstanding refs data->seen_pinned_count++; return false; } if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && acquire_count > 0) { // Decrement clock uint64_t new_count = std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1); // Compare-exchange in the decremented clock info, but // not aggressively uint64_t new_meta = (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | (meta & ClockHandle::kHitBitMask) | (new_count << ClockHandle::kReleaseCounterShift) | (new_count << ClockHandle::kAcquireCounterShift); h.meta.CasStrongRelaxed(meta, new_meta); return false; } // Otherwise, remove entry (either unreferenced invisible or // unreferenced and expired visible). if (h.meta.CasStrong(meta, (uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift) | (meta & ClockHandle::kHitBitMask))) { // Took ownership. data->freed_charge += h.GetTotalCharge(); data->freed_count += 1; return true; } else { // Compare-exchange failing probably // indicates the entry was used, so skip it in that case. return false; } } // If an entry doesn't receive clock updates but is repeatedly referenced & // released, the acquire and release counters could overflow without some // intervention. This is that intervention, which should be inexpensive // because it only incurs a simple, very predictable check. (Applying a bit // mask in addition to an increment to every Release likely would be // relatively expensive, because it's an extra atomic update.) // // We do have to assume that we never have many millions of simultaneous // references to a cache handle, because we cannot represent so many // references with the difference in counters, masked to the number of // counter bits. Similarly, we assume there aren't millions of threads // holding transient references (which might be "undone" rather than // released by the way). // // Consider these possible states for each counter: // low: less than kMaxCountdown // medium: kMaxCountdown to half way to overflow + kMaxCountdown // high: half way to overflow + kMaxCountdown, or greater // // And these possible states for the combination of counters: // acquire / release // ------- ------- // low low - Normal / common, with caveats (see below) // medium low - Can happen while holding some refs // high low - Violates assumptions (too many refs) // low medium - Violates assumptions (refs underflow, etc.) // medium medium - Normal (very read heavy cache) // high medium - Can happen while holding some refs // low high - This function is supposed to prevent // medium high - Violates assumptions (refs underflow, etc.) 
// high high - Needs CorrectNearOverflow // // Basically, this function detects (high, high) state (inferred from // release alone being high) and bumps it back down to (medium, medium) // state with the same refcount and the same logical countdown counter // (everything > kMaxCountdown is logically the same). Note that bumping // down to (low, low) would modify the countdown counter, so is "reserved" // in a sense. // // If near-overflow correction is triggered here, there's no guarantee // that another thread hasn't freed the entry and replaced it with another. // Therefore, it must be the case that the correction does not affect // entries unless they are very old (many millions of acquire-release cycles). // (Our bit manipulation is indeed idempotent and only affects entries in // exceptional cases.) We assume a pre-empted thread will not stall that long. // If it did, the state could be corrupted in the (unlikely) case that the top // bit of the acquire counter is set but not the release counter, and thus // we only clear the top bit of the acquire counter on resumption. It would // then appear that there are too many refs and the entry would be permanently // pinned (which is not terrible for an exceptionally rare occurrence), unless // it is referenced enough (at least kMaxCountdown more times) for the release // counter to reach "high" state again and bumped back to "medium." (This // motivates only checking for release counter in high state, not both in high // state.) inline void CorrectNearOverflow(uint64_t old_meta, AcqRelAtomic& meta) { // We clear both top-most counter bits at the same time. constexpr uint64_t kCounterTopBit = uint64_t{1} << (ClockHandle::kCounterNumBits - 1); constexpr uint64_t kClearBits = (kCounterTopBit << ClockHandle::kAcquireCounterShift) | (kCounterTopBit << ClockHandle::kReleaseCounterShift); // A simple check that allows us to initiate clearing the top bits for // a large portion of the "high" state space on release counter. constexpr uint64_t kCheckBits = (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) << ClockHandle::kReleaseCounterShift; if (UNLIKELY(old_meta & kCheckBits)) { meta.FetchAndRelaxed(~kClearBits); } } inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, uint64_t initial_countdown, bool* already_matches) { assert(*already_matches == false); // Optimistically transition the slot from "empty" to // "under construction" (no effect on other states) uint64_t old_meta = h.meta.FetchOr(uint64_t{ClockHandle::kStateOccupiedBit} << ClockHandle::kStateShift); uint64_t old_state = old_meta >> ClockHandle::kStateShift; if (old_state == ClockHandle::kStateEmpty) { // We've started inserting into an available slot, and taken // ownership. return true; } else if (old_state != ClockHandle::kStateVisible) { // Slot not usable / touchable now return false; } // Existing, visible entry, which might be a match. // But first, we need to acquire a ref to read it. In fact, number of // refs for initial countdown, so that we boost the clock state if // this is a match. old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement * initial_countdown); // Like Lookup if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference if (h.hashed_key == proto.hashed_key) { // Match. 
Release in a way that boosts the clock state old_meta = h.meta.FetchAdd(ClockHandle::kReleaseIncrement * initial_countdown); // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h.meta); // Insert detached instead (only if return handle needed) *already_matches = true; return false; } else { // Mismatch. Unref(h, initial_countdown); } } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateInvisible)) { // Pretend we never took the reference Unref(h, initial_countdown); } else { // For other states, incrementing the acquire counter has no effect // so we don't need to undo it. // Slot not usable / touchable now. } return false; } inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, uint64_t initial_countdown, bool keep_ref) { // Save data fields ClockHandleBasicData* h_alias = &h; *h_alias = proto; // Transition from "under construction" state to "visible" state uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift; // Maybe with an outstanding reference new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; new_meta |= (initial_countdown - keep_ref) << ClockHandle::kReleaseCounterShift; #ifndef NDEBUG // Save the state transition, with assertion uint64_t old_meta = h.meta.Exchange(new_meta); assert(old_meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); #else // Save the state transition h.meta.Store(new_meta); #endif } bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h, uint64_t initial_countdown, bool keep_ref, bool* already_matches) { bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches); if (b) { FinishSlotInsert(proto, h, initial_countdown, keep_ref); } return b; } // Func must be const HandleImpl& -> void callable template void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin, const HandleImpl* end, bool apply_if_will_be_deleted) { uint64_t check_state_mask = ClockHandle::kStateShareableBit; if (!apply_if_will_be_deleted) { check_state_mask |= ClockHandle::kStateVisibleBit; } for (const HandleImpl* h = begin; h < end; ++h) { // Note: to avoid using compare_exchange, we have to be extra careful. uint64_t old_meta = h->meta.LoadRelaxed(); // Check if it's an entry visible to lookups if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { // Increment acquire counter. Note: it's possible that the entry has // completely changed since we loaded old_meta, but incrementing acquire // count is always safe. (Similar to optimistic Lookup here.) old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check whether we actually acquired a reference. if ((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) { // Apply func if appropriate if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { func(*h); } // Pretend we never took the reference Unref(*h); // No net change, so don't need to check for overflow } else { // For other states, incrementing the acquire counter has no effect // so we don't need to undo it. Furthermore, we cannot safely undo // it because we did not acquire a read reference to lock the // entry in a Shareable state. 
} } } } constexpr uint32_t kStrictCapacityLimitBit = 1u << 31; uint32_t SanitizeEncodeEecAndScl(int eviction_effort_cap, bool strict_capacit_limit) { eviction_effort_cap = std::max(int{1}, eviction_effort_cap); eviction_effort_cap = std::min(static_cast(~kStrictCapacityLimitBit), eviction_effort_cap); uint32_t eec_and_scl = static_cast(eviction_effort_cap); eec_and_scl |= strict_capacit_limit ? kStrictCapacityLimitBit : 0; return eec_and_scl; } } // namespace void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { if (helper->del_cb) { helper->del_cb(value, allocator); } } template HandleImpl* BaseClockTable::StandaloneInsert( const ClockHandleBasicData& proto) { // Heap allocated separate from table HandleImpl* h = new HandleImpl(); ClockHandleBasicData* h_alias = h; *h_alias = proto; h->SetStandalone(); // Single reference (standalone entries only created if returning a refed // Handle back to user) uint64_t meta = uint64_t{ClockHandle::kStateInvisible} << ClockHandle::kStateShift; meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; h->meta.Store(meta); // Keep track of how much of usage is standalone standalone_usage_.FetchAddRelaxed(proto.GetTotalCharge()); return h; } template typename Table::HandleImpl* BaseClockTable::CreateStandalone( ClockHandleBasicData& proto, size_t capacity, uint32_t eec_and_scl, bool allow_uncharged) { Table& derived = static_cast(*this); typename Table::InsertState state; derived.StartInsert(state); const size_t total_charge = proto.GetTotalCharge(); // NOTE: we can use eec_and_scl as eviction_effort_cap below because // strict_capacity_limit=true is supposed to disable the limit on eviction // effort, and a large value effectively does that. if (eec_and_scl & kStrictCapacityLimitBit) { Status s = ChargeUsageMaybeEvictStrict( total_charge, capacity, /*need_evict_for_occupancy=*/false, eec_and_scl, state); if (!s.ok()) { if (allow_uncharged) { proto.total_charge = 0; } else { return nullptr; } } } else { // Case strict_capacity_limit == false bool success = ChargeUsageMaybeEvictNonStrict
( total_charge, capacity, /*need_evict_for_occupancy=*/false, eec_and_scl, state); if (!success) { // Force the issue usage_.FetchAddRelaxed(total_charge); } } return StandaloneInsert(proto); } template Status BaseClockTable::ChargeUsageMaybeEvictStrict( size_t total_charge, size_t capacity, bool need_evict_for_occupancy, uint32_t eviction_effort_cap, typename Table::InsertState& state) { if (total_charge > capacity) { return Status::MemoryLimit( "Cache entry too large for a single cache shard: " + std::to_string(total_charge) + " > " + std::to_string(capacity)); } // Grab any available capacity, and free up any more required. size_t old_usage = usage_.LoadRelaxed(); size_t new_usage; do { new_usage = std::min(capacity, old_usage + total_charge); if (new_usage == old_usage) { // No change needed break; } } while (!usage_.CasWeakRelaxed(old_usage, new_usage)); // How much do we need to evict then? size_t need_evict_charge = old_usage + total_charge - new_usage; size_t request_evict_charge = need_evict_charge; if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { // Require at least 1 eviction. request_evict_charge = 1; } if (request_evict_charge > 0) { EvictionData data; static_cast(this)->Evict(request_evict_charge, state, &data, eviction_effort_cap); occupancy_.FetchSub(data.freed_count); if (LIKELY(data.freed_charge > need_evict_charge)) { assert(data.freed_count > 0); // Evicted more than enough usage_.FetchSubRelaxed(data.freed_charge - need_evict_charge); } else if (data.freed_charge < need_evict_charge || (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0)) { // Roll back to old usage minus evicted usage_.FetchSubRelaxed(data.freed_charge + (new_usage - old_usage)); if (data.freed_charge < need_evict_charge) { return Status::MemoryLimit( "Insert failed because unable to evict entries to stay within " "capacity limit."); } else { return Status::MemoryLimit( "Insert failed because unable to evict entries to stay within " "table occupancy limit."); } } // If we needed to evict something and we are proceeding, we must have // evicted something. assert(data.freed_count > 0); } return Status::OK(); } template inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( size_t total_charge, size_t capacity, bool need_evict_for_occupancy, uint32_t eviction_effort_cap, typename Table::InsertState& state) { // For simplicity, we consider that either the cache can accept the insert // with no evictions, or we must evict enough to make (at least) enough // space. It could lead to unnecessary failures or excessive evictions in // some extreme cases, but allows a fast, simple protocol. If we allow a // race to get us over capacity, then we might never get back to capacity // limit if the sizes of entries allow each insertion to evict the minimum // charge. Thus, we should evict some extra if it's not a signifcant // portion of the shard capacity. This can have the side benefit of // involving fewer threads in eviction. size_t old_usage = usage_.LoadRelaxed(); size_t need_evict_charge; // NOTE: if total_charge > old_usage, there isn't yet enough to evict // `total_charge` amount. Even if we only try to evict `old_usage` amount, // there's likely something referenced and we would eat CPU looking for // enough to evict. 
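  // Illustrative example with made-up numbers: capacity=1024,
  // old_usage=1100 (already over capacity due to racing inserts), and
  // total_charge=100. Then 1100 + 100 > 1024 and 100 <= 1100, so the
  // condition below is false and we take the eviction branch:
  // need_evict_charge = 100 plus the "some extra" term
  // std::min(1024 / 1024, 100) + 1 = 2, for a total of 102.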
if (old_usage + total_charge <= capacity || total_charge > old_usage) { // Good enough for me (might run over with a race) need_evict_charge = 0; } else { // Try to evict enough space, and maybe some extra need_evict_charge = total_charge; if (old_usage > capacity) { // Not too much to avoid thundering herd while avoiding strict // synchronization, such as the compare_exchange used with strict // capacity limit. need_evict_charge += std::min(capacity / 1024, total_charge) + 1; } } if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { // Special case: require at least 1 eviction if we only have to // deal with occupancy need_evict_charge = 1; } EvictionData data; if (need_evict_charge > 0) { static_cast(this)->Evict(need_evict_charge, state, &data, eviction_effort_cap); // Deal with potential occupancy deficit if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) { assert(data.freed_charge == 0); // Can't meet occupancy requirement return false; } else { // Update occupancy for evictions occupancy_.FetchSub(data.freed_count); } } // Track new usage even if we weren't able to evict enough usage_.FetchAddRelaxed(total_charge - data.freed_charge); // No underflow assert(usage_.LoadRelaxed() < SIZE_MAX / 2); // Success return true; } void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) { bool took_value_ownership = false; if (eviction_callback_) { // For key reconstructed from hash UniqueId64x2 unhashed; took_value_ownership = eviction_callback_(ClockCacheShard::ReverseHash( h->GetHash(), &unhashed, hash_seed_), reinterpret_cast(h), h->meta.LoadRelaxed() & ClockHandle::kHitBitMask); } if (!took_value_ownership) { h->FreeData(allocator_); } MarkEmpty(*h); } bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data, uint32_t eviction_effort_cap) { // Basically checks whether the ratio of useful effort to wasted effort is // too low, with a start-up allowance for wasted effort before any useful // effort. return (data.freed_count + 1U) * uint64_t{eviction_effort_cap} <= data.seen_pinned_count; } template Status BaseClockTable::Insert(const ClockHandleBasicData& proto, typename Table::HandleImpl** handle, Cache::Priority priority, size_t capacity, uint32_t eec_and_scl) { using HandleImpl = typename Table::HandleImpl; Table& derived = static_cast(*this); typename Table::InsertState state; derived.StartInsert(state); // Do we have the available occupancy? Optimistically assume we do // and deal with it if we don't. size_t old_occupancy = occupancy_.FetchAdd(1); // Whether we over-committed and need an eviction to make up for it bool need_evict_for_occupancy = !derived.GrowIfNeeded(old_occupancy + 1, state); // Usage/capacity handling is somewhat different depending on // strict_capacity_limit, but mostly pessimistic. bool use_standalone_insert = false; const size_t total_charge = proto.GetTotalCharge(); // NOTE: we can use eec_and_scl as eviction_effort_cap below because // strict_capacity_limit=true is supposed to disable the limit on eviction // effort, and a large value effectively does that. if (eec_and_scl & kStrictCapacityLimitBit) { Status s = ChargeUsageMaybeEvictStrict
<Table>(
        total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state);
    if (!s.ok()) {
      // Revert occupancy
      occupancy_.FetchSubRelaxed(1);
      return s;
    }
  } else {
    // Case strict_capacity_limit == false
    bool success = ChargeUsageMaybeEvictNonStrict<Table>
( total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state); if (!success) { // Revert occupancy occupancy_.FetchSubRelaxed(1); if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry // inserted into cache and evicted immediately. proto.FreeData(allocator_); return Status::OK(); } else { // Need to track usage of fallback standalone insert usage_.FetchAddRelaxed(total_charge); use_standalone_insert = true; } } } if (!use_standalone_insert) { // Attempt a table insert, but abort if we find an existing entry for the // key. If we were to overwrite old entries, we would either // * Have to gain ownership over an existing entry to overwrite it, which // would only work if there are no outstanding (read) references and would // create a small gap in availability of the entry (old or new) to lookups. // * Have to insert into a suboptimal location (more probes) so that the // old entry can be kept around as well. uint64_t initial_countdown = GetInitialCountdown(priority); assert(initial_countdown > 0); HandleImpl* e = derived.DoInsert(proto, initial_countdown, handle != nullptr, state); if (e) { // Successfully inserted if (handle) { *handle = e; } return Status::OK(); } // Not inserted // Revert occupancy occupancy_.FetchSubRelaxed(1); // Maybe fall back on standalone insert if (handle == nullptr) { // Revert usage usage_.FetchSubRelaxed(total_charge); // No underflow assert(usage_.LoadRelaxed() < SIZE_MAX / 2); // As if unrefed entry immdiately evicted proto.FreeData(allocator_); return Status::OK(); } use_standalone_insert = true; } // Run standalone insert assert(use_standalone_insert); *handle = StandaloneInsert(proto); // The OkOverwritten status is used to count "redundant" insertions into // block cache. This implementation doesn't strictly check for redundant // insertions, but we instead are probably interested in how many insertions // didn't go into the table (instead "standalone"), which could be redundant // Insert or some other reason (use_standalone_insert reasons above). return Status::OkOverwritten(); } void BaseClockTable::Ref(ClockHandle& h) { // Increment acquire counter uint64_t old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); assert((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit); // Must have already had a reference assert(GetRefcount(old_meta) > 0); (void)old_meta; } #ifndef NDEBUG void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) { // Increment acquire counter uint64_t old_meta = h.meta.FetchAdd(n * ClockHandle::kAcquireIncrement); assert((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit); (void)old_meta; } void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) { assert(n > 0); // Like n-1 Releases, but assumes one more will happen in the caller to take // care of anything like erasing an unreferenced, invisible entry. 
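  // For example, TEST_ReleaseN(h, 3) (defined further below) performs two
  // bare release-counter increments here, and the caller then makes one
  // full Release() call for the final reference.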
uint64_t old_meta = h->meta.FetchAdd((n - 1) * ClockHandle::kReleaseIncrement); assert((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit); (void)old_meta; } #endif FixedHyperClockTable::FixedHyperClockTable( size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, hash_seed), length_bits_(CalcHashBits(capacity, opts.estimated_value_size, metadata_charge_policy)), length_bits_mask_((size_t{1} << length_bits_) - 1), occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * kStrictLoadFactor)), array_(new HandleImpl[size_t{1} << length_bits_]) { if (metadata_charge_policy == CacheMetadataChargePolicy::kFullChargeCacheMetadata) { usage_.FetchAddRelaxed(size_t{GetTableSize()} * sizeof(HandleImpl)); } static_assert(sizeof(HandleImpl) == 64U, "Expecting size / alignment with common cache line size"); } FixedHyperClockTable::~FixedHyperClockTable() { // Assumes there are no references or active operations on any slot/element // in the table. for (size_t i = 0; i < GetTableSize(); i++) { HandleImpl& h = array_[i]; switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { case ClockHandle::kStateEmpty: // noop break; case ClockHandle::kStateInvisible: // rare but possible case ClockHandle::kStateVisible: assert(GetRefcount(h.meta.LoadRelaxed()) == 0); h.FreeData(allocator_); #ifndef NDEBUG Rollback(h.hashed_key, &h); ReclaimEntryUsage(h.GetTotalCharge()); #endif break; // otherwise default: assert(false); break; } } #ifndef NDEBUG for (size_t i = 0; i < GetTableSize(); i++) { assert(array_[i].displacements.LoadRelaxed() == 0); } #endif assert(usage_.LoadRelaxed() == 0 || usage_.LoadRelaxed() == size_t{GetTableSize()} * sizeof(HandleImpl)); assert(occupancy_.LoadRelaxed() == 0); } void FixedHyperClockTable::StartInsert(InsertState&) {} bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) { return new_occupancy <= occupancy_limit_; } FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert( const ClockHandleBasicData& proto, uint64_t initial_countdown, bool keep_ref, InsertState&) { bool already_matches = false; HandleImpl* e = FindSlot( proto.hashed_key, [&](HandleImpl* h) { return TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches); }, [&](HandleImpl* h) { if (already_matches) { // Stop searching & roll back displacements Rollback(proto.hashed_key, h); return true; } else { // Keep going return false; } }, [&](HandleImpl* h, bool is_last) { if (is_last) { // Search is ending. Roll back displacements Rollback(proto.hashed_key, h); } else { h->displacements.FetchAddRelaxed(1); } }); if (already_matches) { // Insertion skipped return nullptr; } if (e != nullptr) { // Successfully inserted return e; } // Else, no available slot found. Occupancy check should generally prevent // this, except it's theoretically possible for other threads to evict and // replace entries in the right order to hit every slot when it is populated. // Assuming random hashing, the chance of that should be no higher than // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for // roughly n >= 256, so if this assertion fails, that suggests something is // going wrong. 
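  // Rough illustration of that bound (not a precise claim): if
  // kStrictLoadFactor were around 0.84, then pow(0.84, 256) is below 1e-19,
  // so for tables of at least 256 slots this path should effectively never
  // be reached.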
assert(GetTableSize() < 256); return nullptr; } FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup( const UniqueId64x2& hashed_key) { HandleImpl* e = FindSlot( hashed_key, [&](HandleImpl* h) { // Mostly branch-free version (similar performance) /* uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement, std::memory_order_acquire); bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; bool match = (h->key == key) & visible; h->meta.FetchSub(static_cast(Shareable & !match) << ClockHandle::kAcquireCounterShift); return match; */ // Optimistic lookup should pay off when the table is relatively // sparse. constexpr bool kOptimisticLookup = true; uint64_t old_meta; if (!kOptimisticLookup) { old_meta = h->meta.Load(); if ((old_meta >> ClockHandle::kStateShift) != ClockHandle::kStateVisible) { return false; } } // (Optimistically) increment acquire counter old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check if it's an entry visible to lookups if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference if (h->hashed_key == hashed_key) { // Match // Update the hit bit if (eviction_callback_) { h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); } return true; } else { // Mismatch. Pretend we never took the reference Unref(*h); } } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateInvisible)) { // Pretend we never took the reference Unref(*h); } else { // For other states, incrementing the acquire counter has no effect // so we don't need to undo it. Furthermore, we cannot safely undo // it because we did not acquire a read reference to lock the // entry in a Shareable state. } return false; }, [&](HandleImpl* h) { return h->displacements.LoadRelaxed() == 0; }, [&](HandleImpl* /*h*/, bool /*is_last*/) {}); return e; } bool FixedHyperClockTable::Release(HandleImpl* h, bool useful, bool erase_if_last_ref) { // In contrast with LRUCache's Release, this function won't delete the handle // when the cache is above capacity and the reference is the last one. Space // is only freed up by EvictFromClock (called by Insert when space is needed) // and Erase. We do this to avoid an extra atomic read of the variable usage_. uint64_t old_meta; if (useful) { // Increment release counter to indicate was used old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); } else { // Decrement acquire counter to pretend it never happened old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); } assert((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit); // No underflow assert(((old_meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask) != ((old_meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask)); if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == ClockHandle::kStateInvisible)) { // FIXME: There's a chance here that another thread could replace this // entry and we end up erasing the wrong one. 
// Update for last FetchAdd op if (useful) { old_meta += ClockHandle::kReleaseIncrement; } else { old_meta -= ClockHandle::kAcquireIncrement; } // Take ownership if no refs do { if (GetRefcount(old_meta) != 0) { // Not last ref at some point in time during this Release call // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h->meta); return false; } if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift)) == 0) { // Someone else took ownership return false; } // Note that there's a small chance that we release, another thread // replaces this entry with another, reaches zero refs, and then we end // up erasing that other entry. That's an acceptable risk / imprecision. } while ( !h->meta.CasWeak(old_meta, uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift)); // Took ownership size_t total_charge = h->GetTotalCharge(); if (UNLIKELY(h->IsStandalone())) { h->FreeData(allocator_); // Delete standalone handle delete h; standalone_usage_.FetchSubRelaxed(total_charge); usage_.FetchSubRelaxed(total_charge); } else { Rollback(h->hashed_key, h); FreeDataMarkEmpty(*h, allocator_); ReclaimEntryUsage(total_charge); } return true; } else { // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h->meta); return false; } } #ifndef NDEBUG void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { if (n > 0) { // Do n-1 simple releases first TEST_ReleaseNMinus1(h, n); // Then the last release might be more involved Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); } } #endif void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) { (void)FindSlot( hashed_key, [&](HandleImpl* h) { // Could be multiple entries in rare cases. Erase them all. // Optimistically increment acquire counter uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check if it's an entry visible to lookups if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference if (h->hashed_key == hashed_key) { // Match. Set invisible. old_meta = h->meta.FetchAnd(~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift)); // Apply update to local copy old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift); for (;;) { uint64_t refcount = GetRefcount(old_meta); assert(refcount > 0); if (refcount > 1) { // Not last ref at some point in time during this Erase call // Pretend we never took the reference Unref(*h); break; } else if (h->meta.CasWeak( old_meta, uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift)) { // Took ownership assert(hashed_key == h->hashed_key); size_t total_charge = h->GetTotalCharge(); FreeDataMarkEmpty(*h, allocator_); ReclaimEntryUsage(total_charge); // We already have a copy of hashed_key in this case, so OK to // delay Rollback until after releasing the entry Rollback(hashed_key, h); break; } } } else { // Mismatch. Pretend we never took the reference Unref(*h); } } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateInvisible)) { // Pretend we never took the reference Unref(*h); } else { // For other states, incrementing the acquire counter has no effect // so we don't need to undo it. 
} return false; }, [&](HandleImpl* h) { return h->displacements.LoadRelaxed() == 0; }, [&](HandleImpl* /*h*/, bool /*is_last*/) {}); } void FixedHyperClockTable::EraseUnRefEntries() { for (size_t i = 0; i <= this->length_bits_mask_; i++) { HandleImpl& h = array_[i]; uint64_t old_meta = h.meta.LoadRelaxed(); if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift) && GetRefcount(old_meta) == 0 && h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift)) { // Took ownership size_t total_charge = h.GetTotalCharge(); Rollback(h.hashed_key, &h); FreeDataMarkEmpty(h, allocator_); ReclaimEntryUsage(total_charge); } } } template inline FixedHyperClockTable::HandleImpl* FixedHyperClockTable::FindSlot( const UniqueId64x2& hashed_key, const MatchFn& match_fn, const AbortFn& abort_fn, const UpdateFn& update_fn) { // NOTE: upper 32 bits of hashed_key[0] is used for sharding // // We use double-hashing probing. Every probe in the sequence is a // pseudorandom integer, computed as a linear function of two random hashes, // which we call base and increment. Specifically, the i-th probe is base + i // * increment modulo the table size. size_t base = static_cast(hashed_key[1]); // We use an odd increment, which is relatively prime with the power-of-two // table size. This implies that we cycle back to the first probe only // after probing every slot exactly once. // TODO: we could also reconsider linear probing, though locality benefits // are limited because each slot is a full cache line size_t increment = static_cast(hashed_key[0]) | 1U; size_t first = ModTableSize(base); size_t current = first; bool is_last; do { HandleImpl* h = &array_[current]; if (match_fn(h)) { return h; } if (abort_fn(h)) { return nullptr; } current = ModTableSize(current + increment); is_last = current == first; update_fn(h, is_last); } while (!is_last); // We looped back. return nullptr; } inline void FixedHyperClockTable::Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h) { size_t current = ModTableSize(hashed_key[1]); size_t increment = static_cast(hashed_key[0]) | 1U; while (&array_[current] != h) { array_[current].displacements.FetchSubRelaxed(1); current = ModTableSize(current + increment); } } inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) { auto old_occupancy = occupancy_.FetchSub(1U); (void)old_occupancy; // No underflow assert(old_occupancy > 0); auto old_usage = usage_.FetchSubRelaxed(total_charge); (void)old_usage; // No underflow assert(old_usage >= total_charge); } inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, EvictionData* data, uint32_t eviction_effort_cap) { // precondition assert(requested_charge > 0); // TODO: make a tuning parameter? constexpr size_t step_size = 4; // First (concurrent) increment clock pointer uint64_t old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size); // Cap the eviction effort at this thread (along with those operating in // parallel) circling through the whole structure kMaxCountdown times. // In other words, this eviction run must find something/anything that is // unreferenced at start of and during the eviction run that isn't reclaimed // by a concurrent eviction run. 
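  // Concretely, the cap below lets the clock pointer advance by at most
  // kMaxCountdown full table lengths (kMaxCountdown << length_bits_ slots)
  // beyond its starting position; e.g., if kMaxCountdown were 3, that is
  // three complete passes over the table before giving up.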
  uint64_t max_clock_pointer =
      old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);

  for (;;) {
    for (size_t i = 0; i < step_size; i++) {
      HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
      bool evicting = ClockUpdate(h, data);
      if (evicting) {
        Rollback(h.hashed_key, &h);
        TrackAndReleaseEvictedEntry(&h);
      }
    }

    // Loop exit condition
    if (data->freed_charge >= requested_charge) {
      return;
    }
    if (old_clock_pointer >= max_clock_pointer) {
      return;
    }
    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
      eviction_effort_exceeded_count_.FetchAddRelaxed(1);
      return;
    }

    // Advance clock pointer (concurrently)
    old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size);
  }
}

template <class Table>
ClockCacheShard
<Table>::ClockCacheShard(
    size_t capacity, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy,
    MemoryAllocator* allocator,
    const Cache::EvictionCallback* eviction_callback,
    const uint32_t* hash_seed, const typename Table::Opts& opts)
    : CacheShardBase(metadata_charge_policy),
      table_(capacity, metadata_charge_policy, allocator, eviction_callback,
             hash_seed, opts),
      capacity_(capacity),
      eec_and_scl_(SanitizeEncodeEecAndScl(opts.eviction_effort_cap,
                                           strict_capacity_limit)) {
  // Initial charge metadata should not exceed capacity
  assert(table_.GetUsage() <= capacity_.LoadRelaxed() ||
         capacity_.LoadRelaxed() < sizeof(HandleImpl));
}

template <class Table>
void ClockCacheShard
<Table>::EraseUnRefEntries() {
  table_.EraseUnRefEntries();
}

template <class Table>
void ClockCacheShard
<Table>::ApplyToSomeEntries(
    const std::function<void(const Slice& key, Cache::ObjectPtr value,
                             size_t charge,
                             const Cache::CacheItemHelper* helper)>& callback,
    size_t average_entries_per_lock, size_t* state) {
  // The state will be a simple index into the table. Even with a dynamic
  // hyper clock cache, entries will generally stay in their existing
  // slots, so we don't need to be aware of the high-level organization
  // that makes lookup efficient.
  size_t length = table_.GetTableSize();

  assert(average_entries_per_lock > 0);
  size_t index_begin = *state;
  size_t index_end = index_begin + average_entries_per_lock;
  if (index_end >= length) {
    // Going to end.
    index_end = length;
    *state = SIZE_MAX;
  } else {
    *state = index_end;
  }

  auto hash_seed = table_.GetHashSeed();
  ConstApplyToEntriesRange(
      [callback, hash_seed](const HandleImpl& h) {
        UniqueId64x2 unhashed;
        callback(ReverseHash(h.hashed_key, &unhashed, hash_seed), h.value,
                 h.GetTotalCharge(), h.helper);
      },
      table_.HandlePtr(index_begin), table_.HandlePtr(index_end), false);
}

int FixedHyperClockTable::CalcHashBits(
    size_t capacity, size_t estimated_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
  double average_slot_charge = estimated_value_size * kLoadFactor;
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    average_slot_charge += sizeof(HandleImpl);
  }
  assert(average_slot_charge > 0.0);
  uint64_t num_slots =
      static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);

  int hash_bits = FloorLog2((num_slots << 1) - 1);
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    // For very small estimated value sizes, it's possible to overshoot
    while (hash_bits > 0 &&
           uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
      hash_bits--;
    }
  }
  return hash_bits;
}

template <class Table>
void ClockCacheShard
<Table>::SetCapacity(size_t capacity) {
  capacity_.StoreRelaxed(capacity);
  // next Insert will take care of any necessary evictions
}

template <class Table>
void ClockCacheShard
<Table>::SetStrictCapacityLimit(
    bool strict_capacity_limit) {
  if (strict_capacity_limit) {
    eec_and_scl_.FetchOrRelaxed(kStrictCapacityLimitBit);
  } else {
    eec_and_scl_.FetchAndRelaxed(~kStrictCapacityLimitBit);
  }
  // next Insert will take care of any necessary evictions
}

template <class Table>
Status ClockCacheShard
<Table>::Insert(const Slice& key,
                const UniqueId64x2& hashed_key,
                Cache::ObjectPtr value,
                const Cache::CacheItemHelper* helper,
                size_t charge, HandleImpl** handle,
                Cache::Priority priority) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return Status::NotSupported("ClockCache only supports key size " +
                                std::to_string(kCacheKeySize) + "B");
  }
  ClockHandleBasicData proto;
  proto.hashed_key = hashed_key;
  proto.value = value;
  proto.helper = helper;
  proto.total_charge = charge;
  return table_.template Insert
<Table>(proto, handle, priority,
        capacity_.LoadRelaxed(),
        eec_and_scl_.LoadRelaxed());
}

template <class Table>
typename Table::HandleImpl* ClockCacheShard
<Table>::CreateStandalone(
    const Slice& key, const UniqueId64x2& hashed_key, Cache::ObjectPtr obj,
    const Cache::CacheItemHelper* helper, size_t charge,
    bool allow_uncharged) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return nullptr;
  }
  ClockHandleBasicData proto;
  proto.hashed_key = hashed_key;
  proto.value = obj;
  proto.helper = helper;
  proto.total_charge = charge;
  return table_.template CreateStandalone
<Table>(proto, capacity_.LoadRelaxed(),
        eec_and_scl_.LoadRelaxed(),
        allow_uncharged);
}

template <class Table>
typename ClockCacheShard
<Table>::HandleImpl* ClockCacheShard
<Table>::Lookup(
    const Slice& key, const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return nullptr;
  }
  return table_.Lookup(hashed_key);
}

template <class Table>
bool ClockCacheShard
<Table>::Ref(HandleImpl* h) {
  if (h == nullptr) {
    return false;
  }
  table_.Ref(*h);
  return true;
}

template <class Table>
bool ClockCacheShard
<Table>::Release(HandleImpl* handle, bool useful,
                 bool erase_if_last_ref) {
  if (handle == nullptr) {
    return false;
  }
  return table_.Release(handle, useful, erase_if_last_ref);
}

#ifndef NDEBUG
template <class Table>
void ClockCacheShard
<Table>::TEST_RefN(HandleImpl* h, size_t n) {
  table_.TEST_RefN(*h, n);
}

template <class Table>
void ClockCacheShard
<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
  table_.TEST_ReleaseN(h, n);
}
#endif

template <class Table>
bool ClockCacheShard
<Table>::Release(HandleImpl* handle,
                 bool erase_if_last_ref) {
  return Release(handle, /*useful=*/true, erase_if_last_ref);
}

template <class Table>
void ClockCacheShard
<Table>::Erase(const Slice& key,
               const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return;
  }
  table_.Erase(hashed_key);
}

template <class Table>
size_t ClockCacheShard
<Table>::GetUsage() const {
  return table_.GetUsage();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetStandaloneUsage() const {
  return table_.GetStandaloneUsage();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetCapacity() const {
  return capacity_.LoadRelaxed();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetPinnedUsage() const {
  // Computes the pinned usage by scanning the whole hash table. This
  // is slow, but avoids keeping an exact counter on the clock usage,
  // i.e., the number of not externally referenced elements.
  // Why avoid this counter? Because Lookup removes elements from the clock
  // list, so it would need to update the pinned usage every time,
  // which creates additional synchronization costs.
  size_t table_pinned_usage = 0;
  const bool charge_metadata =
      metadata_charge_policy_ == kFullChargeCacheMetadata;
  ConstApplyToEntriesRange(
      [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
        uint64_t meta = h.meta.LoadRelaxed();
        uint64_t refcount = GetRefcount(meta);
        // Holding one ref for ConstApplyToEntriesRange
        assert(refcount > 0);
        if (refcount > 1) {
          table_pinned_usage += h.GetTotalCharge();
          if (charge_metadata) {
            table_pinned_usage += sizeof(HandleImpl);
          }
        }
      },
      table_.HandlePtr(0), table_.HandlePtr(table_.GetTableSize()), true);

  return table_pinned_usage + table_.GetStandaloneUsage();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetOccupancyCount() const {
  return table_.GetOccupancy();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetOccupancyLimit() const {
  return table_.GetOccupancyLimit();
}

template <class Table>
size_t ClockCacheShard
<Table>::GetTableAddressCount() const {
  return table_.GetTableSize();
}

// Explicit instantiation
template class ClockCacheShard<FixedHyperClockTable>;
template class ClockCacheShard<AutoHyperClockTable>;

template <class Table>
BaseHyperClockCache
<Table>::BaseHyperClockCache(
    const HyperClockCacheOptions& opts)
    : ShardedCache<ClockCacheShard<Table>>(opts) {
  // TODO: should not need to go through two levels of pointer indirection to
  // get to table entries
  size_t per_shard = this->GetPerShardCapacity();
  MemoryAllocator* alloc = this->memory_allocator();
  this->InitShards([&](Shard* cs) {
    typename Table::Opts table_opts{opts};
    new (cs) Shard(per_shard, opts.strict_capacity_limit,
                   opts.metadata_charge_policy, alloc,
                   &this->eviction_callback_, &this->hash_seed_, table_opts);
  });
}

template <class Table>
Cache::ObjectPtr BaseHyperClockCache
<Table>::Value(Handle* handle) {
  return reinterpret_cast<const typename Table::HandleImpl*>(handle)->value;
}

template <class Table>
size_t BaseHyperClockCache
<Table>::GetCharge(Handle* handle) const {
  return reinterpret_cast<const typename Table::HandleImpl*>(handle)
      ->GetTotalCharge();
}

template <class Table>
const Cache::CacheItemHelper* BaseHyperClockCache<Table>
::GetCacheItemHelper( Handle* handle) const { auto h = reinterpret_cast(handle); return h->helper; } namespace { // For each cache shard, estimate what the table load factor would be if // cache filled to capacity with average entries. This is considered // indicative of a potential problem if the shard is essentially operating // "at limit", which we define as high actual usage (>80% of capacity) // or actual occupancy very close to limit (>95% of limit). // Also, for each shard compute the recommended estimated_entry_charge, // and keep the minimum one for use as overall recommendation. void AddShardEvaluation(const FixedHyperClockCache::Shard& shard, std::vector& predicted_load_factors, size_t& min_recommendation) { size_t usage = shard.GetUsage() - shard.GetStandaloneUsage(); size_t capacity = shard.GetCapacity(); double usage_ratio = 1.0 * usage / capacity; size_t occupancy = shard.GetOccupancyCount(); size_t occ_limit = shard.GetOccupancyLimit(); double occ_ratio = 1.0 * occupancy / occ_limit; if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) { // Skip as described above return; } // If filled to capacity, what would the occupancy ratio be? double ratio = occ_ratio / usage_ratio; // Given max load factor, what that load factor be? double lf = ratio * FixedHyperClockTable::kStrictLoadFactor; predicted_load_factors.push_back(lf); // Update min_recommendation also size_t recommendation = usage / occupancy; min_recommendation = std::min(min_recommendation, recommendation); } bool IsSlotOccupied(const ClockHandle& h) { return (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) != 0; } } // namespace // NOTE: GCC might warn about subobject linkage if this is in anon namespace template class LoadVarianceStats { public: std::string Report() const { return "Overall " + PercentStr(positive_count_, samples_) + " (" + std::to_string(positive_count_) + "/" + std::to_string(samples_) + "), Min/Max/Window = " + PercentStr(min_, N) + "/" + PercentStr(max_, N) + "/" + std::to_string(N) + ", MaxRun{Pos/Neg} = " + std::to_string(max_pos_run_) + "/" + std::to_string(max_neg_run_); } void Add(bool positive) { recent_[samples_ % N] = positive; if (positive) { ++positive_count_; ++cur_pos_run_; max_pos_run_ = std::max(max_pos_run_, cur_pos_run_); cur_neg_run_ = 0; } else { ++cur_neg_run_; max_neg_run_ = std::max(max_neg_run_, cur_neg_run_); cur_pos_run_ = 0; } ++samples_; if (samples_ >= N) { size_t count_set = recent_.count(); max_ = std::max(max_, count_set); min_ = std::min(min_, count_set); } } private: size_t max_ = 0; size_t min_ = N; size_t positive_count_ = 0; size_t samples_ = 0; size_t max_pos_run_ = 0; size_t cur_pos_run_ = 0; size_t max_neg_run_ = 0; size_t cur_neg_run_ = 0; std::bitset recent_; static std::string PercentStr(size_t a, size_t b) { if (b == 0) { return "??%"; } else { return std::to_string(uint64_t{100} * a / b) + "%"; } } }; template void BaseHyperClockCache
<Table>::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
    LoadVarianceStats slot_stats;
    uint64_t eviction_effort_exceeded_count = 0;
    this->ForEachShard([&](const BaseHyperClockCache<Table>
::Shard* shard) { size_t count = shard->GetTableAddressCount(); for (size_t i = 0; i < count; ++i) { slot_stats.Add(IsSlotOccupied(*shard->GetTable().HandlePtr(i))); } eviction_effort_exceeded_count += shard->GetTable().GetEvictionEffortExceededCount(); }); ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, "Slot occupancy stats: %s", slot_stats.Report().c_str()); ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, "Eviction effort exceeded: %" PRIu64, eviction_effort_exceeded_count); } } void FixedHyperClockCache::ReportProblems( const std::shared_ptr& info_log) const { BaseHyperClockCache::ReportProblems(info_log); uint32_t shard_count = GetNumShards(); std::vector predicted_load_factors; size_t min_recommendation = SIZE_MAX; ForEachShard([&](const FixedHyperClockCache::Shard* shard) { AddShardEvaluation(*shard, predicted_load_factors, min_recommendation); }); if (predicted_load_factors.empty()) { // None operating "at limit" -> nothing to report return; } std::sort(predicted_load_factors.begin(), predicted_load_factors.end()); // First, if the average load factor is within spec, we aren't going to // complain about a few shards being out of spec. // NOTE: this is only the average among cache shards operating "at limit," // which should be representative of what we care about. It it normal, even // desirable, for a cache to operate "at limit" so this should not create // selection bias. See AddShardEvaluation(). // TODO: Consider detecting cases where decreasing the number of shards // would be good, e.g. serious imbalance among shards. double average_load_factor = std::accumulate(predicted_load_factors.begin(), predicted_load_factors.end(), 0.0) / shard_count; constexpr double kLowSpecLoadFactor = FixedHyperClockTable::kLoadFactor / 2; constexpr double kMidSpecLoadFactor = FixedHyperClockTable::kLoadFactor / 1.414; if (average_load_factor > FixedHyperClockTable::kLoadFactor) { // Out of spec => Consider reporting load factor too high // Estimate effective overall capacity loss due to enforcing occupancy limit double lost_portion = 0.0; int over_count = 0; for (double lf : predicted_load_factors) { if (lf > FixedHyperClockTable::kStrictLoadFactor) { ++over_count; lost_portion += (lf - FixedHyperClockTable::kStrictLoadFactor) / lf / shard_count; } } // >= 20% loss -> error // >= 10% loss -> consistent warning // >= 1% loss -> intermittent warning InfoLogLevel level = InfoLogLevel::INFO_LEVEL; bool report = true; if (lost_portion > 0.2) { level = InfoLogLevel::ERROR_LEVEL; } else if (lost_portion > 0.1) { level = InfoLogLevel::WARN_LEVEL; } else if (lost_portion > 0.01) { int report_percent = static_cast(lost_portion * 100.0); if (Random::GetTLSInstance()->PercentTrue(report_percent)) { level = InfoLogLevel::WARN_LEVEL; } } else { // don't report report = false; } if (report) { ROCKS_LOG_AT_LEVEL( info_log, level, "FixedHyperClockCache@%p unable to use estimated %.1f%% capacity " "because of full occupancy in %d/%u cache shards " "(estimated_entry_charge too high). " "Recommend estimated_entry_charge=%zu", this, lost_portion * 100.0, over_count, (unsigned)shard_count, min_recommendation); } } else if (average_load_factor < kLowSpecLoadFactor) { // Out of spec => Consider reporting load factor too low // But cautiously because low is not as big of a problem. 
// Only report if highest occupancy shard is also below // spec and only if average is substantially out of spec if (predicted_load_factors.back() < kLowSpecLoadFactor && average_load_factor < kLowSpecLoadFactor / 1.414) { InfoLogLevel level = InfoLogLevel::INFO_LEVEL; if (average_load_factor < kLowSpecLoadFactor / 2) { level = InfoLogLevel::WARN_LEVEL; } ROCKS_LOG_AT_LEVEL( info_log, level, "FixedHyperClockCache@%p table has low occupancy at full capacity. " "Higher estimated_entry_charge (about %.1fx) would likely improve " "performance. Recommend estimated_entry_charge=%zu", this, kMidSpecLoadFactor / average_load_factor, min_recommendation); } } } // ======================================================================= // AutoHyperClockCache // ======================================================================= // See AutoHyperClockTable::length_info_ etc. for how the linear hashing // metadata is encoded. Here are some example values: // // Used length | min shift | threshold | max shift // 2 | 1 | 0 | 1 // 3 | 1 | 1 | 2 // 4 | 2 | 0 | 2 // 5 | 2 | 1 | 3 // 6 | 2 | 2 | 3 // 7 | 2 | 3 | 3 // 8 | 3 | 0 | 3 // 9 | 3 | 1 | 4 // ... // Note: // * min shift = floor(log2(used length)) // * max shift = ceil(log2(used length)) // * used length == (1 << shift) + threshold // Also, shift=0 is never used in practice, so is reserved for "unset" namespace { inline int LengthInfoToMinShift(uint64_t length_info) { int mask_shift = BitwiseAnd(length_info, int{255}); assert(mask_shift <= 63); assert(mask_shift > 0); return mask_shift; } inline size_t LengthInfoToThreshold(uint64_t length_info) { return static_cast(length_info >> 8); } inline size_t LengthInfoToUsedLength(uint64_t length_info) { size_t threshold = LengthInfoToThreshold(length_info); int shift = LengthInfoToMinShift(length_info); assert(threshold < (size_t{1} << shift)); size_t used_length = (size_t{1} << shift) + threshold; assert(used_length >= 2); return used_length; } inline uint64_t UsedLengthToLengthInfo(size_t used_length) { assert(used_length >= 2); int shift = FloorLog2(used_length); uint64_t threshold = BottomNBits(used_length, shift); uint64_t length_info = (uint64_t{threshold} << 8) + static_cast(shift); assert(LengthInfoToUsedLength(length_info) == used_length); assert(LengthInfoToMinShift(length_info) == shift); assert(LengthInfoToThreshold(length_info) == threshold); return length_info; } inline size_t GetStartingLength(size_t capacity) { if (capacity > port::kPageSize) { // Start with one memory page return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl); } else { // Mostly to make unit tests happy return 4; } } inline size_t GetHomeIndex(uint64_t hash, int shift) { return static_cast(BottomNBits(hash, shift)); } inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash, size_t* home, int* shift) { int min_shift = LengthInfoToMinShift(length_info); size_t threshold = LengthInfoToThreshold(length_info); bool extra_shift = GetHomeIndex(hash, min_shift) < threshold; *home = GetHomeIndex(hash, min_shift + extra_shift); *shift = min_shift + extra_shift; assert(*home < LengthInfoToUsedLength(length_info)); } inline int GetShiftFromNextWithShift(uint64_t next_with_shift) { return BitwiseAnd(next_with_shift, AutoHyperClockTable::HandleImpl::kShiftMask); } inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) { return static_cast(next_with_shift >> AutoHyperClockTable::HandleImpl::kNextShift); } inline uint64_t MakeNextWithShift(size_t next, int shift) { return (uint64_t{next} << 
AutoHyperClockTable::HandleImpl::kNextShift) | static_cast(shift); } inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) { return AutoHyperClockTable::HandleImpl::kNextEndFlags | MakeNextWithShift(head, shift); } // Helper function for Lookup inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, int shift = 0, size_t home = 0, bool* full_match_or_unknown = nullptr) { // Must be at least something to match assert(hashed_key || shift > 0); uint64_t old_meta; // (Optimistically) increment acquire counter. old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check if it's a referencable (sharable) entry if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift)) == 0) { // For non-sharable states, incrementing the acquire counter has no effect // so we don't need to undo it. Furthermore, we cannot safely undo // it because we did not acquire a read reference to lock the // entry in a Shareable state. if (full_match_or_unknown) { *full_match_or_unknown = true; } return false; } // Else acquired a read reference assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); if (hashed_key && h.hashed_key == *hashed_key && LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift))) { // Match on full key, visible if (full_match_or_unknown) { *full_match_or_unknown = true; } return true; } else if (shift > 0 && home == BottomNBits(h.hashed_key[1], shift)) { // NOTE: upper 32 bits of hashed_key[0] is used for sharding // Match on home address, possibly invisible if (full_match_or_unknown) { *full_match_or_unknown = false; } return true; } else { // Mismatch. Pretend we never took the reference Unref(h); if (full_match_or_unknown) { *full_match_or_unknown = false; } return false; } } // Assumes a chain rewrite lock prevents concurrent modification of // these chain pointers void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr, size_t& frontier, uint64_t stop_before_or_new_tail, int old_shift, int new_shift) { assert(frontier != SIZE_MAX); assert(new_shift == old_shift + 1); (void)old_shift; (void)new_shift; using HandleImpl = AutoHyperClockTable::HandleImpl; for (;;) { uint64_t next_with_shift = arr[frontier].chain_next_with_shift.Load(); assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); if (next_with_shift == stop_before_or_new_tail) { // Stopping at entry with pointer matching "stop before" assert(!HandleImpl::IsEnd(next_with_shift)); return; } if (HandleImpl::IsEnd(next_with_shift)) { // Also update tail to new tail assert(HandleImpl::IsEnd(stop_before_or_new_tail)); arr[frontier].chain_next_with_shift.Store(stop_before_or_new_tail); // Mark nothing left to upgrade frontier = SIZE_MAX; return; } // Next is another entry to process, so upgrade and advance frontier arr[frontier].chain_next_with_shift.FetchAdd(1U); assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift); frontier = GetNextFromNextWithShift(next_with_shift); } } size_t CalcOccupancyLimit(size_t used_length) { return static_cast(used_length * AutoHyperClockTable::kMaxLoadFactor + 0.999); } } // namespace // An RAII wrapper for locking a chain of entries (flag bit on the head) // so that there is only one thread allowed to remove entries from the // chain, or to rewrite it by splitting for Grow. Without the lock, // all lookups and insertions at the head can proceed wait-free. 
// The class also provides functions for safely manipulating the head pointer
// while holding the lock--or wanting to should it become non-empty.
//
// The flag bits on the head are such that the head cannot be locked if it
// is an empty chain, so that a "blind" FetchOr will try to lock a non-empty
// chain but have no effect on an empty chain. When a potential rewrite
// operation sees an empty head pointer, there is no need to lock as the
// operation is a no-op. However, there are some cases such as CAS-update
// where locking might be required after initially not being needed, if the
// operation is forced to revisit the head pointer.
class AutoHyperClockTable::ChainRewriteLock {
 public:
  using HandleImpl = AutoHyperClockTable::HandleImpl;

  // Acquire lock if head of h is not an end
  explicit ChainRewriteLock(HandleImpl* h,
                            RelaxedAtomic<uint64_t>& yield_count)
      : head_ptr_(&h->head_next_with_shift) {
    Acquire(yield_count);
  }

  // RAII wrap existing lock held (or end)
  explicit ChainRewriteLock(HandleImpl* h,
                            RelaxedAtomic<uint64_t>& /*yield_count*/,
                            uint64_t already_locked_or_end)
      : head_ptr_(&h->head_next_with_shift) {
    saved_head_ = already_locked_or_end;  // already locked or end
    assert(saved_head_ & HandleImpl::kHeadLocked);
  }

  ~ChainRewriteLock() {
    if (!IsEnd()) {
      // Release lock
      uint64_t old = head_ptr_->FetchAnd(~HandleImpl::kHeadLocked);
      (void)old;
      assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked);
    }
  }

  void Reset(HandleImpl* h, RelaxedAtomic<uint64_t>& yield_count) {
    this->~ChainRewriteLock();
    new (this) ChainRewriteLock(h, yield_count);
  }

  // Expected current state, assuming no parallel updates.
  uint64_t GetSavedHead() const { return saved_head_; }

  bool CasUpdate(uint64_t next_with_shift,
                 RelaxedAtomic<uint64_t>& yield_count) {
    uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked;
    uint64_t expected = GetSavedHead();
    bool success = head_ptr_->CasStrong(expected, new_head);
    if (success) {
      // Ensure IsEnd() is kept up-to-date, including for dtor
      saved_head_ = new_head;
    } else {
      // Parallel update to head, such as Insert()
      if (IsEnd()) {
        // Didn't previously hold a lock
        if (HandleImpl::IsEnd(expected)) {
          // Still don't need to
          saved_head_ = expected;
        } else {
          // Need to acquire lock before proceeding
          Acquire(yield_count);
        }
      } else {
        // Parallel update must preserve our lock
        assert((expected & HandleImpl::kNextEndFlags) ==
               HandleImpl::kHeadLocked);
        saved_head_ = expected;
      }
    }
    return success;
  }

  bool IsEnd() const { return HandleImpl::IsEnd(saved_head_); }

 private:
  void Acquire(RelaxedAtomic<uint64_t>& yield_count) {
    for (;;) {
      // Acquire removal lock on the chain
      uint64_t old_head = head_ptr_->FetchOr(HandleImpl::kHeadLocked);
      if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) {
        // Either acquired the lock or lock not needed (end)
        assert((old_head & HandleImpl::kNextEndFlags) == 0 ||
               (old_head & HandleImpl::kNextEndFlags) ==
                   HandleImpl::kNextEndFlags);
        saved_head_ = old_head | HandleImpl::kHeadLocked;
        break;
      }
      // NOTE: one of the few yield-wait loops, which is rare enough in
      // practice for its performance to be insignificant. (E.g. using C++20
      // atomic wait/notify would likely be worse because of wasted notify
      // costs.)
yield_count.FetchAddRelaxed(1); std::this_thread::yield(); } } AcqRelAtomic* head_ptr_; uint64_t saved_head_; }; AutoHyperClockTable::AutoHyperClockTable( size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, hash_seed), array_(MemMapping::AllocateLazyZeroed( sizeof(HandleImpl) * CalcMaxUsableLength(capacity, opts.min_avg_value_size, metadata_charge_policy))), length_info_(UsedLengthToLengthInfo(GetStartingLength(capacity))), occupancy_limit_( CalcOccupancyLimit(LengthInfoToUsedLength(length_info_.Load()))), grow_frontier_(GetTableSize()), clock_pointer_mask_( BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.Load()))) { if (metadata_charge_policy == CacheMetadataChargePolicy::kFullChargeCacheMetadata) { // NOTE: ignoring page boundaries for simplicity usage_.FetchAddRelaxed(size_t{GetTableSize()} * sizeof(HandleImpl)); } static_assert(sizeof(HandleImpl) == 64U, "Expecting size / alignment with common cache line size"); // Populate head pointers uint64_t length_info = length_info_.Load(); int min_shift = LengthInfoToMinShift(length_info); int max_shift = min_shift + 1; size_t major = uint64_t{1} << min_shift; size_t used_length = GetTableSize(); assert(major <= used_length); assert(used_length <= major * 2); // Initialize the initial usable set of slots. This slightly odd iteration // order makes it easier to get the correct shift amount on each head. for (size_t i = 0; i < major; ++i) { #ifndef NDEBUG int shift; size_t home; #endif if (major + i < used_length) { array_[i].head_next_with_shift.StoreRelaxed( MakeNextWithShiftEnd(i, max_shift)); array_[major + i].head_next_with_shift.StoreRelaxed( MakeNextWithShiftEnd(major + i, max_shift)); #ifndef NDEBUG // Extra invariant checking GetHomeIndexAndShift(length_info, i, &home, &shift); assert(home == i); assert(shift == max_shift); GetHomeIndexAndShift(length_info, major + i, &home, &shift); assert(home == major + i); assert(shift == max_shift); #endif } else { array_[i].head_next_with_shift.StoreRelaxed( MakeNextWithShiftEnd(i, min_shift)); #ifndef NDEBUG // Extra invariant checking GetHomeIndexAndShift(length_info, i, &home, &shift); assert(home == i); assert(shift == min_shift); GetHomeIndexAndShift(length_info, major + i, &home, &shift); assert(home == i); assert(shift == min_shift); #endif } } } AutoHyperClockTable::~AutoHyperClockTable() { // As usual, destructor assumes there are no references or active operations // on any slot/element in the table. // It's possible that there were not enough Insert() after final concurrent // Grow to ensure length_info_ (published GetTableSize()) is fully up to // date. Probe for first unused slot to ensure we see the whole structure. 
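  // Illustrative example (hypothetical values, not from the original
  // comments) of the length_info_ encoding that GetTableSize() below decodes;
  // see the encoding table near the top of the AutoHyperClockCache section:
  //   assert(UsedLengthToLengthInfo(10) == (uint64_t{2} << 8) + 3);  // 0x203
  //   assert(LengthInfoToMinShift(0x203) == 3);
  //   assert(LengthInfoToThreshold(0x203) == 2);
  //   assert(LengthInfoToUsedLength(0x203) == 10);
  // A hash whose bottom 3 bits are below the threshold (e.g. ...001) uses one
  // extra bit (4 bits) for its home; otherwise (e.g. ...101) it uses 3 bits.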
size_t used_end = GetTableSize(); while (used_end < array_.Count() && array_[used_end].head_next_with_shift.LoadRelaxed() != HandleImpl::kUnusedMarker) { used_end++; } #ifndef NDEBUG for (size_t i = used_end; i < array_.Count(); i++) { assert(array_[i].head_next_with_shift.LoadRelaxed() == 0); assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0); assert(array_[i].meta.LoadRelaxed() == 0); } std::vector was_populated(used_end); std::vector was_pointed_to(used_end); #endif for (size_t i = 0; i < used_end; i++) { HandleImpl& h = array_[i]; switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { case ClockHandle::kStateEmpty: // noop break; case ClockHandle::kStateInvisible: // rare but possible case ClockHandle::kStateVisible: assert(GetRefcount(h.meta.LoadRelaxed()) == 0); h.FreeData(allocator_); #ifndef NDEBUG // Extra invariant checking usage_.FetchSubRelaxed(h.total_charge); occupancy_.FetchSubRelaxed(1U); was_populated[i] = true; if (!HandleImpl::IsEnd(h.chain_next_with_shift.LoadRelaxed())) { assert((h.chain_next_with_shift.LoadRelaxed() & HandleImpl::kHeadLocked) == 0); size_t next = GetNextFromNextWithShift(h.chain_next_with_shift.LoadRelaxed()); assert(!was_pointed_to[next]); was_pointed_to[next] = true; } #endif break; // otherwise default: assert(false); break; } #ifndef NDEBUG // Extra invariant checking if (!HandleImpl::IsEnd(h.head_next_with_shift.LoadRelaxed())) { size_t next = GetNextFromNextWithShift(h.head_next_with_shift.LoadRelaxed()); assert(!was_pointed_to[next]); was_pointed_to[next] = true; } #endif } #ifndef NDEBUG // Extra invariant checking // This check is not perfect, but should detect most reasonable cases // of abandonned or floating entries, etc. (A floating cycle would not // be reported as bad.) for (size_t i = 0; i < used_end; i++) { if (was_populated[i]) { assert(was_pointed_to[i]); } else { assert(!was_pointed_to[i]); } } #endif // Metadata charging only follows the published table size assert(usage_.LoadRelaxed() == 0 || usage_.LoadRelaxed() == GetTableSize() * sizeof(HandleImpl)); assert(occupancy_.LoadRelaxed() == 0); } size_t AutoHyperClockTable::GetTableSize() const { return LengthInfoToUsedLength(length_info_.Load()); } size_t AutoHyperClockTable::GetOccupancyLimit() const { return occupancy_limit_.LoadRelaxed(); } void AutoHyperClockTable::StartInsert(InsertState& state) { state.saved_length_info = length_info_.Load(); } // Because we have linked lists, bugs or even hardware errors can make it // possible to create a cycle, which would lead to infinite loop. // Furthermore, when we have retry cases in the code, we want to be sure // these are not (and do not become) spin-wait loops. Given the assumption // of quality hashing and the infeasibility of consistently recurring // concurrent modifications to an entry or chain, we can safely bound the // number of loop iterations in feasible operation, whether following chain // pointers or retrying with some backtracking. A smaller limit is used for // stress testing, to detect potential issues such as cycles or spin-waits, // and a larger limit is used to break cycles should they occur in production. 
#define CHECK_TOO_MANY_ITERATIONS(i) \ { \ assert(i < 768); \ if (UNLIKELY(i >= 4096)) { \ std::terminate(); \ } \ } bool AutoHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState& state) { // new_occupancy has taken into account other threads that are also trying // to insert, so as soon as we see sufficient *published* usable size, we // can declare success even if we aren't the one that grows the table. // However, there's an awkward state where other threads own growing the // table to sufficient usable size, but the udpated size is not yet // published. If we wait, then that likely slows the ramp-up cache // performance. If we unblock ourselves by ensuring we grow by at least one // slot, we could technically overshoot required size by number of parallel // threads accessing block cache. On balance considering typical cases and // the modest consequences of table being slightly too large, the latter // seems preferable. // // So if the published occupancy limit is too small, we unblock ourselves // by committing to growing the table by at least one slot. Also note that // we might need to grow more than once to actually increase the occupancy // limit (due to max load factor < 1.0) while (UNLIKELY(new_occupancy > occupancy_limit_.LoadRelaxed())) { // At this point we commit the thread to growing unless we've reached the // limit (returns false). if (!Grow(state)) { return false; } } // Success (didn't need to grow, or did successfully) return true; } bool AutoHyperClockTable::Grow(InsertState& state) { // Allocate the next grow slot size_t grow_home = grow_frontier_.FetchAddRelaxed(1); if (grow_home >= array_.Count()) { // Can't grow any more. // (Tested by unit test ClockCacheTest/Limits) // Make sure we don't overflow grow_frontier_ by reaching here repeatedly grow_frontier_.StoreRelaxed(array_.Count()); return false; } #ifdef COERCE_CONTEXT_SWITCH // This is useful in reproducing concurrency issues in Grow() while (Random::GetTLSInstance()->OneIn(2)) { std::this_thread::yield(); } #endif // Basically, to implement https://en.wikipedia.org/wiki/Linear_hashing // entries that belong in a new chain starting at grow_home will be // split off from the chain starting at old_home, which is computed here. int old_shift = FloorLog2(grow_home); size_t old_home = BottomNBits(grow_home, old_shift); assert(old_home + (size_t{1} << old_shift) == grow_home); // Wait here to ensure any Grow operations that would directly feed into // this one are finished, though the full waiting actually completes in // acquiring the rewrite lock for old_home in SplitForGrow. Here we ensure // the expected shift amount has been reached, and there we ensure the // chain rewrite lock has been released. size_t old_old_home = BottomNBits(grow_home, old_shift - 1); for (;;) { uint64_t old_old_head = array_[old_old_home].head_next_with_shift.Load(); if (GetShiftFromNextWithShift(old_old_head) >= old_shift) { if ((old_old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) { break; } } // NOTE: one of the few yield-wait loops, which is rare enough in practice // for its performance to be insignificant. yield_count_.FetchAddRelaxed(1); std::this_thread::yield(); } // Do the dirty work of splitting the chain, including updating heads and // chain nexts for new shift amounts. SplitForGrow(grow_home, old_home, old_shift); // length_info_ can be updated any time after the new shift amount is // published to both heads, potentially before the end of SplitForGrow. 
// But we also can't update length_info_ until the previous Grow operation // (with grow_home := this grow_home - 1) has published the new shift amount // to both of its heads. However, we don't want to artificially wait here // on that Grow that is otherwise irrelevant. // // We could have each Grow operation advance length_info_ here as far as it // can without waiting, by checking for updated shift on the corresponding // old home and also stopping at an empty head value for possible grow_home. // However, this could increase CPU cache line sharing and in 1/64 cases // bring in an extra page from our mmap. // // Instead, part of the strategy is delegated to DoInsert(): // * Here we try to bring length_info_ up to date with this grow_home as // much as we can without waiting. It will fall short if a previous Grow // is still between reserving the grow slot and making the first big step // to publish the new shift amount. // * To avoid length_info_ being perpetually out-of-date (for a small number // of heads) after our last Grow, we do the same when Insert has to "fall // forward" due to length_info_ being out-of-date. CatchUpLengthInfoNoWait(grow_home); // See usage in DoInsert() state.likely_empty_slot = grow_home; // Success return true; } // See call in Grow() void AutoHyperClockTable::CatchUpLengthInfoNoWait( size_t known_usable_grow_home) { uint64_t current_length_info = length_info_.Load(); size_t published_usable_size = LengthInfoToUsedLength(current_length_info); while (published_usable_size <= known_usable_grow_home) { // For when published_usable_size was grow_home size_t next_usable_size = published_usable_size + 1; uint64_t next_length_info = UsedLengthToLengthInfo(next_usable_size); // known_usable_grow_home is known to be ready for Lookup/Insert with // the new shift amount, but between that and published usable size, we // need to check. if (published_usable_size < known_usable_grow_home) { int old_shift = FloorLog2(next_usable_size - 1); size_t old_home = BottomNBits(published_usable_size, old_shift); int shift = GetShiftFromNextWithShift( array_[old_home].head_next_with_shift.Load()); if (shift <= old_shift) { // Not ready break; } } // CAS update length_info_. This only moves in one direction, so if CAS // fails, someone else made progress like we are trying, and we can just // pick up the new value and keep going as appropriate. if (length_info_.CasStrong(current_length_info, next_length_info)) { current_length_info = next_length_info; // Update usage_ if metadata charge policy calls for it if (metadata_charge_policy_ == CacheMetadataChargePolicy::kFullChargeCacheMetadata) { // NOTE: ignoring page boundaries for simplicity usage_.FetchAddRelaxed(sizeof(HandleImpl)); } } published_usable_size = LengthInfoToUsedLength(current_length_info); } // After updating lengh_info_ we can update occupancy_limit_, // allowing for later operations to update it before us. 
  // Note: there is no AcqRelAtomic max operation, so we have to use a CAS loop
  size_t old_occupancy_limit = occupancy_limit_.LoadRelaxed();
  size_t new_occupancy_limit = CalcOccupancyLimit(published_usable_size);
  while (old_occupancy_limit < new_occupancy_limit) {
    if (occupancy_limit_.CasWeakRelaxed(old_occupancy_limit,
                                        new_occupancy_limit)) {
      break;
    }
  }
}

void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
                                       int old_shift) {
  int new_shift = old_shift + 1;
  HandleImpl* const arr = array_.Get();

  // We implement a somewhat complicated splitting algorithm to ensure that
  // entries are always wait-free visible to Lookup, without Lookup needing
  // to double-check length_info_ to ensure every potentially relevant
  // existing entry is seen. This works step-by-step, carefully sharing
  // unmigrated parts of the chain between the source chain and the new
  // destination chain. This means that Lookup might see a partially migrated
  // chain so has to take that into consideration when checking that it hasn't
  // "jumped off" its intended chain (due to a parallel modification to an
  // "under (de)construction" entry that was found on the chain but has
  // been reassigned).
  //
  // We use a "rewrite lock" on the source and destination chains to exclude
  // removals from those, and we have a prior waiting step that ensures any
  // Grow operations feeding into this one have completed. But this process
  // does have to gracefully handle concurrent insertions to the head of the
  // source chain, and once marked ready, the destination chain.
  //
  // With those considerations, the migration starts with one "big step,"
  // potentially with retries to deal with insertions in parallel. Part of the
  // big step is to mark the two chain heads as updated with the new shift
  // amount, which redirects Lookups to the appropriate new chain.
  //
  // After that big step that updates the heads, the rewrite lock makes it
  // relatively easy to deal with the rest of the migration. Big
  // simplifications come from being able to read the hashed_key of each
  // entry on the chain without needing to hold a read reference, and
  // from never "jumping off" to another chain. Concurrent insertions only
  // happen at the chain head, which is outside of what is left to migrate.
  //
  // A series of smaller steps finishes splitting apart the existing chain into
  // two distinct chains, followed by some steps to fully commit the result.
  //
  // Except for trivial cases in which all entries (or remaining entries)
  // on the input chain go to one output chain, there is an important invariant
  // after each step of migration, including after the initial "big step":
  // For each output chain, the "zero chain" (new hash bit is zero) and the
  // "one chain" (new hash bit is one), we have a "frontier" entry marking the
  // boundary between what has been migrated and what has not. One of the
  // frontiers is along the old chain after the other, and all entries between
  // them are for the same target chain as the earlier frontier. Thus, the
  // chains share linked list tails starting at the latter frontier. All
  // pointers from the new head locations to the frontier entries are marked
  // with the new shift amount, while all pointers after the frontiers use the
  // old shift amount.
  //
  // And after each step there is a strengthening step to reach a stronger
  // invariant: the frontier earlier in the original chain is advanced to be
  // immediately before the other frontier.
  //
  // Consider this original input chain,
  //
  // OldHome -Old-> A0 -Old-> B0 -Old-> A1 -Old-> C0 -Old-> OldHome(End)
  // GrowHome (empty)
  //
  // == BIG STEP ==
  // The initial big step finds the first entry that will be on each
  // output chain (in this case A0 and A1). We use brackets ([]) to mark them
  // as our prospective frontiers.
  //
  // OldHome -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome (empty)
  //
  // Next we speculatively update grow_home head to point to the first entry
  // for the one chain. This will not be used by Lookup until the head at
  // old_home uses the new shift amount.
  //
  // OldHome -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome --------------New------------/
  //
  // Observe that if Lookup were to use the new head at GrowHome, it would be
  // able to find all relevant entries. Finishing the initial big step
  // requires a CAS (compare_exchange) of the OldHome head because there
  // might have been parallel insertions there, in which case we roll back
  // and try again. (We might need to point GrowHome head differently.)
  //
  // OldHome -New-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome --------------New------------/
  //
  // Upgrading the OldHome head pointer with the new shift amount, with a
  // compare_exchange, completes the initial big step, with [A0] as zero
  // chain frontier and [A1] as one chain frontier. Links before the frontiers
  // use the new shift amount and links after use the old shift amount.
  // == END BIG STEP ==
  // == STRENGTHENING ==
  // Zero chain frontier is advanced to [B0] (immediately before other
  // frontier) by updating pointers with new shift amounts.
  //
  // OldHome -New-> A0 -New-> [B0] -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome -------------New-----------/
  //
  // == END STRENGTHENING ==
  // == SMALL STEP #1 ==
  // From the strong invariant state, we need to find the next entry for
  // the new chain with the earlier frontier. In this case, we need to find
  // the next entry for the zero chain that comes after [B0], which in this
  // case is C0. This will be our next zero chain frontier, at least under
  // the weak invariant. To get there, we simply update the link between
  // the current two frontiers to skip over the entries irrelevant to the
  // earlier frontier chain. In this case, the zero chain skips over A1. As a
  // result, the other chain is now the "earlier."
  //
  // OldHome -New-> A0 -New-> B0 -New-> [C0] -Old-> OldHome(End)
  // GrowHome -New-> [A1] ------Old-----/
  //
  // == END SMALL STEP #1 ==
  //
  // Repeating the cycle and handling the end are not as interesting.

  // Acquire rewrite lock on zero chain (if it's non-empty)
  ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_);

  // Used for locking the one chain below
  uint64_t saved_one_head;
  // One head has not been written to
  assert(arr[grow_home].head_next_with_shift.Load() == 0);

  // old_home will also be the head of the new "zero chain" -- all entries in
  // the "from" chain whose next hash bit is 0. grow_home will be head of the
  // new "one chain".
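  // Illustrative example of the home test used in the loops below
  // (hypothetical values): with old_shift = 3, old_home = 0b0101 = 5, the
  // corresponding grow_home is 5 + (1 << 3) = 0b1101 = 13 and new_shift = 4.
  // An entry with BottomNBits(hashed_key[1], 4) == 5 (new bit is 0) belongs on
  // the zero chain headed at old_home, while one with
  // BottomNBits(hashed_key[1], 4) == 13 (new bit is 1) belongs on the one
  // chain headed at grow_home.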
// For these, SIZE_MAX is like nullptr (unknown) size_t zero_chain_frontier = SIZE_MAX; size_t one_chain_frontier = SIZE_MAX; size_t cur = SIZE_MAX; // Set to 0 (zero chain frontier earlier), 1 (one chain), or -1 (unknown) int chain_frontier_first = -1; // Might need to retry initial update of heads for (int i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); assert(zero_chain_frontier == SIZE_MAX); assert(one_chain_frontier == SIZE_MAX); assert(cur == SIZE_MAX); assert(chain_frontier_first == -1); uint64_t next_with_shift = zero_head_lock.GetSavedHead(); // Find a single representative for each target chain, or scan the whole // chain if some target chain has no representative. for (;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); // Loop invariants assert((chain_frontier_first < 0) == (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); // Check for end of original chain if (HandleImpl::IsEnd(next_with_shift)) { cur = SIZE_MAX; break; } // next_with_shift is not End cur = GetNextFromNextWithShift(next_with_shift); if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { // Entry for zero chain if (zero_chain_frontier == SIZE_MAX) { zero_chain_frontier = cur; if (one_chain_frontier != SIZE_MAX) { // Ready to update heads break; } // Nothing yet for one chain chain_frontier_first = 0; } } else { assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home); // Entry for one chain if (one_chain_frontier == SIZE_MAX) { one_chain_frontier = cur; if (zero_chain_frontier != SIZE_MAX) { // Ready to update heads break; } // Nothing yet for zero chain chain_frontier_first = 1; } } next_with_shift = arr[cur].chain_next_with_shift.Load(); } // Try to update heads for initial migration info // We only reached the end of the migrate-from chain already if one of the // target chains will be empty. assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX || one_chain_frontier == SIZE_MAX)); assert((chain_frontier_first < 0) == (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); // Always update one chain's head first (safe), and mark it as locked saved_one_head = HandleImpl::kHeadLocked | (one_chain_frontier != SIZE_MAX ? MakeNextWithShift(one_chain_frontier, new_shift) : MakeNextWithShiftEnd(grow_home, new_shift)); arr[grow_home].head_next_with_shift.Store(saved_one_head); // Make sure length_info_ hasn't been updated too early, as we're about // to make the change that makes it safe to update (e.g. in DoInsert()) assert(LengthInfoToUsedLength(length_info_.Load()) <= grow_home); // Try to set zero's head. if (zero_head_lock.CasUpdate( zero_chain_frontier != SIZE_MAX ? MakeNextWithShift(zero_chain_frontier, new_shift) : MakeNextWithShiftEnd(old_home, new_shift), yield_count_)) { // Both heads successfully updated to new shift break; } else { // Concurrent insertion. This should not happen too many times. CHECK_TOO_MANY_ITERATIONS(i); // The easiest solution is to restart. zero_chain_frontier = SIZE_MAX; one_chain_frontier = SIZE_MAX; cur = SIZE_MAX; chain_frontier_first = -1; continue; } } // Create an RAII wrapper for the one chain rewrite lock we are already // holding (if was not end) and is now "published" after successful CAS on // zero chain head. 
ChainRewriteLock one_head_lock(&arr[grow_home], yield_count_, saved_one_head); // Except for trivial cases, we have something like // AHome -New-> [A0] -Old-> [B0] -Old-> [C0] \ | // BHome --------------------New------------> [A1] -Old-> ... // And we need to upgrade as much as we can on the "first" chain // (the one eventually pointing to the other's frontier). This will // also finish off any case in which one of the target chains will be empty. if (chain_frontier_first >= 0) { size_t& first_frontier = chain_frontier_first == 0 ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; size_t& other_frontier = chain_frontier_first != 0 ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; uint64_t stop_before_or_new_tail = other_frontier != SIZE_MAX ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift) : /*new tail*/ MakeNextWithShiftEnd( chain_frontier_first == 0 ? old_home : grow_home, new_shift); UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail, old_shift, new_shift); } if (zero_chain_frontier == SIZE_MAX) { // Already finished migrating assert(one_chain_frontier == SIZE_MAX); assert(cur == SIZE_MAX); } else { // Still need to migrate between two target chains for (int i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); // Overall loop invariants assert(zero_chain_frontier != SIZE_MAX); assert(one_chain_frontier != SIZE_MAX); assert(cur != SIZE_MAX); assert(chain_frontier_first >= 0); size_t& first_frontier = chain_frontier_first == 0 ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; size_t& other_frontier = chain_frontier_first != 0 ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; assert(cur != first_frontier); assert(GetNextFromNextWithShift( arr[first_frontier].chain_next_with_shift.Load()) == other_frontier); uint64_t next_with_shift = arr[cur].chain_next_with_shift.Load(); // Check for end of original chain if (HandleImpl::IsEnd(next_with_shift)) { // Can set upgraded tail on first chain uint64_t first_new_tail = MakeNextWithShiftEnd( chain_frontier_first == 0 ? old_home : grow_home, new_shift); arr[first_frontier].chain_next_with_shift.Store(first_new_tail); // And upgrade remainder of other chain uint64_t other_new_tail = MakeNextWithShiftEnd( chain_frontier_first != 0 ? 
old_home : grow_home, new_shift); UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift, new_shift); assert(other_frontier == SIZE_MAX); // Finished break; } // next_with_shift is not End cur = GetNextFromNextWithShift(next_with_shift); int target_chain; if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { // Entry for zero chain target_chain = 0; } else { assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home); // Entry for one chain target_chain = 1; } if (target_chain == chain_frontier_first) { // Found next entry to skip to on the first chain uint64_t skip_to = MakeNextWithShift(cur, new_shift); arr[first_frontier].chain_next_with_shift.Store(skip_to); first_frontier = cur; // Upgrade other chain up to entry before that one UpgradeShiftsOnRange(arr, other_frontier, next_with_shift, old_shift, new_shift); // Swap which is marked as first chain_frontier_first = 1 - chain_frontier_first; } else { // Nothing to do yet, as we need to keep old generation pointers in // place for lookups } } } } // Variant of PurgeImplLocked: Removes all "under (de) construction" entries // from a chain where already holding a rewrite lock using PurgeLockedOpData = void; // Variant of PurgeImplLocked: Clock-updates all entries in a chain, in // addition to functionality of PurgeLocked, where already holding a rewrite // lock. (Caller finalizes eviction on entries added to the autovector, in part // so that we don't hold the rewrite lock while doing potentially expensive // callback and allocator free.) using ClockUpdateChainLockedOpData = autovector; template void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock, size_t home, BaseClockTable::EvictionData* data) { constexpr bool kIsPurge = std::is_same_v; constexpr bool kIsClockUpdateChain = std::is_same_v; // Exactly one op specified static_assert(kIsPurge + kIsClockUpdateChain == 1); HandleImpl* const arr = array_.Get(); uint64_t next_with_shift = rewrite_lock.GetSavedHead(); assert(!HandleImpl::IsEnd(next_with_shift)); int home_shift = GetShiftFromNextWithShift(next_with_shift); (void)home; (void)home_shift; size_t next = GetNextFromNextWithShift(next_with_shift); assert(next < array_.Count()); HandleImpl* h = &arr[next]; HandleImpl* prev_to_keep = nullptr; #ifndef NDEBUG uint64_t prev_to_keep_next_with_shift = 0; #endif // Whether there are entries between h and prev_to_keep that should be // purged from the chain. bool pending_purge = false; // Walk the chain, and stitch together any entries that are still // "shareable," possibly after clock update. prev_to_keep tells us where // the last "stitch back to" location is (nullptr => head). for (size_t i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); bool purgeable = false; // In last iteration, h will be nullptr, to stitch together the tail of // the chain. if (h) { // NOTE: holding a rewrite lock on the chain prevents any "under // (de)construction" entries in the chain from being marked empty, which // allows us to access the hashed_keys without holding a read ref. 
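      // Illustrative sketch of the stitching this loop performs (hypothetical
      // chain, not from the original comments): given
      //   Head -> A -> X -> Y -> B -> End
      // where X and Y are "under (de)construction" (purgeable) and A and B are
      // shareable, prev_to_keep settles on A, pending_purge is set while
      // passing X and Y, and when B is found shareable, A's
      // chain_next_with_shift is rewritten to skip directly to B:
      //   Head -> A -> B -> End
      // If the purgeable run starts at the head (prev_to_keep == nullptr), the
      // head itself is updated via rewrite_lock.CasUpdate() instead.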
assert(home == BottomNBits(h->hashed_key[1], home_shift)); if constexpr (kIsClockUpdateChain) { // Clock update and/or check for purgeable (under (de)construction) if (ClockUpdate(*h, data, &purgeable)) { // Remember for finishing eviction op_data->push_back(h); // Entries for eviction become purgeable purgeable = true; assert((h->meta.Load() >> ClockHandle::kStateShift) == ClockHandle::kStateConstruction); } } else { (void)op_data; (void)data; purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) == 0; } } if (purgeable) { assert((h->meta.Load() >> ClockHandle::kStateShift) == ClockHandle::kStateConstruction); pending_purge = true; } else if (pending_purge) { if (prev_to_keep) { // Update chain next to skip purgeable entries assert(prev_to_keep->chain_next_with_shift.Load() == prev_to_keep_next_with_shift); prev_to_keep->chain_next_with_shift.Store(next_with_shift); } else if (rewrite_lock.CasUpdate(next_with_shift, yield_count_)) { // Managed to update head without any parallel insertions } else { // Parallel insertion must have interfered. Need to do a purge // from updated head to here. Since we have no prev_to_keep, there's // no risk of duplicate clock updates to entries. Any entries already // updated must have been evicted (purgeable) and it's OK to clock // update any new entries just inserted in parallel. // Can simply restart (GetSavedHead() already updated from CAS failure). next_with_shift = rewrite_lock.GetSavedHead(); assert(!HandleImpl::IsEnd(next_with_shift)); next = GetNextFromNextWithShift(next_with_shift); assert(next < array_.Count()); h = &arr[next]; pending_purge = false; assert(prev_to_keep == nullptr); assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); continue; } pending_purge = false; prev_to_keep = h; } else { prev_to_keep = h; } if (h == nullptr) { // Reached end of the chain return; } // Read chain pointer next_with_shift = h->chain_next_with_shift.Load(); #ifndef NDEBUG if (prev_to_keep == h) { prev_to_keep_next_with_shift = next_with_shift; } #endif assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); // Check for end marker if (HandleImpl::IsEnd(next_with_shift)) { h = nullptr; } else { next = GetNextFromNextWithShift(next_with_shift); assert(next < array_.Count()); h = &arr[next]; assert(h != prev_to_keep); } } } // Variant of PurgeImpl: Removes all "under (de) construction" entries in a // chain, such that any entry with the given key must have been purged. using PurgeOpData = const UniqueId64x2; // Variant of PurgeImpl: Clock-updates all entries in a chain, in addition to // purging as appropriate. (Caller finalizes eviction on entries added to the // autovector, in part so that we don't hold the rewrite lock while doing // potentially expensive callback and allocator free.) using ClockUpdateChainOpData = ClockUpdateChainLockedOpData; template void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home, BaseClockTable::EvictionData* data) { // Early efforts to make AutoHCC fully wait-free ran into too many problems // that needed obscure and potentially inefficient work-arounds to have a // chance at working. // // The implementation settled on "essentially wait-free" which can be // achieved by locking at the level of each probing chain and only for // operations that might remove entries from the chain. Because parallel // clock updates and Grow operations are ordered, contention is very rare. 
// However, parallel insertions at any chain head have to be accommodated // to keep them wait-free. // // This function implements Purge and ClockUpdateChain functions (see above // OpData type definitions) as part of higher-level operations. This function // ensures the correct chain is (eventually) covered and handles rewrite // locking the chain. PurgeImplLocked has lower level details. // // In general, these operations and Grow are kept simpler by allowing eager // purging of under (de-)construction entries. For example, an Erase // operation might find that another thread has purged the entry from the // chain by the time its own purge operation acquires the rewrite lock and // proceeds. This is OK, and potentially reduces the number of lock/unlock // cycles because empty chains are not rewrite-lockable. constexpr bool kIsPurge = std::is_same_v; constexpr bool kIsClockUpdateChain = std::is_same_v; // Exactly one op specified static_assert(kIsPurge + kIsClockUpdateChain == 1); int home_shift = 0; if constexpr (kIsPurge) { // Purge callers leave home unspecified, to be determined from key assert(home == SIZE_MAX); GetHomeIndexAndShift(length_info_.Load(), (*op_data)[1], &home, &home_shift); assert(home_shift > 0); } else { assert(kIsClockUpdateChain); // Evict callers must specify home assert(home < SIZE_MAX); } HandleImpl* const arr = array_.Get(); // Acquire the RAII rewrite lock (if not an empty chain) ChainRewriteLock rewrite_lock(&arr[home], yield_count_); if constexpr (kIsPurge) { // Ensure we are at the correct home for the shift in effect for the // chain head. for (;;) { int shift = GetShiftFromNextWithShift(rewrite_lock.GetSavedHead()); if (shift > home_shift) { // Found a newer shift at candidate head, which must apply to us. // Newer shift might not yet be reflected in length_info_ (an atomicity // gap in Grow), so operate as if it is. Note that other insertions // could happen using this shift before length_info_ is updated, and // it's possible (though unlikely) that multiple generations of Grow // have occurred. If shift is more than one generation ahead of // home_shift, it's possible that not all descendent homes have // reached the `shift` generation. Thus, we need to advance only one // shift at a time looking for a home+head with a matching shift // amount. home_shift++; home = GetHomeIndex((*op_data)[1], home_shift); rewrite_lock.Reset(&arr[home], yield_count_); continue; } else { assert(shift == home_shift); } break; } } // If the chain is empty, nothing to do if (!rewrite_lock.IsEnd()) { if constexpr (kIsPurge) { PurgeLockedOpData* locked_op_data{}; PurgeImplLocked(locked_op_data, rewrite_lock, home, data); } else { PurgeImplLocked(op_data, rewrite_lock, home, data); } } } AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert( const ClockHandleBasicData& proto, uint64_t initial_countdown, bool take_ref, InsertState& state) { size_t home; int orig_home_shift; GetHomeIndexAndShift(state.saved_length_info, proto.hashed_key[1], &home, &orig_home_shift); HandleImpl* const arr = array_.Get(); // We could go searching through the chain for any duplicate, but that's // not typically helpful, except for the REDUNDANT block cache stats. // (Inferior duplicates will age out with eviction.) However, we do skip // insertion if the home slot (or some other we happen to probe) already // has a match (already_matches below). This helps to keep better locality // when we can. 
// // And we can do that as part of searching for an available slot to // insert the new entry, because our preferred location and first slot // checked will be the home slot. // // As the table initially grows to size, few entries will be in the same // cache line as the chain head. However, churn in the cache relatively // quickly improves the proportion of entries sharing that cache line with // the chain head. Data: // // Initial population only: (cache_bench with -ops_per_thread=1) // Entries at home count: 29,202 (out of 129,170 entries in 94,411 chains) // Approximate average cache lines read to find an existing entry: // 129.2 / 94.4 [without the heads] // + (94.4 - 29.2) / 94.4 [the heads not included with entries] // = 2.06 cache lines // // After 10 million ops: (-threads=10 -ops_per_thread=100000) // Entries at home count: 67,556 (out of 129,359 entries in 94,756 chains) // That's a majority of entries and more than 2/3rds of chains. // Approximate average cache lines read to find an existing entry: // = 1.65 cache lines // Even if we aren't saving a ref to this entry (take_ref == false), we need // to keep a reference while we are inserting the entry into a chain, so that // it is not erased by another thread while trying to insert it on the chain. constexpr bool initial_take_ref = true; size_t used_length = LengthInfoToUsedLength(state.saved_length_info); assert(home < used_length); size_t idx = home; bool already_matches = false; bool already_matches_ignore = false; if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, &already_matches)) { assert(idx == home); } else if (already_matches) { return nullptr; // Here we try to populate newly-opened slots in the table, but not // when we can add something to its home slot. This makes the structure // more performant more quickly on (initial) growth. We ignore "already // matches" in this case because it is unlikely and difficult to // incorporate logic for here cleanly and efficiently. } else if (UNLIKELY(state.likely_empty_slot > 0) && TryInsert(proto, arr[state.likely_empty_slot], initial_countdown, initial_take_ref, &already_matches_ignore)) { idx = state.likely_empty_slot; } else { // We need to search for an available slot outside of the home. // Linear hashing provides nice resizing but does typically mean // that some heads (home locations) have (in expectation) twice as // many entries mapped to them as other heads. For example if the // usable length is 80, then heads 16-63 are (in expectation) twice // as loaded as heads 0-15 and 64-79, which are using another hash bit. // // This means that if we just use linear probing (by a small constant) // to find an available slot, part of the structure could easily fill up // and resort to linear time operations even when the overall load factor // is only modestly high, like 70%. Even though each slot has its own CPU // cache line, there appears to be a small locality benefit (e.g. TLB and // paging) to iterating one by one, as long as we don't afoul of the // linear hashing imbalance. // // In a traditional non-concurrent structure, we could keep a "free list" // to ensure immediate access to an available slot, but maintaining such // a structure could require more cross-thread coordination to ensure // all entries are eventually available to all threads. 
// // The way we solve this problem is to use unit-increment linear probing // with a small bound, and then fall back on big jumps to have a good // chance of finding a slot in an under-populated region quickly if that // doesn't work. size_t i = 0; constexpr size_t kMaxLinearProbe = 4; for (; i < kMaxLinearProbe; i++) { idx++; if (idx >= used_length) { idx -= used_length; } if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, &already_matches)) { break; } if (already_matches) { return nullptr; } } if (i == kMaxLinearProbe) { // Keep searching, but change to a search method that should quickly // find any under-populated region. Switching to an increment based // on the golden ratio helps with that, but we also inject some minor // variation (less than 2%, 1 in 2^6) to avoid clustering effects on // this larger increment (if it were a fixed value in steady state // operation). Here we are primarily using upper bits of hashed_key[1] // while home is based on lowest bits. uint64_t incr_ratio = 0x9E3779B185EBCA87U + (proto.hashed_key[1] >> 6); size_t incr = FastRange64(incr_ratio, used_length); assert(incr > 0); size_t start = idx; for (;; i++) { idx += incr; if (idx >= used_length) { // Wrap around (faster than %) idx -= used_length; } if (idx == start) { // We have just completed a cycle that might not have covered all // slots. (incr and used_length could have common factors.) // Increment for the next cycle, which eventually ensures complete // iteration over the set of slots before repeating. idx++; if (idx >= used_length) { idx -= used_length; } start++; if (start >= used_length) { start -= used_length; } if (i >= used_length) { used_length = LengthInfoToUsedLength(length_info_.Load()); if (i >= used_length * 2) { // Cycling back should not happen unless there is enough random // churn in parallel that we happen to hit each slot at a time // that it's occupied, which is really only feasible for small // structures, though with linear probing to find empty slots, // "small" here might be larger than for double hashing. assert(used_length <= 256); // Fall back on standalone insert in case something goes awry to // cause this return nullptr; } } } if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, &already_matches)) { break; } if (already_matches) { return nullptr; } } } } // Now insert into chain using head pointer uint64_t next_with_shift; int home_shift = orig_home_shift; // Might need to retry for (int i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); next_with_shift = arr[home].head_next_with_shift.Load(); int shift = GetShiftFromNextWithShift(next_with_shift); if (UNLIKELY(shift != home_shift)) { // NOTE: shift increases with table growth if (shift > home_shift) { // Must be grow in progress or completed since reading length_info. // Pull out one more hash bit. (See Lookup() for why we can't // safely jump to the shift that was read.) 
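        // Illustrative example (hypothetical values): if home == 5 with
        // home_shift == 3 but the head already shows shift 4, then below
        // hash_bit_mask becomes 1 << 3 == 8, the corresponding grow home is
        // 5 + 8 == 13, and this insert's home becomes 13 if bit 3 of
        // proto.hashed_key[1] is set, else stays 5.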
home_shift++; uint64_t hash_bit_mask = uint64_t{1} << (home_shift - 1); assert((home & hash_bit_mask) == 0); // BEGIN leftover updates to length_info_ for Grow() size_t grow_home = home + hash_bit_mask; assert(arr[grow_home].head_next_with_shift.Load() != HandleImpl::kUnusedMarker); CatchUpLengthInfoNoWait(grow_home); // END leftover updates to length_info_ for Grow() home += proto.hashed_key[1] & hash_bit_mask; continue; } else { // Should not happen because length_info_ is only updated after both // old and new home heads are marked with new shift assert(false); } } // Values to update to uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift); uint64_t chain_next_with_shift = next_with_shift; // Preserve the locked state in head, without propagating to chain next // where it is meaningless (and not allowed) if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked)) { head_next_with_shift |= HandleImpl::kHeadLocked; chain_next_with_shift &= ~HandleImpl::kHeadLocked; } arr[idx].chain_next_with_shift.Store(chain_next_with_shift); if (arr[home].head_next_with_shift.CasWeak(next_with_shift, head_next_with_shift)) { // Success if (!take_ref) { Unref(arr[idx]); } return arr + idx; } } } AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( const UniqueId64x2& hashed_key) { // Lookups are wait-free with low occurrence of retries, back-tracking, // and fallback. We do not have the benefit of holding a rewrite lock on // the chain so must be prepared for many kinds of mayhem, most notably // "falling off our chain" where a slot that Lookup has identified but // has not read-referenced is removed from one chain and inserted into // another. The full algorithm uses the following mitigation strategies to // ensure every relevant entry inserted before this Lookup, and not yet // evicted, is seen by Lookup, without excessive backtracking etc.: // * Keep a known good read ref in the chain for "island hopping." When // we observe that a concurrent write takes us off to another chain, we // only need to fall back to our last known good read ref (most recent // entry on the chain that is not "under construction," which is a transient // state). We don't want to compound the CPU toil of a long chain with // operations that might need to retry from scratch, with probability // in proportion to chain length. // * Only detect a chain is potentially incomplete because of a Grow in // progress by looking at shift in the next pointer tags (rather than // re-checking length_info_). // * SplitForGrow, Insert, and PurgeImplLocked ensure that there are no // transient states that might cause this full Lookup algorithm to skip over // live entries. // Reading length_info_ is not strictly required for Lookup, if we were // to increment shift sizes until we see a shift size match on the // relevant head pointer. Thus, reading with relaxed memory order gives // us a safe and almost always up-to-date jump into finding the correct // home and head. size_t home; int home_shift; GetHomeIndexAndShift(length_info_.LoadRelaxed(), hashed_key[1], &home, &home_shift); assert(home_shift > 0); // The full Lookup algorithm however is not great for hot path efficiency, // because of the extra careful tracking described above. Overwhelmingly, // we can find what we're looking for with a naive linked list traversal // of the chain. Even if we "fall off our chain" to another, we don't // violate memory safety. We just won't match the key we're looking for. 
// And we would eventually reach an end state, possibly even experiencing a // cycle as an entry is freed and reused during our traversal (though at // any point in time the structure doesn't have cycles). // // So for hot path efficiency, we start with a naive Lookup attempt, and // then fall back on full Lookup if we don't find the correct entry. To // cap how much we invest into the naive Lookup, we simply cap the traversal // length before falling back. Also, when we do fall back on full Lookup, // we aren't paying much penalty by starting over. Much or most of the cost // of Lookup is memory latency in following the chain pointers, and the // naive Lookup has warmed the CPU cache for these entries, using as tight // of a loop as possible. HandleImpl* const arr = array_.Get(); uint64_t next_with_shift = arr[home].head_next_with_shift.LoadRelaxed(); for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) { HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)]; // Attempt cheap key match without acquiring a read ref. This could give a // false positive, which is re-checked after acquiring read ref, or false // negative, which is re-checked in the full Lookup. Also, this is a // technical UB data race according to TSAN, but we don't need to read // a "correct" value here for correct overall behavior. #ifdef __SANITIZE_THREAD__ bool probably_equal = Random::GetTLSInstance()->OneIn(2); #else bool probably_equal = h->hashed_key == hashed_key; #endif if (probably_equal) { // Increment acquire counter for definitive check uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check if it's a referencable (sharable) entry if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift))) { assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); if (LIKELY(h->hashed_key == hashed_key) && LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift))) { return h; } else { Unref(*h); } } else { // For non-sharable states, incrementing the acquire counter has no // effect so we don't need to undo it. Furthermore, we cannot safely // undo it because we did not acquire a read reference to lock the entry // in a Shareable state. } } next_with_shift = h->chain_next_with_shift.LoadRelaxed(); } // If we get here, falling back on full Lookup algorithm. HandleImpl* h = nullptr; HandleImpl* read_ref_on_chain = nullptr; for (size_t i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); // Read head or chain pointer next_with_shift = h ? h->chain_next_with_shift.Load() : arr[home].head_next_with_shift.Load(); int shift = GetShiftFromNextWithShift(next_with_shift); // Make sure it's usable size_t effective_home = home; if (UNLIKELY(shift != home_shift)) { // We have potentially gone awry somehow, but it's possible we're just // hitting old data that is not yet completed Grow. // NOTE: shift bits goes up with table growth. if (shift < home_shift) { // To avoid waiting on Grow in progress, an old shift amount needs // to be processed as if we were still using it and (potentially // different or the same) the old home. // We can assert it's not too old, because each generation of Grow // waits on its ancestor in the previous generation. assert(shift + 1 == home_shift); effective_home = GetHomeIndex(home, shift); } else if (h == read_ref_on_chain) { assert(shift > home_shift); // At head or coming from an entry on our chain where we're holding // a read reference. Thus, we know the newer shift applies to us. 
// Newer shift might not yet be reflected in length_info_ (an atomicity // gap in Grow), so operate as if it is. Note that other insertions // could happen using this shift before length_info_ is updated, and // it's possible (though unlikely) that multiple generations of Grow // have occurred. If shift is more than one generation ahead of // home_shift, it's possible that not all descendent homes have // reached the `shift` generation. Thus, we need to advance only one // shift at a time looking for a home+head with a matching shift // amount. home_shift++; // Update home in case it has changed home = GetHomeIndex(hashed_key[1], home_shift); // This should be rare enough occurrence that it's simplest just // to restart (TODO: improve in some cases?) h = nullptr; if (read_ref_on_chain) { Unref(*read_ref_on_chain); read_ref_on_chain = nullptr; } // Didn't make progress & retry continue; } else { assert(shift > home_shift); assert(h != nullptr); // An "under (de)construction" entry has a new shift amount, which // means we have either gotten off our chain or our home shift is out // of date. If we revert back to saved ref, we will get updated info. h = read_ref_on_chain; // Didn't make progress & retry continue; } } // Check for end marker if (HandleImpl::IsEnd(next_with_shift)) { // To ensure we didn't miss anything in the chain, the end marker must // point back to the correct home. if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) { // Complete, clean iteration of the chain, not found. // Clean up. if (read_ref_on_chain) { Unref(*read_ref_on_chain); } return nullptr; } else { // Something went awry. Revert back to a safe point (if we have it) h = read_ref_on_chain; // Didn't make progress & retry continue; } } // Follow the next and check for full key match, home match, or neither h = &arr[GetNextFromNextWithShift(next_with_shift)]; bool full_match_or_unknown = false; if (MatchAndRef(&hashed_key, *h, shift, effective_home, &full_match_or_unknown)) { // Got a read ref on next (h). // // There is a very small chance that between getting the next pointer // (now h) and doing MatchAndRef on it, another thread erased/evicted it // reinserted it into the same chain, causing us to cycle back in the // same chain and potentially see some entries again if we keep walking. // Newly-inserted entries are inserted before older ones, so we are at // least guaranteed not to miss anything. Here in Lookup, it's just a // transient, slight hiccup in performance. if (full_match_or_unknown) { // Full match. // Release old read ref on chain if applicable if (read_ref_on_chain) { // Pretend we never took the reference. Unref(*read_ref_on_chain); } // Update the hit bit if (eviction_callback_) { h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); } // All done. return h; } else if (UNLIKELY(shift != home_shift) && home != BottomNBits(h->hashed_key[1], home_shift)) { // This chain is in a Grow operation and we've landed on an entry // that belongs to the wrong destination chain. We can keep going, but // there's a chance we'll need to backtrack back *before* this entry, // if the Grow finishes before this Lookup. We cannot save this entry // for backtracking because it might soon or already be on the wrong // chain. // NOTE: if we simply backtrack rather than continuing, we would // be in a wait loop (not allowed in Lookup!) until the other thread // finishes its Grow. Unref(*h); } else { // Correct home location, so we are on the right chain. 
// With new usable read ref, can release old one (if applicable). if (read_ref_on_chain) { // Pretend we never took the reference. Unref(*read_ref_on_chain); } // And keep the new one. read_ref_on_chain = h; } } else { if (full_match_or_unknown) { // Must have been an "under construction" entry. Can safely skip it, // but there's a chance we'll have to backtrack later } else { // Home mismatch! Revert back to a safe point (if we have it) h = read_ref_on_chain; // Didn't make progress & retry } } } } void AutoHyperClockTable::Remove(HandleImpl* h) { assert((h->meta.Load() >> ClockHandle::kStateShift) == ClockHandle::kStateConstruction); const HandleImpl& c_h = *h; PurgeImpl(&c_h.hashed_key); } bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref, bool mark_invisible) { uint64_t meta; if (mark_invisible) { // Set invisible meta = h->meta.FetchAnd( ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift)); // To local variable also meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift); } else { meta = h->meta.Load(); } // Take ownership if no other refs do { if (GetRefcount(meta) != uint64_t{holding_ref}) { // Not last ref at some point in time during this call return false; } if ((meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift)) == 0) { // Someone else took ownership return false; } // Note that if !holding_ref, there's a small chance that we release, // another thread replaces this entry with another, reaches zero refs, and // then we end up erasing that other entry. That's an acceptable risk / // imprecision. } while (!h->meta.CasWeak(meta, uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift)); // Took ownership // TODO? Delay freeing? h->FreeData(allocator_); size_t total_charge = h->total_charge; if (UNLIKELY(h->IsStandalone())) { // Delete detached handle delete h; standalone_usage_.FetchSubRelaxed(total_charge); } else { Remove(h); MarkEmpty(*h); occupancy_.FetchSub(1U); } usage_.FetchSubRelaxed(total_charge); assert(usage_.LoadRelaxed() < SIZE_MAX / 2); return true; } bool AutoHyperClockTable::Release(HandleImpl* h, bool useful, bool erase_if_last_ref) { // In contrast with LRUCache's Release, this function won't delete the handle // when the cache is above capacity and the reference is the last one. Space // is only freed up by Evict/PurgeImpl (called by Insert when space // is needed) and Erase. We do this to avoid an extra atomic read of the // variable usage_. uint64_t old_meta; if (useful) { // Increment release counter to indicate was used old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h->meta); } else { // Decrement acquire counter to pretend it never happened old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); } assert((old_meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit); // No underflow assert(((old_meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask) != ((old_meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask)); if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == ClockHandle::kStateInvisible))) { // FIXME: There's a chance here that another thread could replace this // entry and we end up erasing the wrong one. 
    return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false);
  } else {
    return false;
  }
}

#ifndef NDEBUG
void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
  if (n > 0) {
    // Do n-1 simple releases first
    TEST_ReleaseNMinus1(h, n);

    // Then the last release might be more involved
    Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
  }
}
#endif

void AutoHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
  // Don't need to be efficient.
  // Might be one match masking another, so loop.
  while (HandleImpl* h = Lookup(hashed_key)) {
    bool gone =
        TryEraseHandle(h, /*holding_ref=*/true, /*mark_invisible=*/true);
    if (!gone) {
      // Only marked invisible, which is ok.
      // Pretend we never took the reference from Lookup.
      Unref(*h);
    }
  }
}

void AutoHyperClockTable::EraseUnRefEntries() {
  size_t usable_size = GetTableSize();
  for (size_t i = 0; i < usable_size; i++) {
    HandleImpl& h = array_[i];

    uint64_t old_meta = h.meta.LoadRelaxed();
    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                    << ClockHandle::kStateShift) &&
        GetRefcount(old_meta) == 0 &&
        h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction}
                                       << ClockHandle::kStateShift)) {
      // Took ownership
      h.FreeData(allocator_);
      usage_.FetchSubRelaxed(h.total_charge);
      // NOTE: could be more efficient with a dedicated variant of
      // PurgeImpl, but this is not a common operation
      Remove(&h);
      MarkEmpty(h);
      occupancy_.FetchSub(1U);
    }
  }
}

void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
                                EvictionData* data,
                                uint32_t eviction_effort_cap) {
  // precondition
  assert(requested_charge > 0);

  // We need the clock pointer to seamlessly "wrap around" at the end of the
  // table, and to be reasonably stable under Grow operations. This is
  // challenging when the linear hashing progressively opens additional
  // most-significant hash bits in determining home locations.

  // TODO: make a tuning parameter?
  // Up to 2x this number of homes will be evicted per step. In very rare
  // cases, possibly more, as homes of an out-of-date generation will be
  // resolved to multiple in a newer generation.
  constexpr size_t step_size = 4;

  // A clock_pointer_mask_ field separate from length_info_ enables us to use
  // the same mask (way of dividing up the space among evicting threads) for
  // iterating over the whole structure before considering changing the mask
  // at the beginning of each pass. This ensures we do not have a large portion
  // of the space that receives redundant or missed clock updates. However,
  // with two variables, for each update to clock_pointer_mask (< 64 ever in
  // the life of the cache), there will be a brief period where concurrent
  // eviction threads could use the old mask value, possibly causing redundant
  // or missed clock updates for a *small* portion of the table.
  size_t clock_pointer_mask = clock_pointer_mask_.LoadRelaxed();

  uint64_t max_clock_pointer = 0;  // unset

  // TODO: consider updating during a long eviction
  size_t used_length = LengthInfoToUsedLength(state.saved_length_info);

  autovector<HandleImpl*> to_finish_eviction;

  // Loop until enough freed, or limit reached (see bottom of loop)
  for (;;) {
    // First (concurrent) increment clock pointer
    uint64_t old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size);

    if (UNLIKELY((old_clock_pointer & clock_pointer_mask) == 0)) {
      // Back at the beginning. See if clock_pointer_mask should be updated.
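      // Illustrative example (values hypothetical): with a minimum shift of
      // 10, i.e. at least 1024 homes in use, mask is 0x3FF and major_step
      // below is 1024. One full cycle of the clock pointer through 0..1023
      // (in step_size increments, shared among evicting threads), combined
      // with the inner loops striding by major_step, visits every home below
      // used_length once per pass.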
      uint64_t mask = BottomNBits(
          UINT64_MAX, LengthInfoToMinShift(state.saved_length_info));
      if (clock_pointer_mask != mask) {
        clock_pointer_mask = static_cast<size_t>(mask);
        clock_pointer_mask_.StoreRelaxed(clock_pointer_mask);
      }
    }

    size_t major_step = clock_pointer_mask + 1;
    assert((major_step & clock_pointer_mask) == 0);

    for (size_t base_home = old_clock_pointer & clock_pointer_mask;
         base_home < used_length; base_home += major_step) {
      for (size_t i = 0; i < step_size; i++) {
        size_t home = base_home + i;
        if (home >= used_length) {
          break;
        }
        PurgeImpl(&to_finish_eviction, home, data);
      }
    }

    for (HandleImpl* h : to_finish_eviction) {
      TrackAndReleaseEvictedEntry(h);
      // NOTE: setting likely_empty_slot here can cause us to reduce the
      // portion of "at home" entries, probably because an evicted entry
      // is more likely to come back than a random new entry and would be
      // unable to go into its home slot.
    }
    to_finish_eviction.clear();

    // Loop exit conditions
    if (data->freed_charge >= requested_charge) {
      return;
    }

    if (max_clock_pointer == 0) {
      // Cap the eviction effort at this thread (along with those operating in
      // parallel) circling through the whole structure kMaxCountdown times.
      // In other words, this eviction run must find something/anything that is
      // unreferenced at start of and during the eviction run that isn't
      // reclaimed by a concurrent eviction run.
      // TODO: Does HyperClockCache need kMaxCountdown + 1?
      max_clock_pointer =
          old_clock_pointer +
          (uint64_t{ClockHandle::kMaxCountdown + 1} * major_step);
    }

    if (old_clock_pointer + step_size >= max_clock_pointer) {
      return;
    }

    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
      eviction_effort_exceeded_count_.FetchAddRelaxed(1);
      return;
    }
  }
}

size_t AutoHyperClockTable::CalcMaxUsableLength(
    size_t capacity, size_t min_avg_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
  double min_avg_slot_charge = min_avg_value_size * kMaxLoadFactor;
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    min_avg_slot_charge += sizeof(HandleImpl);
  }
  assert(min_avg_slot_charge > 0.0);
  size_t num_slots =
      static_cast<size_t>(capacity / min_avg_slot_charge + 0.999999);

  const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl);

  // Round up to page size
  return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page;
}

namespace {
bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) {
  return !AutoHyperClockTable::HandleImpl::IsEnd(
      h.head_next_with_shift.LoadRelaxed());
}
bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift,
                   size_t home) {
  if (MatchAndRef(nullptr, h, shift, home)) {
    Unref(h);
    return true;
  } else {
    return false;
  }
}
}  // namespace

void AutoHyperClockCache::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  BaseHyperClockCache::ReportProblems(info_log);

  if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
    LoadVarianceStats head_stats;
    size_t entry_at_home_count = 0;
    uint64_t yield_count = 0;
    this->ForEachShard([&](const Shard* shard) {
      size_t count = shard->GetTableAddressCount();
      uint64_t length_info = UsedLengthToLengthInfo(count);
      for (size_t i = 0; i < count; ++i) {
        const auto& h = *shard->GetTable().HandlePtr(i);
        head_stats.Add(IsHeadNonempty(h));
        int shift;
        size_t home;
        GetHomeIndexAndShift(length_info, i, &home, &shift);
        assert(home == i);
        entry_at_home_count += IsEntryAtHome(h, shift, home);
      }
      yield_count += shard->GetTable().GetYieldCount();
    });
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Head occupancy stats: %s", head_stats.Report().c_str());
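    // The remaining debug stats: how many entries currently sit in their home
    // slot (more entries at home generally means shorter chain walks), and
    // the table's yield count (see GetYieldCount), which roughly tracks how
    // often operations had to yield to concurrent updates.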
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Entries at home count: %zu", entry_at_home_count);
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Yield count: %" PRIu64, yield_count);
  }
}

}  // namespace clock_cache

// DEPRECATED (see public API)
std::shared_ptr<Cache> NewClockCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy) {
  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
                     /* high_pri_pool_ratio */ 0.5, nullptr,
                     kDefaultToAdaptiveMutex, metadata_charge_policy,
                     /* low_pri_pool_ratio */ 0.0);
}

std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
  // For sanitized options
  HyperClockCacheOptions opts = *this;
  if (opts.num_shard_bits >= 20) {
    return nullptr;  // The cache cannot be sharded into too many fine pieces.
  }
  if (opts.num_shard_bits < 0) {
    // Use larger shard size to reduce risk of large entries clustering
    // or skewing individual shards.
    constexpr size_t min_shard_size = 32U * 1024U * 1024U;
    opts.num_shard_bits =
        GetDefaultCacheShardBits(opts.capacity, min_shard_size);
  }
  std::shared_ptr<Cache> cache;
  if (opts.estimated_entry_charge == 0) {
    cache = std::make_shared<clock_cache::AutoHyperClockCache>(opts);
  } else {
    cache = std::make_shared<clock_cache::FixedHyperClockCache>(opts);
  }
  if (opts.secondary_cache) {
    cache = std::make_shared<CacheWithSecondaryAdapter>(cache,
                                                        opts.secondary_cache);
  }
  return cache;
}

}  // namespace ROCKSDB_NAMESPACE
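// Usage sketch (illustrative only; the function name and capacity value are
// examples, not part of this file): constructing a HyperClockCache through
// the public options API. Passing estimated_entry_charge == 0 selects the
// AutoHyperClockCache branch of MakeSharedCache() above.
//
//   #include "rocksdb/cache.h"
//
//   std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeExampleBlockCache() {
//     ROCKSDB_NAMESPACE::HyperClockCacheOptions opts(
//         /*capacity=*/512 << 20, /*estimated_entry_charge=*/0);
//     return opts.MakeSharedCache();
//   }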