rocksdb/cache/clock_cache.cc

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "cache/clock_cache.h"

#include <algorithm>
#include <atomic>
#include <bitset>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <exception>
#include <functional>
#include <numeric>
#include <string>
#include <thread>
#include <type_traits>

#include "cache/cache_key.h"
#include "cache/secondary_cache_adapter.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics_impl.h"
#include "port/lang.h"
#include "rocksdb/env.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

namespace clock_cache {

namespace {
inline uint64_t GetRefcount(uint64_t meta) {
  return ((meta >> ClockHandle::kAcquireCounterShift) -
          (meta >> ClockHandle::kReleaseCounterShift)) &
         ClockHandle::kCounterMask;
}

inline uint64_t GetInitialCountdown(Cache::Priority priority) {
  // Set initial clock data from priority
  // TODO: configuration parameters for priority handling and clock cycle
  // count?
  switch (priority) {
    case Cache::Priority::HIGH:
      return ClockHandle::kHighCountdown;
    default:
      assert(false);
      FALLTHROUGH_INTENDED;
    case Cache::Priority::LOW:
      return ClockHandle::kLowCountdown;
    case Cache::Priority::BOTTOM:
      return ClockHandle::kBottomCountdown;
  }
}

inline void MarkEmpty(ClockHandle& h) {
#ifndef NDEBUG
  // Mark slot as empty, with assertion
  uint64_t meta = h.meta.exchange(0, std::memory_order_release);
  assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
#else
  // Mark slot as empty
  h.meta.store(0, std::memory_order_release);
#endif
}

inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) {
  // NOTE: in theory there's more room for parallelism if we copy the handle
  // data and delay actions like this until after marking the entry as empty,
  // but performance tests only show a regression by copying the few words
  // of data.
  h.FreeData(allocator);

  MarkEmpty(h);
}

// Called to undo the effect of referencing an entry for internal purposes,
// so it should not be marked as having been used.
inline void Unref(const ClockHandle& h, uint64_t count = 1) {
  // Pretend we never took the reference
  // WART: there's a tiny chance we release last ref to invisible
  // entry here. If that happens, we let eviction take care of it.
  uint64_t old_meta = h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count,
                                       std::memory_order_release);
  assert(GetRefcount(old_meta) != 0);
  (void)old_meta;
}

inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) {
  uint64_t meta;
  if (purgeable) {
    assert(*purgeable == false);
    // In AutoHCC, our eviction process follows the chain structure, so we
    // should ensure that we see the latest state of each entry, at least for
    // assertion checking.
    meta = h.meta.load(std::memory_order_acquire);
  } else {
    // In FixedHCC, our eviction process is a simple iteration without regard
    // to probing order, displacements, etc., so it doesn't matter if we see
    // somewhat stale data.
    meta = h.meta.load(std::memory_order_relaxed);
  }

  if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) ==
      0) {
    // Only clock update Shareable entries
    if (purgeable) {
      *purgeable = true;
      // AutoHCC only: make sure we only attempt to update non-empty slots
      assert((meta >> ClockHandle::kStateShift) &
             ClockHandle::kStateOccupiedBit);
    }
    return false;
  }
  uint64_t acquire_count =
      (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
  uint64_t release_count =
      (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
  if (acquire_count != release_count) {
    // Only clock update entries with no outstanding refs
    return false;
  }
  if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
      acquire_count > 0) {
    // Decrement clock
    uint64_t new_count =
        std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
    // Compare-exchange in the decremented clock info, but
    // not aggressively
    uint64_t new_meta =
        (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
        (meta & ClockHandle::kHitBitMask) |
        (new_count << ClockHandle::kReleaseCounterShift) |
        (new_count << ClockHandle::kAcquireCounterShift);
    h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
    return false;
  }
  // Otherwise, remove entry (either unreferenced invisible or
  // unreferenced and expired visible).
  if (h.meta.compare_exchange_strong(meta,
                                     (uint64_t{ClockHandle::kStateConstruction}
                                      << ClockHandle::kStateShift) |
                                         (meta & ClockHandle::kHitBitMask),
                                     std::memory_order_acquire)) {
    // Took ownership.
    return true;
  } else {
    // Compare-exchange failing probably
    // indicates the entry was used, so skip it in that case.
    return false;
  }
}

// If an entry doesn't receive clock updates but is repeatedly referenced &
// released, the acquire and release counters could overflow without some
// intervention. This is that intervention, which should be inexpensive
// because it only incurs a simple, very predictable check. (Applying a bit
// mask in addition to an increment to every Release likely would be
// relatively expensive, because it's an extra atomic update.)
//
// We do have to assume that we never have many millions of simultaneous
// references to a cache handle, because we cannot represent so many
// references with the difference in counters, masked to the number of
// counter bits. Similarly, we assume there aren't millions of threads
// holding transient references (which might be "undone" rather than
// released by the way).
//
// Consider these possible states for each counter:
// low: less than kMaxCountdown
// medium: kMaxCountdown to half way to overflow + kMaxCountdown
// high: half way to overflow + kMaxCountdown, or greater
//
// And these possible states for the combination of counters:
// acquire / release
// -------   -------
// low       low       - Normal / common, with caveats (see below)
// medium    low       - Can happen while holding some refs
// high      low       - Violates assumptions (too many refs)
// low       medium    - Violates assumptions (refs underflow, etc.)
// medium    medium    - Normal (very read heavy cache)
// high      medium    - Can happen while holding some refs
// low       high      - This function is supposed to prevent
// medium    high      - Violates assumptions (refs underflow, etc.)
// high      high      - Needs CorrectNearOverflow
//
// Basically, this function detects (high, high) state (inferred from
// release alone being high) and bumps it back down to (medium, medium)
// state with the same refcount and the same logical countdown counter
// (everything > kMaxCountdown is logically the same). Note that bumping
// down to (low, low) would modify the countdown counter, so is "reserved"
// in a sense.
//
// If near-overflow correction is triggered here, there's no guarantee
// that another thread hasn't freed the entry and replaced it with another.
// Therefore, it must be the case that the correction does not affect
// entries unless they are very old (many millions of acquire-release cycles).
// (Our bit manipulation is indeed idempotent and only affects entries in
// exceptional cases.) We assume a pre-empted thread will not stall that long.
// If it did, the state could be corrupted in the (unlikely) case that the top
// bit of the acquire counter is set but not the release counter, and thus
// we only clear the top bit of the acquire counter on resumption. It would
// then appear that there are too many refs and the entry would be permanently
// pinned (which is not terrible for an exceptionally rare occurrence), unless
// it is referenced enough (at least kMaxCountdown more times) for the release
// counter to reach "high" state again and bumped back to "medium." (This
// motivates only checking for release counter in high state, not both in high
// state.)
inline void CorrectNearOverflow(uint64_t old_meta,
                                std::atomic<uint64_t>& meta) {
  // We clear both top-most counter bits at the same time.
  constexpr uint64_t kCounterTopBit = uint64_t{1}
                                      << (ClockHandle::kCounterNumBits - 1);
  constexpr uint64_t kClearBits =
      (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
      (kCounterTopBit << ClockHandle::kReleaseCounterShift);
  // A simple check that allows us to initiate clearing the top bits for
  // a large portion of the "high" state space on release counter.
  constexpr uint64_t kCheckBits =
      (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
      << ClockHandle::kReleaseCounterShift;

  if (UNLIKELY(old_meta & kCheckBits)) {
    meta.fetch_and(~kClearBits, std::memory_order_relaxed);
  }
}

inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                            uint64_t initial_countdown, bool* already_matches) {
  assert(*already_matches == false);
  // Optimistically transition the slot from "empty" to
  // "under construction" (no effect on other states)
  uint64_t old_meta = h.meta.fetch_or(
      uint64_t{ClockHandle::kStateOccupiedBit} << ClockHandle::kStateShift,
      std::memory_order_acq_rel);
  uint64_t old_state = old_meta >> ClockHandle::kStateShift;

  if (old_state == ClockHandle::kStateEmpty) {
    // We've started inserting into an available slot, and taken
    // ownership.
    return true;
  } else if (old_state != ClockHandle::kStateVisible) {
    // Slot not usable / touchable now
    return false;
  }
  // Existing, visible entry, which might be a match.
  // But first, we need to acquire a ref to read it. In fact, number of
  // refs for initial countdown, so that we boost the clock state if
  // this is a match.
  old_meta =
      h.meta.fetch_add(ClockHandle::kAcquireIncrement * initial_countdown,
                       std::memory_order_acq_rel);
  // Like Lookup
  if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) {
    // Acquired a read reference
    if (h.hashed_key == proto.hashed_key) {
      // Match. Release in a way that boosts the clock state
      old_meta =
          h.meta.fetch_add(ClockHandle::kReleaseIncrement * initial_countdown,
                           std::memory_order_acq_rel);
      // Correct for possible (but rare) overflow
      CorrectNearOverflow(old_meta, h.meta);
      // Insert detached instead (only if return handle needed)
      *already_matches = true;
      return false;
    } else {
      // Mismatch.
      Unref(h, initial_countdown);
    }
  } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                      ClockHandle::kStateInvisible)) {
    // Pretend we never took the reference
    Unref(h, initial_countdown);
  } else {
    // For other states, incrementing the acquire counter has no effect
    // so we don't need to undo it.
    // Slot not usable / touchable now.
  }
  return false;
}

inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                             uint64_t initial_countdown, bool keep_ref) {
  // Save data fields
  ClockHandleBasicData* h_alias = &h;
  *h_alias = proto;

  // Transition from "under construction" state to "visible" state
  uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
                      << ClockHandle::kStateShift;

  // Maybe with an outstanding reference
  new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
  new_meta |= (initial_countdown - keep_ref)
              << ClockHandle::kReleaseCounterShift;

#ifndef NDEBUG
  // Save the state transition, with assertion
  uint64_t old_meta = h.meta.exchange(new_meta, std::memory_order_release);
  assert(old_meta >> ClockHandle::kStateShift ==
         ClockHandle::kStateConstruction);
#else
  // Save the state transition
  h.meta.store(new_meta, std::memory_order_release);
#endif
}

bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h,
               uint64_t initial_countdown, bool keep_ref,
               bool* already_matches) {
  bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches);
  if (b) {
    FinishSlotInsert(proto, h, initial_countdown, keep_ref);
  }
  return b;
}

// Func must be const HandleImpl& -> void callable
template <class HandleImpl, class Func>
void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin,
                              const HandleImpl* end,
                              bool apply_if_will_be_deleted) {
  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
  if (!apply_if_will_be_deleted) {
    check_state_mask |= ClockHandle::kStateVisibleBit;
  }

  for (const HandleImpl* h = begin; h < end; ++h) {
    // Note: to avoid using compare_exchange, we have to be extra careful.
    uint64_t old_meta = h->meta.load(std::memory_order_relaxed);
    // Check if it's an entry visible to lookups
    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
      // Increment acquire counter. Note: it's possible that the entry has
      // completely changed since we loaded old_meta, but incrementing acquire
      // count is always safe. (Similar to optimistic Lookup here.)
      old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                   std::memory_order_acquire);
      // Check whether we actually acquired a reference.
      if ((old_meta >> ClockHandle::kStateShift) &
          ClockHandle::kStateShareableBit) {
        // Apply func if appropriate
        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
          func(*h);
        }
        // Pretend we never took the reference
        Unref(*h);
        // No net change, so don't need to check for overflow
      } else {
        // For other states, incrementing the acquire counter has no effect
        // so we don't need to undo it. Furthermore, we cannot safely undo
        // it because we did not acquire a read reference to lock the
        // entry in a Shareable state.
      }
    }
  }
}

}  // namespace

void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const {
  if (helper->del_cb) {
    helper->del_cb(value, allocator);
  }
}

template <class HandleImpl>
HandleImpl* BaseClockTable::StandaloneInsert(
    const ClockHandleBasicData& proto) {
  // Heap allocated separate from table
  HandleImpl* h = new HandleImpl();
  ClockHandleBasicData* h_alias = h;
  *h_alias = proto;
  h->SetStandalone();
  // Single reference (standalone entries only created if returning a refed
  // Handle back to user)
  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
                  << ClockHandle::kStateShift;
  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
  h->meta.store(meta, std::memory_order_release);
  // Keep track of how much of usage is standalone
  standalone_usage_.fetch_add(proto.GetTotalCharge(),
                              std::memory_order_relaxed);
  return h;
}

template <class Table>
typename Table::HandleImpl* BaseClockTable::CreateStandalone(
    ClockHandleBasicData& proto, size_t capacity, bool strict_capacity_limit,
    bool allow_uncharged) {
  Table& derived = static_cast<Table&>(*this);
  typename Table::InsertState state;
  derived.StartInsert(state);

  const size_t total_charge = proto.GetTotalCharge();
  if (strict_capacity_limit) {
    Status s = ChargeUsageMaybeEvictStrict<Table>(
        total_charge, capacity,
        /*need_evict_for_occupancy=*/false, state);
    if (!s.ok()) {
      if (allow_uncharged) {
        proto.total_charge = 0;
      } else {
        return nullptr;
      }
    }
  } else {
    // Case strict_capacity_limit == false
    bool success = ChargeUsageMaybeEvictNonStrict<Table>(
        total_charge, capacity,
        /*need_evict_for_occupancy=*/false, state);
    if (!success) {
      // Force the issue
      usage_.fetch_add(total_charge, std::memory_order_relaxed);
    }
  }

  return StandaloneInsert<typename Table::HandleImpl>(proto);
}

template <class Table>
Status BaseClockTable::ChargeUsageMaybeEvictStrict(
    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
    typename Table::InsertState& state) {
  if (total_charge > capacity) {
    return Status::MemoryLimit(
        "Cache entry too large for a single cache shard: " +
        std::to_string(total_charge) + " > " + std::to_string(capacity));
  }
  // Grab any available capacity, and free up any more required.
  size_t old_usage = usage_.load(std::memory_order_relaxed);
  size_t new_usage;
  do {
    new_usage = std::min(capacity, old_usage + total_charge);
    if (new_usage == old_usage) {
      // No change needed
      break;
    }
  } while (!usage_.compare_exchange_weak(old_usage, new_usage,
                                         std::memory_order_relaxed));
  // How much do we need to evict then?
  size_t need_evict_charge = old_usage + total_charge - new_usage;
  size_t request_evict_charge = need_evict_charge;
  if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
    // Require at least 1 eviction.
    request_evict_charge = 1;
  }
  if (request_evict_charge > 0) {
    EvictionData data;
    static_cast<Table*>(this)->Evict(request_evict_charge, state, &data);
    occupancy_.fetch_sub(data.freed_count, std::memory_order_release);
    if (LIKELY(data.freed_charge > need_evict_charge)) {
      assert(data.freed_count > 0);
      // Evicted more than enough
      usage_.fetch_sub(data.freed_charge - need_evict_charge,
                       std::memory_order_relaxed);
    } else if (data.freed_charge < need_evict_charge ||
               (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0)) {
      // Roll back to old usage minus evicted
      usage_.fetch_sub(data.freed_charge + (new_usage - old_usage),
                       std::memory_order_relaxed);
      if (data.freed_charge < need_evict_charge) {
        return Status::MemoryLimit(
            "Insert failed because unable to evict entries to stay within "
            "capacity limit.");
      } else {
        return Status::MemoryLimit(
            "Insert failed because unable to evict entries to stay within "
            "table occupancy limit.");
      }
    }
    // If we needed to evict something and we are proceeding, we must have
    // evicted something.
    assert(data.freed_count > 0);
  }
  return Status::OK();
}

template <class Table>
inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
    typename Table::InsertState& state) {
  // For simplicity, we consider that either the cache can accept the insert
  // with no evictions, or we must evict enough to make (at least) enough
  // space. It could lead to unnecessary failures or excessive evictions in
  // some extreme cases, but allows a fast, simple protocol. If we allow a
  // race to get us over capacity, then we might never get back to capacity
  // limit if the sizes of entries allow each insertion to evict the minimum
  // charge. Thus, we should evict some extra if it's not a signifcant
  // portion of the shard capacity. This can have the side benefit of
  // involving fewer threads in eviction.
  size_t old_usage = usage_.load(std::memory_order_relaxed);
  size_t need_evict_charge;
  // NOTE: if total_charge > old_usage, there isn't yet enough to evict
  // `total_charge` amount. Even if we only try to evict `old_usage` amount,
  // there's likely something referenced and we would eat CPU looking for
  // enough to evict.
  if (old_usage + total_charge <= capacity || total_charge > old_usage) {
    // Good enough for me (might run over with a race)
    need_evict_charge = 0;
  } else {
    // Try to evict enough space, and maybe some extra
    need_evict_charge = total_charge;
    if (old_usage > capacity) {
      // Not too much to avoid thundering herd while avoiding strict
      // synchronization, such as the compare_exchange used with strict
      // capacity limit.
      need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
    }
  }
  if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
    // Special case: require at least 1 eviction if we only have to
    // deal with occupancy
    need_evict_charge = 1;
  }
  EvictionData data;
  if (need_evict_charge > 0) {
    static_cast<Table*>(this)->Evict(need_evict_charge, state, &data);
    // Deal with potential occupancy deficit
    if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) {
      assert(data.freed_charge == 0);
      // Can't meet occupancy requirement
      return false;
    } else {
      // Update occupancy for evictions
      occupancy_.fetch_sub(data.freed_count, std::memory_order_release);
    }
  }
  // Track new usage even if we weren't able to evict enough
  usage_.fetch_add(total_charge - data.freed_charge, std::memory_order_relaxed);
  // No underflow
  assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
  // Success
  return true;
}

void BaseClockTable::TrackAndReleaseEvictedEntry(
    ClockHandle* h, BaseClockTable::EvictionData* data) {
  data->freed_charge += h->GetTotalCharge();
  data->freed_count += 1;

  bool took_value_ownership = false;
  if (eviction_callback_) {
    // For key reconstructed from hash
    UniqueId64x2 unhashed;
    took_value_ownership = eviction_callback_(
        ClockCacheShard<FixedHyperClockTable>::ReverseHash(
            h->GetHash(), &unhashed, hash_seed_),
        reinterpret_cast<Cache::Handle*>(h),
        h->meta.load(std::memory_order_relaxed) & ClockHandle::kHitBitMask);
  }
  if (!took_value_ownership) {
    h->FreeData(allocator_);
  }
  MarkEmpty(*h);
}

template <class Table>
Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
                              typename Table::HandleImpl** handle,
                              Cache::Priority priority, size_t capacity,
                              bool strict_capacity_limit) {
  using HandleImpl = typename Table::HandleImpl;
  Table& derived = static_cast<Table&>(*this);

  typename Table::InsertState state;
  derived.StartInsert(state);

  // Do we have the available occupancy? Optimistically assume we do
  // and deal with it if we don't.
  size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
  // Whether we over-committed and need an eviction to make up for it
  bool need_evict_for_occupancy =
      !derived.GrowIfNeeded(old_occupancy + 1, state);

  // Usage/capacity handling is somewhat different depending on
  // strict_capacity_limit, but mostly pessimistic.
  bool use_standalone_insert = false;
  const size_t total_charge = proto.GetTotalCharge();
  if (strict_capacity_limit) {
    Status s = ChargeUsageMaybeEvictStrict<Table>(
        total_charge, capacity, need_evict_for_occupancy, state);
    if (!s.ok()) {
      // Revert occupancy
      occupancy_.fetch_sub(1, std::memory_order_relaxed);
      return s;
    }
  } else {
    // Case strict_capacity_limit == false
    bool success = ChargeUsageMaybeEvictNonStrict<Table>(
        total_charge, capacity, need_evict_for_occupancy, state);
    if (!success) {
      // Revert occupancy
      occupancy_.fetch_sub(1, std::memory_order_relaxed);
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry
        // inserted into cache and evicted immediately.
        proto.FreeData(allocator_);
        return Status::OK();
      } else {
        // Need to track usage of fallback standalone insert
        usage_.fetch_add(total_charge, std::memory_order_relaxed);
        use_standalone_insert = true;
      }
    }
  }

  if (!use_standalone_insert) {
    // Attempt a table insert, but abort if we find an existing entry for the
    // key. If we were to overwrite old entries, we would either
    // * Have to gain ownership over an existing entry to overwrite it, which
    // would only work if there are no outstanding (read) references and would
    // create a small gap in availability of the entry (old or new) to lookups.
    // * Have to insert into a suboptimal location (more probes) so that the
    // old entry can be kept around as well.

    uint64_t initial_countdown = GetInitialCountdown(priority);
    assert(initial_countdown > 0);

    HandleImpl* e =
        derived.DoInsert(proto, initial_countdown, handle != nullptr, state);

    if (e) {
      // Successfully inserted
      if (handle) {
        *handle = e;
      }
      return Status::OK();
    }
    // Not inserted
    // Revert occupancy
    occupancy_.fetch_sub(1, std::memory_order_relaxed);
    // Maybe fall back on standalone insert
    if (handle == nullptr) {
      // Revert usage
      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
      // No underflow
      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
      // As if unrefed entry immdiately evicted
      proto.FreeData(allocator_);
      return Status::OK();
    }

    use_standalone_insert = true;
  }

  // Run standalone insert
  assert(use_standalone_insert);

  *handle = StandaloneInsert<HandleImpl>(proto);

  // The OkOverwritten status is used to count "redundant" insertions into
  // block cache. This implementation doesn't strictly check for redundant
  // insertions, but we instead are probably interested in how many insertions
  // didn't go into the table (instead "standalone"), which could be redundant
  // Insert or some other reason (use_standalone_insert reasons above).
  return Status::OkOverwritten();
}

void BaseClockTable::Ref(ClockHandle& h) {
  // Increment acquire counter
  uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                                       std::memory_order_acquire);

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  // Must have already had a reference
  assert(GetRefcount(old_meta) > 0);
  (void)old_meta;
}

#ifndef NDEBUG
void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) {
  // Increment acquire counter
  uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
                                       std::memory_order_acquire);

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  (void)old_meta;
}

void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) {
  assert(n > 0);

  // Like n-1 Releases, but assumes one more will happen in the caller to take
  // care of anything like erasing an unreferenced, invisible entry.
  uint64_t old_meta = h->meta.fetch_add(
      (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  (void)old_meta;
}
#endif

FixedHyperClockTable::FixedHyperClockTable(
    size_t capacity, bool /*strict_capacity_limit*/,
    CacheMetadataChargePolicy metadata_charge_policy,
    MemoryAllocator* allocator,
    const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
    const Opts& opts)
    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
                     hash_seed),
      length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                metadata_charge_policy)),
      length_bits_mask_((size_t{1} << length_bits_) - 1),
      occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
                                           kStrictLoadFactor)),
      array_(new HandleImpl[size_t{1} << length_bits_]) {
  if (metadata_charge_policy ==
      CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
    usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
  }

  static_assert(sizeof(HandleImpl) == 64U,
                "Expecting size / alignment with common cache line size");
}

FixedHyperClockTable::~FixedHyperClockTable() {
  // Assumes there are no references or active operations on any slot/element
  // in the table.
  for (size_t i = 0; i < GetTableSize(); i++) {
    HandleImpl& h = array_[i];
    switch (h.meta >> ClockHandle::kStateShift) {
      case ClockHandle::kStateEmpty:
        // noop
        break;
      case ClockHandle::kStateInvisible:  // rare but possible
      case ClockHandle::kStateVisible:
        assert(GetRefcount(h.meta) == 0);
        h.FreeData(allocator_);
#ifndef NDEBUG
        Rollback(h.hashed_key, &h);
        ReclaimEntryUsage(h.GetTotalCharge());
#endif
        break;
      // otherwise
      default:
        assert(false);
        break;
    }
  }

#ifndef NDEBUG
  for (size_t i = 0; i < GetTableSize(); i++) {
    assert(array_[i].displacements.load() == 0);
  }
#endif

  assert(usage_.load() == 0 ||
         usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
  assert(occupancy_ == 0);
}

void FixedHyperClockTable::StartInsert(InsertState&) {}

bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
  return new_occupancy <= occupancy_limit_;
}

FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert(
    const ClockHandleBasicData& proto, uint64_t initial_countdown,
    bool keep_ref, InsertState&) {
  bool already_matches = false;
  HandleImpl* e = FindSlot(
      proto.hashed_key,
      [&](HandleImpl* h) {
        return TryInsert(proto, *h, initial_countdown, keep_ref,
                         &already_matches);
      },
      [&](HandleImpl* h) {
        if (already_matches) {
          // Stop searching & roll back displacements
          Rollback(proto.hashed_key, h);
          return true;
        } else {
          // Keep going
          return false;
        }
      },
      [&](HandleImpl* h, bool is_last) {
        if (is_last) {
          // Search is ending. Roll back displacements
          Rollback(proto.hashed_key, h);
        } else {
          h->displacements.fetch_add(1, std::memory_order_relaxed);
        }
      });
  if (already_matches) {
    // Insertion skipped
    return nullptr;
  }
  if (e != nullptr) {
    // Successfully inserted
    return e;
  }
  // Else, no available slot found. Occupancy check should generally prevent
  // this, except it's theoretically possible for other threads to evict and
  // replace entries in the right order to hit every slot when it is populated.
  // Assuming random hashing, the chance of that should be no higher than
  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
  // roughly n >= 256, so if this assertion fails, that suggests something is
  // going wrong.
  assert(GetTableSize() < 256);
  return nullptr;
}

FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup(
    const UniqueId64x2& hashed_key) {
  HandleImpl* e = FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
        // Mostly branch-free version (similar performance)
        /*
        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                     std::memory_order_acquire);
        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
        bool match = (h->key == key) & visible;
        h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
        ClockHandle::kAcquireCounterShift, std::memory_order_release); return
        match;
        */
        // Optimistic lookup should pay off when the table is relatively
        // sparse.
        constexpr bool kOptimisticLookup = true;
        uint64_t old_meta;
        if (!kOptimisticLookup) {
          old_meta = h->meta.load(std::memory_order_acquire);
          if ((old_meta >> ClockHandle::kStateShift) !=
              ClockHandle::kStateVisible) {
            return false;
          }
        }
        // (Optimistically) increment acquire counter
        old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                     std::memory_order_acquire);
        // Check if it's an entry visible to lookups
        if ((old_meta >> ClockHandle::kStateShift) ==
            ClockHandle::kStateVisible) {
          // Acquired a read reference
          if (h->hashed_key == hashed_key) {
            // Match
            // Update the hit bit
            if (eviction_callback_) {
              h->meta.fetch_or(uint64_t{1} << ClockHandle::kHitBitShift,
                               std::memory_order_relaxed);
            }
            return true;
          } else {
            // Mismatch. Pretend we never took the reference
            Unref(*h);
          }
        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                            ClockHandle::kStateInvisible)) {
          // Pretend we never took the reference
          Unref(*h);
        } else {
          // For other states, incrementing the acquire counter has no effect
          // so we don't need to undo it. Furthermore, we cannot safely undo
          // it because we did not acquire a read reference to lock the
          // entry in a Shareable state.
        }
        return false;
      },
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
      [&](HandleImpl* /*h*/, bool /*is_last*/) {});

  return e;
}

bool FixedHyperClockTable::Release(HandleImpl* h, bool useful,
                                   bool erase_if_last_ref) {
  // In contrast with LRUCache's Release, this function won't delete the handle
  // when the cache is above capacity and the reference is the last one. Space
  // is only freed up by EvictFromClock (called by Insert when space is needed)
  // and Erase. We do this to avoid an extra atomic read of the variable usage_.

  uint64_t old_meta;
  if (useful) {
    // Increment release counter to indicate was used
    old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
                                 std::memory_order_release);
  } else {
    // Decrement acquire counter to pretend it never happened
    old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                 std::memory_order_release);
  }

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  // No underflow
  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
          ClockHandle::kCounterMask) !=
         ((old_meta >> ClockHandle::kReleaseCounterShift) &
          ClockHandle::kCounterMask));

  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
                                    ClockHandle::kStateInvisible)) {
    // FIXME: There's a chance here that another thread could replace this
    // entry and we end up erasing the wrong one.

    // Update for last fetch_add op
    if (useful) {
      old_meta += ClockHandle::kReleaseIncrement;
    } else {
      old_meta -= ClockHandle::kAcquireIncrement;
    }
    // Take ownership if no refs
    do {
      if (GetRefcount(old_meta) != 0) {
        // Not last ref at some point in time during this Release call
        // Correct for possible (but rare) overflow
        CorrectNearOverflow(old_meta, h->meta);
        return false;
      }
      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                       << ClockHandle::kStateShift)) == 0) {
        // Someone else took ownership
        return false;
      }
      // Note that there's a small chance that we release, another thread
      // replaces this entry with another, reaches zero refs, and then we end
      // up erasing that other entry. That's an acceptable risk / imprecision.
    } while (!h->meta.compare_exchange_weak(
        old_meta,
        uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
        std::memory_order_acquire));
    // Took ownership
    size_t total_charge = h->GetTotalCharge();
    if (UNLIKELY(h->IsStandalone())) {
      h->FreeData(allocator_);
      // Delete standalone handle
      delete h;
      standalone_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    } else {
      Rollback(h->hashed_key, h);
      FreeDataMarkEmpty(*h, allocator_);
      ReclaimEntryUsage(total_charge);
    }
    return true;
  } else {
    // Correct for possible (but rare) overflow
    CorrectNearOverflow(old_meta, h->meta);
    return false;
  }
}

#ifndef NDEBUG
void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
  if (n > 0) {
    // Do n-1 simple releases first
    TEST_ReleaseNMinus1(h, n);

    // Then the last release might be more involved
    Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
  }
}
#endif

void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
  (void)FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
        // Could be multiple entries in rare cases. Erase them all.
        // Optimistically increment acquire counter
        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                              std::memory_order_acquire);
        // Check if it's an entry visible to lookups
        if ((old_meta >> ClockHandle::kStateShift) ==
            ClockHandle::kStateVisible) {
          // Acquired a read reference
          if (h->hashed_key == hashed_key) {
            // Match. Set invisible.
            old_meta =
                h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
                                    << ClockHandle::kStateShift),
                                  std::memory_order_acq_rel);
            // Apply update to local copy
            old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
                          << ClockHandle::kStateShift);
            for (;;) {
              uint64_t refcount = GetRefcount(old_meta);
              assert(refcount > 0);
              if (refcount > 1) {
                // Not last ref at some point in time during this Erase call
                // Pretend we never took the reference
                Unref(*h);
                break;
              } else if (h->meta.compare_exchange_weak(
                             old_meta,
                             uint64_t{ClockHandle::kStateConstruction}
                                 << ClockHandle::kStateShift,
                             std::memory_order_acq_rel)) {
                // Took ownership
                assert(hashed_key == h->hashed_key);
                size_t total_charge = h->GetTotalCharge();
                FreeDataMarkEmpty(*h, allocator_);
                ReclaimEntryUsage(total_charge);
                // We already have a copy of hashed_key in this case, so OK to
                // delay Rollback until after releasing the entry
                Rollback(hashed_key, h);
                break;
              }
            }
          } else {
            // Mismatch. Pretend we never took the reference
            Unref(*h);
          }
        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                            ClockHandle::kStateInvisible)) {
          // Pretend we never took the reference
          Unref(*h);
        } else {
          // For other states, incrementing the acquire counter has no effect
          // so we don't need to undo it.
        }
        return false;
      },
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
}

void FixedHyperClockTable::EraseUnRefEntries() {
  for (size_t i = 0; i <= this->length_bits_mask_; i++) {
    HandleImpl& h = array_[i];

    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                    << ClockHandle::kStateShift) &&
        GetRefcount(old_meta) == 0 &&
        h.meta.compare_exchange_strong(old_meta,
                                       uint64_t{ClockHandle::kStateConstruction}
                                           << ClockHandle::kStateShift,
                                       std::memory_order_acquire)) {
      // Took ownership
      size_t total_charge = h.GetTotalCharge();
      Rollback(h.hashed_key, &h);
      FreeDataMarkEmpty(h, allocator_);
      ReclaimEntryUsage(total_charge);
    }
  }
}

template <typename MatchFn, typename AbortFn, typename UpdateFn>
inline FixedHyperClockTable::HandleImpl* FixedHyperClockTable::FindSlot(
    const UniqueId64x2& hashed_key, const MatchFn& match_fn,
    const AbortFn& abort_fn, const UpdateFn& update_fn) {
  // NOTE: upper 32 bits of hashed_key[0] is used for sharding
  //
  // We use double-hashing probing. Every probe in the sequence is a
  // pseudorandom integer, computed as a linear function of two random hashes,
  // which we call base and increment. Specifically, the i-th probe is base + i
  // * increment modulo the table size.
  size_t base = static_cast<size_t>(hashed_key[1]);
  // We use an odd increment, which is relatively prime with the power-of-two
  // table size. This implies that we cycle back to the first probe only
  // after probing every slot exactly once.
  // TODO: we could also reconsider linear probing, though locality benefits
  // are limited because each slot is a full cache line
  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
  size_t first = ModTableSize(base);
  size_t current = first;
  bool is_last;
  do {
    HandleImpl* h = &array_[current];
    if (match_fn(h)) {
      return h;
    }
    if (abort_fn(h)) {
      return nullptr;
    }
    current = ModTableSize(current + increment);
    is_last = current == first;
    update_fn(h, is_last);
  } while (!is_last);
  // We looped back.
  return nullptr;
}

inline void FixedHyperClockTable::Rollback(const UniqueId64x2& hashed_key,
                                           const HandleImpl* h) {
  size_t current = ModTableSize(hashed_key[1]);
  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
  while (&array_[current] != h) {
    array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
    current = ModTableSize(current + increment);
  }
}

inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) {
  auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
  (void)old_occupancy;
  // No underflow
  assert(old_occupancy > 0);
  auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
  (void)old_usage;
  // No underflow
  assert(old_usage >= total_charge);
}

inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
                                        EvictionData* data) {
  // precondition
  assert(requested_charge > 0);

  // TODO: make a tuning parameter?
  constexpr size_t step_size = 4;

  // First (concurrent) increment clock pointer
  uint64_t old_clock_pointer =
      clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);

  // Cap the eviction effort at this thread (along with those operating in
  // parallel) circling through the whole structure kMaxCountdown times.
  // In other words, this eviction run must find something/anything that is
  // unreferenced at start of and during the eviction run that isn't reclaimed
  // by a concurrent eviction run.
  uint64_t max_clock_pointer =
      old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);

  for (;;) {
    for (size_t i = 0; i < step_size; i++) {
      HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
      bool evicting = ClockUpdate(h);
      if (evicting) {
        Rollback(h.hashed_key, &h);
        TrackAndReleaseEvictedEntry(&h, data);
      }
    }

    // Loop exit condition
    if (data->freed_charge >= requested_charge) {
      return;
    }
    if (old_clock_pointer >= max_clock_pointer) {
      return;
    }

    // Advance clock pointer (concurrently)
    old_clock_pointer =
        clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
  }
}

template <class Table>
ClockCacheShard<Table>::ClockCacheShard(
    size_t capacity, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy,
    MemoryAllocator* allocator,
    const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
    const typename Table::Opts& opts)
    : CacheShardBase(metadata_charge_policy),
      table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator,
             eviction_callback, hash_seed, opts),
      capacity_(capacity),
      strict_capacity_limit_(strict_capacity_limit) {
  // Initial charge metadata should not exceed capacity
  assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
}

template <class Table>
void ClockCacheShard<Table>::EraseUnRefEntries() {
  table_.EraseUnRefEntries();
}

template <class Table>
void ClockCacheShard<Table>::ApplyToSomeEntries(
    const std::function<void(const Slice& key, Cache::ObjectPtr value,
                             size_t charge,
                             const Cache::CacheItemHelper* helper)>& callback,
    size_t average_entries_per_lock, size_t* state) {
  // The state will be a simple index into the table. Even with a dynamic
  // hyper clock cache, entries will generally stay in their existing
  // slots, so we don't need to be aware of the high-level organization
  // that makes lookup efficient.
  size_t length = table_.GetTableSize();

  assert(average_entries_per_lock > 0);

  size_t index_begin = *state;
  size_t index_end = index_begin + average_entries_per_lock;
  if (index_end >= length) {
    // Going to end.
    index_end = length;
    *state = SIZE_MAX;
  } else {
    *state = index_end;
  }

  auto hash_seed = table_.GetHashSeed();
  ConstApplyToEntriesRange(
      [callback, hash_seed](const HandleImpl& h) {
        UniqueId64x2 unhashed;
        callback(ReverseHash(h.hashed_key, &unhashed, hash_seed), h.value,
                 h.GetTotalCharge(), h.helper);
      },
      table_.HandlePtr(index_begin), table_.HandlePtr(index_end), false);
}

int FixedHyperClockTable::CalcHashBits(
    size_t capacity, size_t estimated_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
  double average_slot_charge = estimated_value_size * kLoadFactor;
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    average_slot_charge += sizeof(HandleImpl);
  }
  assert(average_slot_charge > 0.0);
  uint64_t num_slots =
      static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);

  int hash_bits = FloorLog2((num_slots << 1) - 1);
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    // For very small estimated value sizes, it's possible to overshoot
    while (hash_bits > 0 &&
           uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
      hash_bits--;
    }
  }
  return hash_bits;
}

template <class Table>
void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
  capacity_.store(capacity, std::memory_order_relaxed);
  // next Insert will take care of any necessary evictions
}

template <class Table>
void ClockCacheShard<Table>::SetStrictCapacityLimit(
    bool strict_capacity_limit) {
  strict_capacity_limit_.store(strict_capacity_limit,
                               std::memory_order_relaxed);
  // next Insert will take care of any necessary evictions
}

template <class Table>
Status ClockCacheShard<Table>::Insert(const Slice& key,
                                      const UniqueId64x2& hashed_key,
                                      Cache::ObjectPtr value,
                                      const Cache::CacheItemHelper* helper,
                                      size_t charge, HandleImpl** handle,
                                      Cache::Priority priority) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return Status::NotSupported("ClockCache only supports key size " +
                                std::to_string(kCacheKeySize) + "B");
  }
  ClockHandleBasicData proto;
  proto.hashed_key = hashed_key;
  proto.value = value;
  proto.helper = helper;
  proto.total_charge = charge;
  return table_.template Insert<Table>(
      proto, handle, priority, capacity_.load(std::memory_order_relaxed),
      strict_capacity_limit_.load(std::memory_order_relaxed));
}

template <class Table>
typename Table::HandleImpl* ClockCacheShard<Table>::CreateStandalone(
    const Slice& key, const UniqueId64x2& hashed_key, Cache::ObjectPtr obj,
    const Cache::CacheItemHelper* helper, size_t charge, bool allow_uncharged) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return nullptr;
  }
  ClockHandleBasicData proto;
  proto.hashed_key = hashed_key;
  proto.value = obj;
  proto.helper = helper;
  proto.total_charge = charge;
  return table_.template CreateStandalone<Table>(
      proto, capacity_.load(std::memory_order_relaxed),
      strict_capacity_limit_.load(std::memory_order_relaxed), allow_uncharged);
}

template <class Table>
typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
    const Slice& key, const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return nullptr;
  }
  return table_.Lookup(hashed_key);
}

template <class Table>
bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
  if (h == nullptr) {
    return false;
  }
  table_.Ref(*h);
  return true;
}

template <class Table>
bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
                                     bool erase_if_last_ref) {
  if (handle == nullptr) {
    return false;
  }
  return table_.Release(handle, useful, erase_if_last_ref);
}

#ifndef NDEBUG
template <class Table>
void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
  table_.TEST_RefN(*h, n);
}

template <class Table>
void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
  table_.TEST_ReleaseN(h, n);
}
#endif

template <class Table>
bool ClockCacheShard<Table>::Release(HandleImpl* handle,
                                     bool erase_if_last_ref) {
  return Release(handle, /*useful=*/true, erase_if_last_ref);
}

template <class Table>
void ClockCacheShard<Table>::Erase(const Slice& key,
                                   const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return;
  }
  table_.Erase(hashed_key);
}

template <class Table>
size_t ClockCacheShard<Table>::GetUsage() const {
  return table_.GetUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetStandaloneUsage() const {
  return table_.GetStandaloneUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetCapacity() const {
  return capacity_;
}

template <class Table>
size_t ClockCacheShard<Table>::GetPinnedUsage() const {
  // Computes the pinned usage by scanning the whole hash table. This
  // is slow, but avoids keeping an exact counter on the clock usage,
  // i.e., the number of not externally referenced elements.
  // Why avoid this counter? Because Lookup removes elements from the clock
  // list, so it would need to update the pinned usage every time,
  // which creates additional synchronization costs.
  size_t table_pinned_usage = 0;
  const bool charge_metadata =
      metadata_charge_policy_ == kFullChargeCacheMetadata;
  ConstApplyToEntriesRange(
      [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
        uint64_t meta = h.meta.load(std::memory_order_relaxed);
        uint64_t refcount = GetRefcount(meta);
        // Holding one ref for ConstApplyToEntriesRange
        assert(refcount > 0);
        if (refcount > 1) {
          table_pinned_usage += h.GetTotalCharge();
          if (charge_metadata) {
            table_pinned_usage += sizeof(HandleImpl);
          }
        }
      },
      table_.HandlePtr(0), table_.HandlePtr(table_.GetTableSize()), true);

  return table_pinned_usage + table_.GetStandaloneUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetOccupancyCount() const {
  return table_.GetOccupancy();
}

template <class Table>
size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
  return table_.GetOccupancyLimit();
}

template <class Table>
size_t ClockCacheShard<Table>::GetTableAddressCount() const {
  return table_.GetTableSize();
}

// Explicit instantiation
template class ClockCacheShard<FixedHyperClockTable>;
template class ClockCacheShard<AutoHyperClockTable>;

template <class Table>
BaseHyperClockCache<Table>::BaseHyperClockCache(
    const HyperClockCacheOptions& opts)
    : ShardedCache<ClockCacheShard<Table>>(opts) {
  // TODO: should not need to go through two levels of pointer indirection to
  // get to table entries
  size_t per_shard = this->GetPerShardCapacity();
  MemoryAllocator* alloc = this->memory_allocator();
  this->InitShards([&](Shard* cs) {
    typename Table::Opts table_opts{opts};
    new (cs) Shard(per_shard, opts.strict_capacity_limit,
                   opts.metadata_charge_policy, alloc,
                   &this->eviction_callback_, &this->hash_seed_, table_opts);
  });
}

template <class Table>
Cache::ObjectPtr BaseHyperClockCache<Table>::Value(Handle* handle) {
  return reinterpret_cast<const typename Table::HandleImpl*>(handle)->value;
}

template <class Table>
size_t BaseHyperClockCache<Table>::GetCharge(Handle* handle) const {
  return reinterpret_cast<const typename Table::HandleImpl*>(handle)
      ->GetTotalCharge();
}

template <class Table>
const Cache::CacheItemHelper* BaseHyperClockCache<Table>::GetCacheItemHelper(
    Handle* handle) const {
  auto h = reinterpret_cast<const typename Table::HandleImpl*>(handle);
  return h->helper;
}

namespace {

// For each cache shard, estimate what the table load factor would be if
// cache filled to capacity with average entries. This is considered
// indicative of a potential problem if the shard is essentially operating
// "at limit", which we define as high actual usage (>80% of capacity)
// or actual occupancy very close to limit (>95% of limit).
// Also, for each shard compute the recommended estimated_entry_charge,
// and keep the minimum one for use as overall recommendation.
void AddShardEvaluation(const FixedHyperClockCache::Shard& shard,
                        std::vector<double>& predicted_load_factors,
                        size_t& min_recommendation) {
  size_t usage = shard.GetUsage() - shard.GetStandaloneUsage();
  size_t capacity = shard.GetCapacity();
  double usage_ratio = 1.0 * usage / capacity;

  size_t occupancy = shard.GetOccupancyCount();
  size_t occ_limit = shard.GetOccupancyLimit();
  double occ_ratio = 1.0 * occupancy / occ_limit;
  if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
    // Skip as described above
    return;
  }

  // If filled to capacity, what would the occupancy ratio be?
  double ratio = occ_ratio / usage_ratio;
  // Given max load factor, what that load factor be?
  double lf = ratio * FixedHyperClockTable::kStrictLoadFactor;
  predicted_load_factors.push_back(lf);

  // Update min_recommendation also
  size_t recommendation = usage / occupancy;
  min_recommendation = std::min(min_recommendation, recommendation);
}

bool IsSlotOccupied(const ClockHandle& h) {
  return (h.meta.load(std::memory_order_relaxed) >> ClockHandle::kStateShift) !=
         0;
}
}  // namespace

// NOTE: GCC might warn about subobject linkage if this is in anon namespace
template <size_t N = 500>
class LoadVarianceStats {
 public:
  std::string Report() const {
    return "Overall " + PercentStr(positive_count_, samples_) + " (" +
           std::to_string(positive_count_) + "/" + std::to_string(samples_) +
           "), Min/Max/Window = " + PercentStr(min_, N) + "/" +
           PercentStr(max_, N) + "/" + std::to_string(N) +
           ", MaxRun{Pos/Neg} = " + std::to_string(max_pos_run_) + "/" +
           std::to_string(max_neg_run_);
  }

  void Add(bool positive) {
    recent_[samples_ % N] = positive;
    if (positive) {
      ++positive_count_;
      ++cur_pos_run_;
      max_pos_run_ = std::max(max_pos_run_, cur_pos_run_);
      cur_neg_run_ = 0;
    } else {
      ++cur_neg_run_;
      max_neg_run_ = std::max(max_neg_run_, cur_neg_run_);
      cur_pos_run_ = 0;
    }
    ++samples_;
    if (samples_ >= N) {
      size_t count_set = recent_.count();
      max_ = std::max(max_, count_set);
      min_ = std::min(min_, count_set);
    }
  }

 private:
  size_t max_ = 0;
  size_t min_ = N;
  size_t positive_count_ = 0;
  size_t samples_ = 0;
  size_t max_pos_run_ = 0;
  size_t cur_pos_run_ = 0;
  size_t max_neg_run_ = 0;
  size_t cur_neg_run_ = 0;
  std::bitset<N> recent_;

  static std::string PercentStr(size_t a, size_t b) {
    if (b == 0) {
      return "??%";
    } else {
      return std::to_string(uint64_t{100} * a / b) + "%";
    }
  }
};

template <class Table>
void BaseHyperClockCache<Table>::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
    LoadVarianceStats slot_stats;
    this->ForEachShard([&](const BaseHyperClockCache<Table>::Shard* shard) {
      size_t count = shard->GetTableAddressCount();
      for (size_t i = 0; i < count; ++i) {
        slot_stats.Add(IsSlotOccupied(*shard->GetTable().HandlePtr(i)));
      }
    });
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Slot occupancy stats: %s", slot_stats.Report().c_str());
  }
}

void FixedHyperClockCache::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  BaseHyperClockCache::ReportProblems(info_log);

  uint32_t shard_count = GetNumShards();
  std::vector<double> predicted_load_factors;
  size_t min_recommendation = SIZE_MAX;
  ForEachShard([&](const FixedHyperClockCache::Shard* shard) {
    AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
  });

  if (predicted_load_factors.empty()) {
    // None operating "at limit" -> nothing to report
    return;
  }
  std::sort(predicted_load_factors.begin(), predicted_load_factors.end());

  // First, if the average load factor is within spec, we aren't going to
  // complain about a few shards being out of spec.
  // NOTE: this is only the average among cache shards operating "at limit,"
  // which should be representative of what we care about. It it normal, even
  // desirable, for a cache to operate "at limit" so this should not create
  // selection bias. See AddShardEvaluation().
  // TODO: Consider detecting cases where decreasing the number of shards
  // would be good, e.g. serious imbalance among shards.
  double average_load_factor =
      std::accumulate(predicted_load_factors.begin(),
                      predicted_load_factors.end(), 0.0) /
      shard_count;

  constexpr double kLowSpecLoadFactor = FixedHyperClockTable::kLoadFactor / 2;
  constexpr double kMidSpecLoadFactor =
      FixedHyperClockTable::kLoadFactor / 1.414;
  if (average_load_factor > FixedHyperClockTable::kLoadFactor) {
    // Out of spec => Consider reporting load factor too high
    // Estimate effective overall capacity loss due to enforcing occupancy limit
    double lost_portion = 0.0;
    int over_count = 0;
    for (double lf : predicted_load_factors) {
      if (lf > FixedHyperClockTable::kStrictLoadFactor) {
        ++over_count;
        lost_portion +=
            (lf - FixedHyperClockTable::kStrictLoadFactor) / lf / shard_count;
      }
    }
    // >= 20% loss -> error
    // >= 10% loss -> consistent warning
    // >= 1% loss -> intermittent warning
    InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
    bool report = true;
    if (lost_portion > 0.2) {
      level = InfoLogLevel::ERROR_LEVEL;
    } else if (lost_portion > 0.1) {
      level = InfoLogLevel::WARN_LEVEL;
    } else if (lost_portion > 0.01) {
      int report_percent = static_cast<int>(lost_portion * 100.0);
      if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
        level = InfoLogLevel::WARN_LEVEL;
      }
    } else {
      // don't report
      report = false;
    }
    if (report) {
      ROCKS_LOG_AT_LEVEL(
          info_log, level,
          "FixedHyperClockCache@%p unable to use estimated %.1f%% capacity "
          "because of full occupancy in %d/%u cache shards "
          "(estimated_entry_charge too high). "
          "Recommend estimated_entry_charge=%zu",
          this, lost_portion * 100.0, over_count, (unsigned)shard_count,
          min_recommendation);
    }
  } else if (average_load_factor < kLowSpecLoadFactor) {
    // Out of spec => Consider reporting load factor too low
    // But cautiously because low is not as big of a problem.

    // Only report if highest occupancy shard is also below
    // spec and only if average is substantially out of spec
    if (predicted_load_factors.back() < kLowSpecLoadFactor &&
        average_load_factor < kLowSpecLoadFactor / 1.414) {
      InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
      if (average_load_factor < kLowSpecLoadFactor / 2) {
        level = InfoLogLevel::WARN_LEVEL;
      }
      ROCKS_LOG_AT_LEVEL(
          info_log, level,
          "FixedHyperClockCache@%p table has low occupancy at full capacity. "
          "Higher estimated_entry_charge (about %.1fx) would likely improve "
          "performance. Recommend estimated_entry_charge=%zu",
          this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
    }
  }
}

// =======================================================================
//                             AutoHyperClockCache
// =======================================================================

// See AutoHyperClockTable::length_info_ etc. for how the linear hashing
// metadata is encoded. Here are some example values:
//
// Used length  | min shift  | threshold  | max shift
// 2            | 1          | 0          | 1
// 3            | 1          | 1          | 2
// 4            | 2          | 0          | 2
// 5            | 2          | 1          | 3
// 6            | 2          | 2          | 3
// 7            | 2          | 3          | 3
// 8            | 3          | 0          | 3
// 9            | 3          | 1          | 4
// ...
// Note:
// * min shift = floor(log2(used length))
// * max shift = ceil(log2(used length))
// * used length == (1 << shift) + threshold
// Also, shift=0 is never used in practice, so is reserved for "unset"

namespace {

inline int LengthInfoToMinShift(uint64_t length_info) {
  int mask_shift = BitwiseAnd(length_info, int{255});
  assert(mask_shift <= 63);
  assert(mask_shift > 0);
  return mask_shift;
}

inline size_t LengthInfoToThreshold(uint64_t length_info) {
  return static_cast<size_t>(length_info >> 8);
}

inline size_t LengthInfoToUsedLength(uint64_t length_info) {
  size_t threshold = LengthInfoToThreshold(length_info);
  int shift = LengthInfoToMinShift(length_info);
  assert(threshold < (size_t{1} << shift));
  size_t used_length = (size_t{1} << shift) + threshold;
  assert(used_length >= 2);
  return used_length;
}

inline uint64_t UsedLengthToLengthInfo(size_t used_length) {
  assert(used_length >= 2);
  int shift = FloorLog2(used_length);
  uint64_t threshold = BottomNBits(used_length, shift);
  uint64_t length_info =
      (uint64_t{threshold} << 8) + static_cast<uint64_t>(shift);
  assert(LengthInfoToUsedLength(length_info) == used_length);
  assert(LengthInfoToMinShift(length_info) == shift);
  assert(LengthInfoToThreshold(length_info) == threshold);
  return length_info;
}

inline size_t GetStartingLength(size_t capacity) {
  if (capacity > port::kPageSize) {
    // Start with one memory page
    return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl);
  } else {
    // Mostly to make unit tests happy
    return 4;
  }
}

inline size_t GetHomeIndex(uint64_t hash, int shift) {
  return static_cast<size_t>(BottomNBits(hash, shift));
}

inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash,
                                 size_t* home, int* shift) {
  int min_shift = LengthInfoToMinShift(length_info);
  size_t threshold = LengthInfoToThreshold(length_info);
  bool extra_shift = GetHomeIndex(hash, min_shift) < threshold;
  *home = GetHomeIndex(hash, min_shift + extra_shift);
  *shift = min_shift + extra_shift;
  assert(*home < LengthInfoToUsedLength(length_info));
}

inline int GetShiftFromNextWithShift(uint64_t next_with_shift) {
  return BitwiseAnd(next_with_shift,
                    AutoHyperClockTable::HandleImpl::kShiftMask);
}

inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) {
  return static_cast<size_t>(next_with_shift >>
                             AutoHyperClockTable::HandleImpl::kNextShift);
}

inline uint64_t MakeNextWithShift(size_t next, int shift) {
  return (uint64_t{next} << AutoHyperClockTable::HandleImpl::kNextShift) |
         static_cast<uint64_t>(shift);
}

inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) {
  return AutoHyperClockTable::HandleImpl::kNextEndFlags |
         MakeNextWithShift(head, shift);
}

// Helper function for Lookup
inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
                        int shift = 0, size_t home = 0,
                        bool* full_match_or_unknown = nullptr) {
  // Must be at least something to match
  assert(hashed_key || shift > 0);

  uint64_t old_meta;
  // (Optimistically) increment acquire counter.
  old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                              std::memory_order_acquire);
  // Check if it's a referencable (sharable) entry
  if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                   << ClockHandle::kStateShift)) == 0) {
    // For non-sharable states, incrementing the acquire counter has no effect
    // so we don't need to undo it. Furthermore, we cannot safely undo
    // it because we did not acquire a read reference to lock the
    // entry in a Shareable state.
    if (full_match_or_unknown) {
      *full_match_or_unknown = true;
    }
    return false;
  }
  // Else acquired a read reference
  assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
  if (hashed_key && h.hashed_key == *hashed_key &&
      LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
                         << ClockHandle::kStateShift))) {
    // Match on full key, visible
    if (full_match_or_unknown) {
      *full_match_or_unknown = true;
    }
    return true;
  } else if (shift > 0 && home == BottomNBits(h.hashed_key[1], shift)) {
    // NOTE: upper 32 bits of hashed_key[0] is used for sharding
    // Match on home address, possibly invisible
    if (full_match_or_unknown) {
      *full_match_or_unknown = false;
    }
    return true;
  } else {
    // Mismatch. Pretend we never took the reference
    Unref(h);
    if (full_match_or_unknown) {
      *full_match_or_unknown = false;
    }
    return false;
  }
}

// Assumes a chain rewrite lock prevents concurrent modification of
// these chain pointers
void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr,
                          size_t& frontier, uint64_t stop_before_or_new_tail,
                          int old_shift, int new_shift) {
  assert(frontier != SIZE_MAX);
  assert(new_shift == old_shift + 1);
  (void)old_shift;
  (void)new_shift;
  using HandleImpl = AutoHyperClockTable::HandleImpl;
  for (;;) {
    uint64_t next_with_shift =
        arr[frontier].chain_next_with_shift.load(std::memory_order_acquire);
    assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);
    if (next_with_shift == stop_before_or_new_tail) {
      // Stopping at entry with pointer matching "stop before"
      assert(!HandleImpl::IsEnd(next_with_shift));
      return;
    }
    if (HandleImpl::IsEnd(next_with_shift)) {
      // Also update tail to new tail
      assert(HandleImpl::IsEnd(stop_before_or_new_tail));
      arr[frontier].chain_next_with_shift.store(stop_before_or_new_tail,
                                                std::memory_order_release);
      // Mark nothing left to upgrade
      frontier = SIZE_MAX;
      return;
    }
    // Next is another entry to process, so upgrade and advance frontier
    arr[frontier].chain_next_with_shift.fetch_add(1U,
                                                  std::memory_order_acq_rel);
    assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift);
    frontier = GetNextFromNextWithShift(next_with_shift);
  }
}

size_t CalcOccupancyLimit(size_t used_length) {
  return static_cast<size_t>(used_length * AutoHyperClockTable::kMaxLoadFactor +
                             0.999);
}

}  // namespace

// An RAII wrapper for locking a chain of entries (flag bit on the head)
// so that there is only one thread allowed to remove entries from the
// chain, or to rewrite it by splitting for Grow. Without the lock,
// all lookups and insertions at the head can proceed wait-free.
// The class also provides functions for safely manipulating the head pointer
// while holding the lock--or wanting to should it become non-empty.
//
// The flag bits on the head are such that the head cannot be locked if it
// is an empty chain, so that a "blind" fetch_or will try to lock a non-empty
// chain but have no effect on an empty chain. When a potential rewrite
// operation see an empty head pointer, there is no need to lock as the
// operation is a no-op. However, there are some cases such as CAS-update
// where locking might be required after initially not being needed, if the
// operation is forced to revisit the head pointer.
class AutoHyperClockTable::ChainRewriteLock {
 public:
  using HandleImpl = AutoHyperClockTable::HandleImpl;
  explicit ChainRewriteLock(HandleImpl* h, std::atomic<uint64_t>& yield_count,
                            bool already_locked_or_end = false)
      : head_ptr_(&h->head_next_with_shift) {
    if (already_locked_or_end) {
      new_head_ = head_ptr_->load(std::memory_order_acquire);
      // already locked or end
      assert(new_head_ & HandleImpl::kHeadLocked);
      return;
    }
    Acquire(yield_count);
  }

  ~ChainRewriteLock() {
    if (!IsEnd()) {
      // Release lock
      uint64_t old = head_ptr_->fetch_and(~HandleImpl::kHeadLocked,
                                          std::memory_order_release);
      (void)old;
      assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked);
    }
  }

  void Reset(HandleImpl* h, std::atomic<uint64_t>& yield_count) {
    this->~ChainRewriteLock();
    new (this) ChainRewriteLock(h, yield_count);
  }

  // Expected current state, assuming no parallel updates.
  uint64_t GetNewHead() const { return new_head_; }

  // Only safe if we know that the value hasn't changed from other threads
  void SimpleUpdate(uint64_t next_with_shift) {
    assert(head_ptr_->load(std::memory_order_acquire) == new_head_);
    new_head_ = next_with_shift | HandleImpl::kHeadLocked;
    head_ptr_->store(new_head_, std::memory_order_release);
  }

  bool CasUpdate(uint64_t next_with_shift, std::atomic<uint64_t>& yield_count) {
    uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked;
    uint64_t expected = GetNewHead();
    bool success = head_ptr_->compare_exchange_strong(
        expected, new_head, std::memory_order_acq_rel);
    if (success) {
      // Ensure IsEnd() is kept up-to-date, including for dtor
      new_head_ = new_head;
    } else {
      // Parallel update to head, such as Insert()
      if (IsEnd()) {
        // Didn't previously hold a lock
        if (HandleImpl::IsEnd(expected)) {
          // Still don't need to
          new_head_ = expected;
        } else {
          // Need to acquire lock before proceeding
          Acquire(yield_count);
        }
      } else {
        // Parallel update must preserve our lock
        assert((expected & HandleImpl::kNextEndFlags) ==
               HandleImpl::kHeadLocked);
        new_head_ = expected;
      }
    }
    return success;
  }

  bool IsEnd() const { return HandleImpl::IsEnd(new_head_); }

 private:
  void Acquire(std::atomic<uint64_t>& yield_count) {
    for (;;) {
      // Acquire removal lock on the chain
      uint64_t old_head = head_ptr_->fetch_or(HandleImpl::kHeadLocked,
                                              std::memory_order_acq_rel);
      if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) {
        // Either acquired the lock or lock not needed (end)
        assert((old_head & HandleImpl::kNextEndFlags) == 0 ||
               (old_head & HandleImpl::kNextEndFlags) ==
                   HandleImpl::kNextEndFlags);

        new_head_ = old_head | HandleImpl::kHeadLocked;
        break;
      }
      // NOTE: one of the few yield-wait loops, which is rare enough in practice
      // for its performance to be insignificant. (E.g. using C++20 atomic
      // wait/notify would likely be worse because of wasted notify costs.)
      yield_count.fetch_add(1, std::memory_order_relaxed);
      std::this_thread::yield();
    }
  }

  std::atomic<uint64_t>* head_ptr_;
  uint64_t new_head_;
};

AutoHyperClockTable::AutoHyperClockTable(
    size_t capacity, bool /*strict_capacity_limit*/,
    CacheMetadataChargePolicy metadata_charge_policy,
    MemoryAllocator* allocator,
    const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
    const Opts& opts)
    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
                     hash_seed),
      array_(MemMapping::AllocateLazyZeroed(
          sizeof(HandleImpl) * CalcMaxUsableLength(capacity,
                                                   opts.min_avg_value_size,
                                                   metadata_charge_policy))),
      length_info_(UsedLengthToLengthInfo(GetStartingLength(capacity))),
      occupancy_limit_(
          CalcOccupancyLimit(LengthInfoToUsedLength(length_info_.load()))),
      clock_pointer_mask_(
          BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.load()))) {
  if (metadata_charge_policy ==
      CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
    // NOTE: ignoring page boundaries for simplicity
    usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
  }

  static_assert(sizeof(HandleImpl) == 64U,
                "Expecting size / alignment with common cache line size");

  // Populate head pointers
  uint64_t length_info = length_info_.load();
  int min_shift = LengthInfoToMinShift(length_info);
  int max_shift = min_shift + 1;
  size_t major = uint64_t{1} << min_shift;
  size_t used_length = GetTableSize();

  assert(major <= used_length);
  assert(used_length <= major * 2);

  // Initialize the initial usable set of slots. This slightly odd iteration
  // order makes it easier to get the correct shift amount on each head.
  for (size_t i = 0; i < major; ++i) {
#ifndef NDEBUG
    int shift;
    size_t home;
#endif
    if (major + i < used_length) {
      array_[i].head_next_with_shift = MakeNextWithShiftEnd(i, max_shift);
      array_[major + i].head_next_with_shift =
          MakeNextWithShiftEnd(major + i, max_shift);
#ifndef NDEBUG  // Extra invariant checking
      GetHomeIndexAndShift(length_info, i, &home, &shift);
      assert(home == i);
      assert(shift == max_shift);
      GetHomeIndexAndShift(length_info, major + i, &home, &shift);
      assert(home == major + i);
      assert(shift == max_shift);
#endif
    } else {
      array_[i].head_next_with_shift = MakeNextWithShiftEnd(i, min_shift);
#ifndef NDEBUG  // Extra invariant checking
      GetHomeIndexAndShift(length_info, i, &home, &shift);
      assert(home == i);
      assert(shift == min_shift);
      GetHomeIndexAndShift(length_info, major + i, &home, &shift);
      assert(home == i);
      assert(shift == min_shift);
#endif
    }
  }
}

AutoHyperClockTable::~AutoHyperClockTable() {
  // As usual, destructor assumes there are no references or active operations
  // on any slot/element in the table.

  // It's possible that there were not enough Insert() after final concurrent
  // Grow to ensure length_info_ (published GetTableSize()) is fully up to
  // date. Probe for first unused slot to ensure we see the whole structure.
  size_t used_end = GetTableSize();
  while (used_end < array_.Count() &&
         array_[used_end].head_next_with_shift.load() !=
             HandleImpl::kUnusedMarker) {
    used_end++;
  }
#ifndef NDEBUG
  for (size_t i = used_end; i < array_.Count(); i++) {
    assert(array_[i].head_next_with_shift.load() == 0);
    assert(array_[i].chain_next_with_shift.load() == 0);
    assert(array_[i].meta.load() == 0);
  }
  std::vector<bool> was_populated(used_end);
  std::vector<bool> was_pointed_to(used_end);
#endif
  for (size_t i = 0; i < used_end; i++) {
    HandleImpl& h = array_[i];
    switch (h.meta >> ClockHandle::kStateShift) {
      case ClockHandle::kStateEmpty:
        // noop
        break;
      case ClockHandle::kStateInvisible:  // rare but possible
      case ClockHandle::kStateVisible:
        assert(GetRefcount(h.meta) == 0);
        h.FreeData(allocator_);
#ifndef NDEBUG  // Extra invariant checking
        usage_.fetch_sub(h.total_charge, std::memory_order_relaxed);
        occupancy_.fetch_sub(1U, std::memory_order_relaxed);
        was_populated[i] = true;
        if (!HandleImpl::IsEnd(h.chain_next_with_shift)) {
          assert((h.chain_next_with_shift & HandleImpl::kHeadLocked) == 0);
          size_t next = GetNextFromNextWithShift(h.chain_next_with_shift);
          assert(!was_pointed_to[next]);
          was_pointed_to[next] = true;
        }
#endif
        break;
      // otherwise
      default:
        assert(false);
        break;
    }
#ifndef NDEBUG  // Extra invariant checking
    if (!HandleImpl::IsEnd(h.head_next_with_shift)) {
      size_t next = GetNextFromNextWithShift(h.head_next_with_shift);
      assert(!was_pointed_to[next]);
      was_pointed_to[next] = true;
    }
#endif
  }
#ifndef NDEBUG  // Extra invariant checking
  // This check is not perfect, but should detect most reasonable cases
  // of abandonned or floating entries, etc.  (A floating cycle would not
  // be reported as bad.)
  for (size_t i = 0; i < used_end; i++) {
    if (was_populated[i]) {
      assert(was_pointed_to[i]);
    } else {
      assert(!was_pointed_to[i]);
    }
  }
#endif

  // Metadata charging only follows the published table size
  assert(usage_.load() == 0 ||
         usage_.load() == GetTableSize() * sizeof(HandleImpl));
  assert(occupancy_ == 0);
}

size_t AutoHyperClockTable::GetTableSize() const {
  return LengthInfoToUsedLength(length_info_.load(std::memory_order_acquire));
}

size_t AutoHyperClockTable::GetOccupancyLimit() const {
  return occupancy_limit_.load(std::memory_order_acquire);
}

void AutoHyperClockTable::StartInsert(InsertState& state) {
  state.saved_length_info = length_info_.load(std::memory_order_acquire);
}

// Because we have linked lists, bugs or even hardware errors can make it
// possible to create a cycle, which would lead to infinite loop.
// Furthermore, when we have retry cases in the code, we want to be sure
// these are not (and do not become) spin-wait loops. Given the assumption
// of quality hashing and the infeasibility of consistently recurring
// concurrent modifications to an entry or chain, we can safely bound the
// number of loop iterations in feasible operation, whether following chain
// pointers or retrying with some backtracking. A smaller limit is used for
// stress testing, to detect potential issues such as cycles or spin-waits,
// and a larger limit is used to break cycles should they occur in production.
#define CHECK_TOO_MANY_ITERATIONS(i) \
  {                                  \
    assert(i < 768);                 \
    if (UNLIKELY(i >= 4096)) {       \
      std::terminate();              \
    }                                \
  }

bool AutoHyperClockTable::GrowIfNeeded(size_t new_occupancy,
                                       InsertState& state) {
  // new_occupancy has taken into account other threads that are also trying
  // to insert, so as soon as we see sufficient *published* usable size, we
  // can declare success even if we aren't the one that grows the table.
  // However, there's an awkward state where other threads own growing the
  // table to sufficient usable size, but the udpated size is not yet
  // published. If we wait, then that likely slows the ramp-up cache
  // performance. If we unblock ourselves by ensure we grow by at least one
  // slot, we could technically overshoot required size by number of parallel
  // threads accessing block cache. On balance considering typical cases and
  // the modest consequences of table being slightly too large, the latter
  // seems preferable.
  //
  // So if the published occupancy limit is too small, we unblock ourselves
  // by committing to growing the table by at least one slot. Also note that
  // we might need to grow more than once to actually increase the occupancy
  // limit (due to max load factor < 1.0)

  while (UNLIKELY(new_occupancy >
                  occupancy_limit_.load(std::memory_order_relaxed))) {
    // At this point we commit the thread to growing unless we've reached the
    // limit (returns false).
    if (!Grow(state)) {
      return false;
    }
  }
  // Success (didn't need to grow, or did successfully)
  return true;
}

bool AutoHyperClockTable::Grow(InsertState& state) {
  size_t used_length = LengthInfoToUsedLength(state.saved_length_info);

  // Try to take ownership of a grow slot as the first thread to set its
  // head_next_with_shift to non-zero, specifically a valid empty chain
  // in case that is to be the final value.
  // (We don't need to be super efficient here.)
  size_t grow_home = used_length;
  int old_shift;
  for (;; ++grow_home) {
    if (grow_home >= array_.Count()) {
      // Can't grow any more.
      // (Tested by unit test ClockCacheTest/Limits)
      return false;
    }

    old_shift = FloorLog2(grow_home);
    assert(old_shift >= 1);

    uint64_t empty_head = MakeNextWithShiftEnd(grow_home, old_shift + 1);
    uint64_t expected_zero = HandleImpl::kUnusedMarker;
    bool own = array_[grow_home].head_next_with_shift.compare_exchange_strong(
        expected_zero, empty_head, std::memory_order_acq_rel);
    if (own) {
      assert(array_[grow_home].meta.load(std::memory_order_acquire) == 0);
      break;
    } else {
      // Taken by another thread. Try next slot.
      assert(expected_zero != 0);
    }
  }
#ifdef COERCE_CONTEXT_SWITCH
  // This is useful in reproducing concurrency issues in Grow()
  while (Random::GetTLSInstance()->OneIn(2)) {
    std::this_thread::yield();
  }
#endif
  // Basically, to implement https://en.wikipedia.org/wiki/Linear_hashing
  // entries that belong in a new chain starting at grow_home will be
  // split off from the chain starting at old_home, which is computed here.
  size_t old_home = BottomNBits(grow_home, old_shift);
  assert(old_home + (size_t{1} << old_shift) == grow_home);

  // Wait here to ensure any Grow operations that would directly feed into
  // this one are finished, though the full waiting actually completes in
  // acquiring the rewrite lock for old_home in SplitForGrow.
  size_t old_old_home = BottomNBits(grow_home, old_shift - 1);
  for (;;) {
    uint64_t old_old_head = array_[old_old_home].head_next_with_shift.load(
        std::memory_order_acquire);
    if (GetShiftFromNextWithShift(old_old_head) >= old_shift) {
      if ((old_old_head & HandleImpl::kNextEndFlags) !=
          HandleImpl::kHeadLocked) {
        break;
      }
    }
    // NOTE: one of the few yield-wait loops, which is rare enough in practice
    // for its performance to be insignificant.
    yield_count_.fetch_add(1, std::memory_order_relaxed);
    std::this_thread::yield();
  }

  // Do the dirty work of splitting the chain, including updating heads and
  // chain nexts for new shift amounts.
  SplitForGrow(grow_home, old_home, old_shift);

  // length_info_ can be updated any time after the new shift amount is
  // published to both heads, potentially before the end of SplitForGrow.
  // But we also can't update length_info_ until the previous Grow operation
  // (with grow_home := this grow_home - 1) has published the new shift amount
  // to both of its heads. However, we don't want to artificially wait here
  // on that Grow that is otherwise irrelevant.
  //
  // We could have each Grow operation advance length_info_ here as far as it
  // can without waiting, by checking for updated shift on the corresponding
  // old home and also stopping at an empty head value for possible grow_home.
  // However, this could increase CPU cache line sharing and in 1/64 cases
  // bring in an extra page from our mmap.
  //
  // Instead, part of the strategy is delegated to DoInsert():
  // * Here we try to bring length_info_ up to date with this grow_home as
  // much as we can without waiting. It will fall short if a previous Grow
  // is still between reserving the grow slot and making the first big step
  // to publish the new shift amount.
  // * To avoid length_info_ being perpetually out-of-date (for a small number
  // of heads) after our last Grow, we do the same when Insert has to "fall
  // forward" due to length_info_ being out-of-date.
  CatchUpLengthInfoNoWait(grow_home);

  // See usage in DoInsert()
  state.likely_empty_slot = grow_home;

  // Success
  return true;
}

// See call in Grow()
void AutoHyperClockTable::CatchUpLengthInfoNoWait(
    size_t known_usable_grow_home) {
  uint64_t current_length_info = length_info_.load(std::memory_order_acquire);
  size_t published_usable_size = LengthInfoToUsedLength(current_length_info);
  while (published_usable_size <= known_usable_grow_home) {
    // For when published_usable_size was grow_home
    size_t next_usable_size = published_usable_size + 1;
    uint64_t next_length_info = UsedLengthToLengthInfo(next_usable_size);

    // known_usable_grow_home is known to be ready for Lookup/Insert with
    // the new shift amount, but between that and published usable size, we
    // need to check.
    if (published_usable_size < known_usable_grow_home) {
      int old_shift = FloorLog2(next_usable_size - 1);
      size_t old_home = BottomNBits(published_usable_size, old_shift);
      int shift =
          GetShiftFromNextWithShift(array_[old_home].head_next_with_shift.load(
              std::memory_order_acquire));
      if (shift <= old_shift) {
        // Not ready
        break;
      }
    }
    // CAS update length_info_. This only moves in one direction, so if CAS
    // fails, someone else made progress like we are trying, and we can just
    // pick up the new value and keep going as appropriate.
    if (length_info_.compare_exchange_strong(
            current_length_info, next_length_info, std::memory_order_acq_rel)) {
      current_length_info = next_length_info;
      // Update usage_ if metadata charge policy calls for it
      if (metadata_charge_policy_ ==
          CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
        // NOTE: ignoring page boundaries for simplicity
        usage_.fetch_add(sizeof(HandleImpl), std::memory_order_relaxed);
      }
    }
    published_usable_size = LengthInfoToUsedLength(current_length_info);
  }

  // After updating lengh_info_ we can update occupancy_limit_,
  // allowing for later operations to update it before us.
  // Note: there is no std::atomic max operation, so we have to use a CAS loop
  size_t old_occupancy_limit = occupancy_limit_.load(std::memory_order_acquire);
  size_t new_occupancy_limit = CalcOccupancyLimit(published_usable_size);
  while (old_occupancy_limit < new_occupancy_limit) {
    if (occupancy_limit_.compare_exchange_weak(old_occupancy_limit,
                                               new_occupancy_limit,
                                               std::memory_order_acq_rel)) {
      break;
    }
  }
}

void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
                                       int old_shift) {
  int new_shift = old_shift + 1;
  HandleImpl* const arr = array_.Get();

  // We implement a somewhat complicated splitting algorithm to ensure that
  // entries are always wait-free visible to Lookup, without Lookup needing
  // to double-check length_info_ to ensure every potentially relevant
  // existing entry is seen. This works step-by-step, carefully sharing
  // unmigrated parts of the chain between the source chain and the new
  // destination chain. This means that Lookup might see a partially migrated
  // chain so has to take that into consideration when checking that it hasn't
  // "jumped off" its intended chain (due to a parallel modification to an
  // "under (de)construction" entry that was found on the chain but has
  // been reassigned).
  //
  // We use a "rewrite lock" on the source and desination chains to exclude
  // removals from those, and we have a prior waiting step that ensures any Grow
  // operations feeding into this one have completed. But this process does have
  // to gracefully handle concurrent insertions to the head of the source chain,
  // and once marked ready, the destination chain.
  //
  // With those considerations, the migration starts with one "big step,"
  // potentially with retries to deal with insertions in parallel. Part of the
  // big step is to mark the two chain heads as updated with the new shift
  // amount, which redirects Lookups to the appropriate new chain.
  //
  // After that big step that updates the heads, the rewrite lock makes it
  // relatively easy to deal with the rest of the migration. Big
  // simplifications come from being able to read the hashed_key of each
  // entry on the chain without needing to hold a read reference, and
  // from never "jumping our to another chain." Concurrent insertions only
  // happen at the chain head, which is outside of what is left to migrate.
  //
  // A series of smaller steps finishes splitting apart the existing chain into
  // two distinct chains, followed by some steps to fully commit the result.
  //
  // Except for trivial cases in which all entries (or remaining entries)
  // on the input chain go to one output chain, there is an important invariant
  // after each step of migration, including after the initial "big step":
  // For each output chain, the "zero chain" (new hash bit is zero) and the
  // "one chain" (new hash bit is one) we have a "frontier" entry marking the
  // boundary between what has been migrated and what has not. One of the
  // frontiers is along the old chain after the other, and all entries between
  // them are for the same target chain as the earlier frontier. Thus, the
  // chains share linked list tails starting at the latter frontier. All
  // pointers from the new head locations to the frontier entries are marked
  // with the new shift amount, while all pointers after the frontiers use the
  // old shift amount.
  //
  // And after each step there is a strengthening step to reach a stronger
  // invariant: the frontier earlier in the original chain is advanced to be
  // immediately before the other frontier.
  //
  // Consider this original input chain,
  //
  // OldHome  -Old-> A0 -Old-> B0 -Old-> A1 -Old-> C0 -Old-> OldHome(End)
  // GrowHome (empty)
  //
  // == BIG STEP ==
  // The initial big step finds the first entry that will be on the each
  // output chain (in this case A0 and A1). We use brackets ([]) to mark them
  // as our prospective frontiers.
  //
  // OldHome  -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome (empty)
  //
  // Next we speculatively update grow_home head to point to the first entry for
  // the one chain. This will not be used by Lookup until the head at old_home
  // uses the new shift amount.
  //
  // OldHome  -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome --------------New------------/
  //
  // Observe that if Lookup were to use the new head at GrowHome, it would be
  // able to find all relevant entries. Finishing the initial big step
  // requires a CAS (compare_exchange) of the OldHome head because there
  // might have been parallel insertions there, in which case we roll back
  // and try again. (We might need to point GrowHome head differently.)
  //
  // OldHome  -New-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome --------------New------------/
  //
  // Upgrading the OldHome head pointer with the new shift amount, with a
  // compare_exchange, completes the initial big step, with [A0] as zero
  // chain frontier and [A1] as one chain frontier. Links before the frontiers
  // use the new shift amount and links after use the old shift amount.
  // == END BIG STEP==
  // == STRENGTHENING ==
  // Zero chain frontier is advanced to [B0] (immediately before other
  // frontier) by updating pointers with new shift amounts.
  //
  // OldHome  -New-> A0 -New-> [B0] -Old-> [A1] -Old-> C0 -Old-> OldHome(End)
  // GrowHome -------------New-----------/
  //
  // == END STRENGTHENING ==
  // == SMALL STEP #1 ==
  // From the strong invariant state, we need to find the next entry for
  // the new chain with the earlier frontier. In this case, we need to find
  // the next entry for the zero chain that comes after [B0], which in this
  // case is C0. This will be our next zero chain frontier, at least under
  // the weak invariant. To get there, we simply update the link between
  // the current two frontiers to skip over the entries irreleveant to the
  // ealier frontier chain. In this case, the zero chain skips over A1. As a
  // result, he other chain is now the "earlier."
  //
  // OldHome  -New-> A0 -New-> B0 -New-> [C0] -Old-> OldHome(End)
  // GrowHome -New-> [A1] ------Old-----/
  //
  // == END SMALL STEP #1 ==
  //
  // Repeating the cycle and end handling is not as interesting.

  // Acquire rewrite lock on zero chain (if it's non-empty)
  ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_);
  // Create an RAII wrapper for one chain rewrite lock, for once it becomes
  // non-empty. This head is unused by Lookup and DoInsert until the zero
  // head is updated with new shift amount.
  ChainRewriteLock one_head_lock(&arr[grow_home], yield_count_,
                                 /*already_locked_or_end=*/true);
  assert(one_head_lock.IsEnd());

  // old_home will also the head of the new "zero chain" -- all entries in the
  // "from" chain whose next hash bit is 0. grow_home will be head of the new
  // "one chain".

  // For these, SIZE_MAX is like nullptr (unknown)
  size_t zero_chain_frontier = SIZE_MAX;
  size_t one_chain_frontier = SIZE_MAX;
  size_t cur = SIZE_MAX;

  // Set to 0 (zero chain frontier earlier), 1 (one chain), or -1 (unknown)
  int chain_frontier_first = -1;

  // Might need to retry initial update of heads
  for (int i = 0;; ++i) {
    CHECK_TOO_MANY_ITERATIONS(i);
    assert(zero_chain_frontier == SIZE_MAX);
    assert(one_chain_frontier == SIZE_MAX);
    assert(cur == SIZE_MAX);
    assert(chain_frontier_first == -1);

    uint64_t next_with_shift = zero_head_lock.GetNewHead();

    // Find a single representative for each target chain, or scan the whole
    // chain if some target chain has no representative.
    for (;; ++i) {
      CHECK_TOO_MANY_ITERATIONS(i);

      // Loop invariants
      assert((chain_frontier_first < 0) == (zero_chain_frontier == SIZE_MAX &&
                                            one_chain_frontier == SIZE_MAX));
      assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX &&
                                   one_chain_frontier == SIZE_MAX));

      assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);

      // Check for end of original chain
      if (HandleImpl::IsEnd(next_with_shift)) {
        cur = SIZE_MAX;
        break;
      }

      // next_with_shift is not End
      cur = GetNextFromNextWithShift(next_with_shift);

      if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
        // Entry for zero chain
        if (zero_chain_frontier == SIZE_MAX) {
          zero_chain_frontier = cur;
          if (one_chain_frontier != SIZE_MAX) {
            // Ready to update heads
            break;
          }
          // Nothing yet for one chain
          chain_frontier_first = 0;
        }
      } else {
        assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home);
        // Entry for one chain
        if (one_chain_frontier == SIZE_MAX) {
          one_chain_frontier = cur;
          if (zero_chain_frontier != SIZE_MAX) {
            // Ready to update heads
            break;
          }
          // Nothing yet for zero chain
          chain_frontier_first = 1;
        }
      }

      next_with_shift =
          arr[cur].chain_next_with_shift.load(std::memory_order_acquire);
    }

    // Try to update heads for initial migration info
    // We only reached the end of the migrate-from chain already if one of the
    // target chains will be empty.
    assert((cur == SIZE_MAX) ==
           (zero_chain_frontier == SIZE_MAX || one_chain_frontier == SIZE_MAX));
    assert((chain_frontier_first < 0) ==
           (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX));

    // Always update one chain's head first (safe).
    one_head_lock.SimpleUpdate(
        one_chain_frontier != SIZE_MAX
            ? MakeNextWithShift(one_chain_frontier, new_shift)
            : MakeNextWithShiftEnd(grow_home, new_shift));

    // Make sure length_info_ hasn't been updated too early, as we're about
    // to make the change that makes it safe to update (e.g. in DoInsert())
    assert(LengthInfoToUsedLength(
               length_info_.load(std::memory_order_acquire)) <= grow_home);

    // Try to set zero's head.
    if (zero_head_lock.CasUpdate(
            zero_chain_frontier != SIZE_MAX
                ? MakeNextWithShift(zero_chain_frontier, new_shift)
                : MakeNextWithShiftEnd(old_home, new_shift),
            yield_count_)) {
      // Both heads successfully updated to new shift
      break;
    } else {
      // Concurrent insertion. This should not happen too many times.
      CHECK_TOO_MANY_ITERATIONS(i);
      // The easiest solution is to restart.
      zero_chain_frontier = SIZE_MAX;
      one_chain_frontier = SIZE_MAX;
      cur = SIZE_MAX;
      chain_frontier_first = -1;
      continue;
    }
  }

  // Except for trivial cases, we have something like
  // AHome -New-> [A0] -Old-> [B0] -Old-> [C0] \                        |
  // BHome --------------------New------------> [A1] -Old-> ...
  // And we need to upgrade as much as we can on the "first" chain
  // (the one eventually pointing to the other's frontier). This will
  // also finish off any case in which one of the target chains will be empty.
  if (chain_frontier_first >= 0) {
    size_t& first_frontier = chain_frontier_first == 0
                                 ? /*&*/ zero_chain_frontier
                                 : /*&*/ one_chain_frontier;
    size_t& other_frontier = chain_frontier_first != 0
                                 ? /*&*/ zero_chain_frontier
                                 : /*&*/ one_chain_frontier;
    uint64_t stop_before_or_new_tail =
        other_frontier != SIZE_MAX
            ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift)
            : /*new tail*/ MakeNextWithShiftEnd(
                  chain_frontier_first == 0 ? old_home : grow_home, new_shift);
    UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail,
                         old_shift, new_shift);
  }

  if (zero_chain_frontier == SIZE_MAX) {
    // Already finished migrating
    assert(one_chain_frontier == SIZE_MAX);
    assert(cur == SIZE_MAX);
  } else {
    // Still need to migrate between two target chains
    for (int i = 0;; ++i) {
      CHECK_TOO_MANY_ITERATIONS(i);
      // Overall loop invariants
      assert(zero_chain_frontier != SIZE_MAX);
      assert(one_chain_frontier != SIZE_MAX);
      assert(cur != SIZE_MAX);
      assert(chain_frontier_first >= 0);
      size_t& first_frontier = chain_frontier_first == 0
                                   ? /*&*/ zero_chain_frontier
                                   : /*&*/ one_chain_frontier;
      size_t& other_frontier = chain_frontier_first != 0
                                   ? /*&*/ zero_chain_frontier
                                   : /*&*/ one_chain_frontier;
      assert(cur != first_frontier);
      assert(GetNextFromNextWithShift(
                 arr[first_frontier].chain_next_with_shift.load(
                     std::memory_order_acquire)) == other_frontier);

      uint64_t next_with_shift =
          arr[cur].chain_next_with_shift.load(std::memory_order_acquire);

      // Check for end of original chain
      if (HandleImpl::IsEnd(next_with_shift)) {
        // Can set upgraded tail on first chain
        uint64_t first_new_tail = MakeNextWithShiftEnd(
            chain_frontier_first == 0 ? old_home : grow_home, new_shift);
        arr[first_frontier].chain_next_with_shift.store(
            first_new_tail, std::memory_order_release);
        // And upgrade remainder of other chain
        uint64_t other_new_tail = MakeNextWithShiftEnd(
            chain_frontier_first != 0 ? old_home : grow_home, new_shift);
        UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift,
                             new_shift);
        assert(other_frontier == SIZE_MAX);  // Finished
        break;
      }

      // next_with_shift is not End
      cur = GetNextFromNextWithShift(next_with_shift);

      int target_chain;
      if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
        // Entry for zero chain
        target_chain = 0;
      } else {
        assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home);
        // Entry for one chain
        target_chain = 1;
      }
      if (target_chain == chain_frontier_first) {
        // Found next entry to skip to on the first chain
        uint64_t skip_to = MakeNextWithShift(cur, new_shift);
        arr[first_frontier].chain_next_with_shift.store(
            skip_to, std::memory_order_release);
        first_frontier = cur;
        // Upgrade other chain up to entry before that one
        UpgradeShiftsOnRange(arr, other_frontier, next_with_shift, old_shift,
                             new_shift);
        // Swap which is marked as first
        chain_frontier_first = 1 - chain_frontier_first;
      } else {
        // Nothing to do yet, as we need to keep old generation pointers in
        // place for lookups
      }
    }
  }
}

// Variant of PurgeImplLocked: Removes all "under (de) construction" entries
// from a chain where already holding a rewrite lock
using PurgeLockedOpData = void;
// Variant of PurgeImplLocked: Clock-updates all entries in a chain, in
// addition to functionality of PurgeLocked, where already holding a rewrite
// lock. (Caller finalizes eviction on entries added to the autovector, in part
// so that we don't hold the rewrite lock while doing potentially expensive
// callback and allocator free.)
using ClockUpdateChainLockedOpData =
    autovector<AutoHyperClockTable::HandleImpl*>;

template <class OpData>
void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
                                          ChainRewriteLock& rewrite_lock,
                                          size_t home) {
  constexpr bool kIsPurge = std::is_same_v<OpData, PurgeLockedOpData>;
  constexpr bool kIsClockUpdateChain =
      std::is_same_v<OpData, ClockUpdateChainLockedOpData>;

  // Exactly one op specified
  static_assert(kIsPurge + kIsClockUpdateChain == 1);

  HandleImpl* const arr = array_.Get();

  uint64_t next_with_shift = rewrite_lock.GetNewHead();
  assert(!HandleImpl::IsEnd(next_with_shift));
  int home_shift = GetShiftFromNextWithShift(next_with_shift);
  (void)home;
  (void)home_shift;
  size_t next = GetNextFromNextWithShift(next_with_shift);
  assert(next < array_.Count());
  HandleImpl* h = &arr[next];
  HandleImpl* prev_to_keep = nullptr;
#ifndef NDEBUG
  uint64_t prev_to_keep_next_with_shift = 0;
#endif
  // Whether there are entries between h and prev_to_keep that should be
  // purged from the chain.
  bool pending_purge = false;

  // Walk the chain, and stitch together any entries that are still
  // "shareable," possibly after clock update. prev_to_keep tells us where
  // the last "stitch back to" location is (nullptr => head).
  for (size_t i = 0;; ++i) {
    CHECK_TOO_MANY_ITERATIONS(i);

    bool purgeable = false;
    // In last iteration, h will be nullptr, to stitch together the tail of
    // the chain.
    if (h) {
      // NOTE: holding a rewrite lock on the chain prevents any "under
      // (de)construction" entries in the chain from being marked empty, which
      // allows us to access the hashed_keys without holding a read ref.
      assert(home == BottomNBits(h->hashed_key[1], home_shift));
      if constexpr (kIsClockUpdateChain) {
        // Clock update and/or check for purgeable (under (de)construction)
        if (ClockUpdate(*h, &purgeable)) {
          // Remember for finishing eviction
          op_data->push_back(h);
          // Entries for eviction become purgeable
          purgeable = true;
          assert((h->meta.load(std::memory_order_acquire) >>
                  ClockHandle::kStateShift) == ClockHandle::kStateConstruction);
        }
      } else {
        (void)op_data;
        purgeable = ((h->meta.load(std::memory_order_acquire) >>
                      ClockHandle::kStateShift) &
                     ClockHandle::kStateShareableBit) == 0;
      }
    }

    if (purgeable) {
      assert((h->meta.load(std::memory_order_acquire) >>
              ClockHandle::kStateShift) == ClockHandle::kStateConstruction);
      pending_purge = true;
    } else if (pending_purge) {
      if (prev_to_keep) {
        // Update chain next to skip purgeable entries
        assert(prev_to_keep->chain_next_with_shift.load(
                   std::memory_order_acquire) == prev_to_keep_next_with_shift);
        prev_to_keep->chain_next_with_shift.store(next_with_shift,
                                                  std::memory_order_release);
      } else if (rewrite_lock.CasUpdate(next_with_shift, yield_count_)) {
        // Managed to update head without any parallel insertions
      } else {
        // Parallel insertion must have interfered. Need to do a purge
        // from updated head to here. Since we have no prev_to_keep, there's
        // no risk of duplicate clock updates to entries. Any entries already
        // updated must have been evicted (purgeable) and it's OK to clock
        // update any new entries just inserted in parallel.
        // Can simply restart (GetNewHead() already updated from CAS failure).
        next_with_shift = rewrite_lock.GetNewHead();
        assert(!HandleImpl::IsEnd(next_with_shift));
        next = GetNextFromNextWithShift(next_with_shift);
        assert(next < array_.Count());
        h = &arr[next];
        pending_purge = false;
        assert(prev_to_keep == nullptr);
        assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);
        continue;
      }
      pending_purge = false;
      prev_to_keep = h;
    } else {
      prev_to_keep = h;
    }

    if (h == nullptr) {
      // Reached end of the chain
      return;
    }

    // Read chain pointer
    next_with_shift = h->chain_next_with_shift.load(std::memory_order_acquire);
#ifndef NDEBUG
    if (prev_to_keep == h) {
      prev_to_keep_next_with_shift = next_with_shift;
    }
#endif

    assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);

    // Check for end marker
    if (HandleImpl::IsEnd(next_with_shift)) {
      h = nullptr;
    } else {
      next = GetNextFromNextWithShift(next_with_shift);
      assert(next < array_.Count());
      h = &arr[next];
      assert(h != prev_to_keep);
    }
  }
}

// Variant of PurgeImpl: Removes all "under (de) construction" entries in a
// chain, such that any entry with the given key must have been purged.
using PurgeOpData = const UniqueId64x2;
// Variant of PurgeImpl: Clock-updates all entries in a chain, in addition to
// purging as appropriate. (Caller finalizes eviction on entries added to the
// autovector, in part so that we don't hold the rewrite lock while doing
// potentially expensive callback and allocator free.)
using ClockUpdateChainOpData = ClockUpdateChainLockedOpData;

template <class OpData>
void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) {
  // Early efforts to make AutoHCC fully wait-free ran into too many problems
  // that needed obscure and potentially inefficient work-arounds to have a
  // chance at working.
  //
  // The implementation settled on "essentially wait-free" which can be
  // achieved by locking at the level of each probing chain and only for
  // operations that might remove entries from the chain. Because parallel
  // clock updates and Grow operations are ordered, contention is very rare.
  // However, parallel insertions at any chain head have to be accommodated
  // to keep them wait-free.
  //
  // This function implements Purge and ClockUpdateChain functions (see above
  // OpData type definitions) as part of higher-level operations. This function
  // ensures the correct chain is (eventually) covered and handles rewrite
  // locking the chain. PurgeImplLocked has lower level details.
  //
  // In general, these operations and Grow are kept simpler by allowing eager
  // purging of under (de-)construction entries. For example, an Erase
  // operation might find that another thread has purged the entry from the
  // chain by the time its own purge operation acquires the rewrite lock and
  // proceeds. This is OK, and potentially reduces the number of lock/unlock
  // cycles because empty chains are not rewrite-lockable.

  constexpr bool kIsPurge = std::is_same_v<OpData, PurgeOpData>;
  constexpr bool kIsClockUpdateChain =
      std::is_same_v<OpData, ClockUpdateChainOpData>;

  // Exactly one op specified
  static_assert(kIsPurge + kIsClockUpdateChain == 1);

  int home_shift = 0;
  if constexpr (kIsPurge) {
    // Purge callers leave home unspecified, to be determined from key
    assert(home == SIZE_MAX);
    GetHomeIndexAndShift(length_info_.load(std::memory_order_acquire),
                         (*op_data)[1], &home, &home_shift);
    assert(home_shift > 0);
  } else {
    // Evict callers must specify home
    assert(home < SIZE_MAX);
  }

  HandleImpl* const arr = array_.Get();

  // Acquire the RAII rewrite lock (if not an empty chain)
  ChainRewriteLock rewrite_lock(&arr[home], yield_count_);

  int shift;
  for (;;) {
    shift = GetShiftFromNextWithShift(rewrite_lock.GetNewHead());

    if constexpr (kIsPurge) {
      if (shift > home_shift) {
        // At head. Thus, we know the newer shift applies to us.
        // Newer shift might not yet be reflected in length_info_ (an atomicity
        // gap in Grow), so operate as if it is. Note that other insertions
        // could happen using this shift before length_info_ is updated, and
        // it's possible (though unlikely) that multiple generations of Grow
        // have occurred. If shift is more than one generation ahead of
        // home_shift, it's possible that not all descendent homes have
        // reached the `shift` generation. Thus, we need to advance only one
        // shift at a time looking for a home+head with a matching shift
        // amount.
        home_shift++;
        home = GetHomeIndex((*op_data)[1], home_shift);
        rewrite_lock.Reset(&arr[home], yield_count_);
        continue;
      } else {
        assert(shift == home_shift);
      }
    } else {
      assert(home_shift == 0);
      home_shift = shift;
    }
    break;
  }

  // If the chain is empty, nothing to do
  if (!rewrite_lock.IsEnd()) {
    if constexpr (kIsPurge) {
      PurgeLockedOpData* locked_op_data{};
      PurgeImplLocked(locked_op_data, rewrite_lock, home);
    } else {
      PurgeImplLocked(op_data, rewrite_lock, home);
    }
  }
}

AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
    const ClockHandleBasicData& proto, uint64_t initial_countdown,
    bool take_ref, InsertState& state) {
  size_t home;
  int orig_home_shift;
  GetHomeIndexAndShift(state.saved_length_info, proto.hashed_key[1], &home,
                       &orig_home_shift);
  HandleImpl* const arr = array_.Get();

  // We could go searching through the chain for any duplicate, but that's
  // not typically helpful, except for the REDUNDANT block cache stats.
  // (Inferior duplicates will age out with eviction.) However, we do skip
  // insertion if the home slot (or some other we happen to probe) already
  // has a match (already_matches below). This helps to keep better locality
  // when we can.
  //
  // And we can do that as part of searching for an available slot to
  // insert the new entry, because our preferred location and first slot
  // checked will be the home slot.
  //
  // As the table initially grows to size, few entries will be in the same
  // cache line as the chain head. However, churn in the cache relatively
  // quickly improves the proportion of entries sharing that cache line with
  // the chain head. Data:
  //
  // Initial population only: (cache_bench with -ops_per_thread=1)
  // Entries at home count: 29,202 (out of 129,170 entries in 94,411 chains)
  // Approximate average cache lines read to find an existing entry:
  //           129.2 / 94.4 [without the heads]
  // + (94.4 - 29.2) / 94.4 [the heads not included with entries]
  // = 2.06 cache lines
  //
  // After 10 million ops: (-threads=10 -ops_per_thread=100000)
  // Entries at home count: 67,556 (out of 129,359 entries in 94,756 chains)
  // That's a majority of entries and more than 2/3rds of chains.
  // Approximate average cache lines read to find an existing entry:
  // = 1.65 cache lines

  size_t used_length = LengthInfoToUsedLength(state.saved_length_info);
  assert(home < used_length);

  size_t idx = home;
  bool already_matches = false;
  bool already_matches_ignore = false;
  if (TryInsert(proto, arr[idx], initial_countdown, take_ref,
                &already_matches)) {
    assert(idx == home);
  } else if (already_matches) {
    return nullptr;
    // Here we try to populate newly-opened slots in the table, but not
    // when we can add something to its home slot. This makes the structure
    // more performant more quickly on (initial) growth. We ignore "already
    // matches" in this case because it is unlikely and difficult to
    // incorporate logic for here cleanly and efficiently.
  } else if (UNLIKELY(state.likely_empty_slot > 0) &&
             TryInsert(proto, arr[state.likely_empty_slot], initial_countdown,
                       take_ref, &already_matches_ignore)) {
    idx = state.likely_empty_slot;
  } else {
    // We need to search for an available slot outside of the home.
    // Linear hashing provides nice resizing but does typically mean
    // that some heads (home locations) have (in expectation) twice as
    // many entries mapped to them as other heads. For example if the
    // usable length is 80, then heads 16-63 are (in expectation) twice
    // as loaded as heads 0-15 and 64-79, which are using another hash bit.
    //
    // This means that if we just use linear probing (by a small constant)
    // to find an available slot, part of the structure could easily fill up
    // and resort to linear time operations even when the overall load factor
    // is only modestly high, like 70%. Even though each slot has its own CPU
    // cache line, there appears to be a small locality benefit (e.g. TLB and
    // paging) to iterating one by one, as long as we don't afoul of the
    // linear hashing imbalance.
    //
    // In a traditional non-concurrent structure, we could keep a "free list"
    // to ensure immediate access to an available slot, but maintaining such
    // a structure could require more cross-thread coordination to ensure
    // all entries are eventually available to all threads.
    //
    // The way we solve this problem is to use unit-increment linear probing
    // with a small bound, and then fall back on big jumps to have a good
    // chance of finding a slot in an under-populated region quickly if that
    // doesn't work.
    size_t i = 0;
    constexpr size_t kMaxLinearProbe = 4;
    for (; i < kMaxLinearProbe; i++) {
      idx++;
      if (idx >= used_length) {
        idx -= used_length;
      }
      if (TryInsert(proto, arr[idx], initial_countdown, take_ref,
                    &already_matches)) {
        break;
      }
      if (already_matches) {
        return nullptr;
      }
    }
    if (i == kMaxLinearProbe) {
      // Keep searching, but change to a search method that should quickly
      // find any under-populated region. Switching to an increment based
      // on the golden ratio helps with that, but we also inject some minor
      // variation (less than 2%, 1 in 2^6) to avoid clustering effects on
      // this larger increment (if it were a fixed value in steady state
      // operation). Here we are primarily using upper bits of hashed_key[1]
      // while home is based on lowest bits.
      uint64_t incr_ratio = 0x9E3779B185EBCA87U + (proto.hashed_key[1] >> 6);
      size_t incr = FastRange64(incr_ratio, used_length);
      assert(incr > 0);
      size_t start = idx;
      for (;; i++) {
        idx += incr;
        if (idx >= used_length) {
          // Wrap around (faster than %)
          idx -= used_length;
        }
        if (idx == start) {
          // We have just completed a cycle that might not have covered all
          // slots. (incr and used_length could have common factors.)
          // Increment for the next cycle, which eventually ensures complete
          // iteration over the set of slots before repeating.
          idx++;
          if (idx >= used_length) {
            idx -= used_length;
          }
          start++;
          if (start >= used_length) {
            start -= used_length;
          }
          if (i >= used_length) {
            used_length = LengthInfoToUsedLength(
                length_info_.load(std::memory_order_acquire));
            if (i >= used_length * 2) {
              // Cycling back should not happen unless there is enough random
              // churn in parallel that we happen to hit each slot at a time
              // that it's occupied, which is really only feasible for small
              // structures, though with linear probing to find empty slots,
              // "small" here might be larger than for double hashing.
              assert(used_length <= 256);
              // Fall back on standalone insert in case something goes awry to
              // cause this
              return nullptr;
            }
          }
        }
        if (TryInsert(proto, arr[idx], initial_countdown, take_ref,
                      &already_matches)) {
          break;
        }
        if (already_matches) {
          return nullptr;
        }
      }
    }
  }

  // Now insert into chain using head pointer
  uint64_t next_with_shift;
  int home_shift = orig_home_shift;

  // Might need to retry
  for (int i = 0;; ++i) {
    CHECK_TOO_MANY_ITERATIONS(i);
    next_with_shift =
        arr[home].head_next_with_shift.load(std::memory_order_acquire);
    int shift = GetShiftFromNextWithShift(next_with_shift);

    if (UNLIKELY(shift != home_shift)) {
      // NOTE: shift increases with table growth
      if (shift > home_shift) {
        // Must be grow in progress or completed since reading length_info.
        // Pull out one more hash bit. (See Lookup() for why we can't
        // safely jump to the shift that was read.)
        home_shift++;
        uint64_t hash_bit_mask = uint64_t{1} << (home_shift - 1);
        assert((home & hash_bit_mask) == 0);
        // BEGIN leftover updates to length_info_ for Grow()
        size_t grow_home = home + hash_bit_mask;
        assert(arr[grow_home].head_next_with_shift.load(
                   std::memory_order_acquire) != HandleImpl::kUnusedMarker);
        CatchUpLengthInfoNoWait(grow_home);
        // END leftover updates to length_info_ for Grow()
        home += proto.hashed_key[1] & hash_bit_mask;
        continue;
      } else {
        // Should not happen because length_info_ is only updated after both
        // old and new home heads are marked with new shift
        assert(false);
      }
    }

    // Values to update to
    uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift);
    uint64_t chain_next_with_shift = next_with_shift;

    // Preserve the locked state in head, without propagating to chain next
    // where it is meaningless (and not allowed)
    if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) ==
                 HandleImpl::kHeadLocked)) {
      head_next_with_shift |= HandleImpl::kHeadLocked;
      chain_next_with_shift &= ~HandleImpl::kHeadLocked;
    }

    arr[idx].chain_next_with_shift.store(chain_next_with_shift,
                                         std::memory_order_release);
    if (arr[home].head_next_with_shift.compare_exchange_weak(
            next_with_shift, head_next_with_shift, std::memory_order_acq_rel)) {
      // Success
      return arr + idx;
    }
  }
}

AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
    const UniqueId64x2& hashed_key) {
  // Lookups are wait-free with low occurrence of retries, back-tracking,
  // and fallback. We do not have the benefit of holding a rewrite lock on
  // the chain so must be prepared for many kinds of mayhem, most notably
  // "falling off our chain" where a slot that Lookup has identified but
  // has not read-referenced is removed from one chain and inserted into
  // another. The full algorithm uses the following mitigation strategies to
  // ensure every relevant entry inserted before this Lookup, and not yet
  // evicted, is seen by Lookup, without excessive backtracking etc.:
  // * Keep a known good read ref in the chain for "island hopping." When
  // we observe that a concurrent write takes us off to another chain, we
  // only need to fall back to our last known good read ref (most recent
  // entry on the chain that is not "under construction," which is a transient
  // state). We don't want to compound the CPU toil of a long chain with
  // operations that might need to retry from scratch, with probability
  // in proportion to chain length.
  // * Only detect a chain is potentially incomplete because of a Grow in
  // progress by looking at shift in the next pointer tags (rather than
  // re-checking length_info_).
  // * SplitForGrow, Insert, and PurgeImplLocked ensure that there are no
  // transient states that might cause this full Lookup algorithm to skip over
  // live entries.

  // Reading length_info_ is not strictly required for Lookup, if we were
  // to increment shift sizes until we see a shift size match on the
  // relevant head pointer. Thus, reading with relaxed memory order gives
  // us a safe and almost always up-to-date jump into finding the correct
  // home and head.
  size_t home;
  int home_shift;
  GetHomeIndexAndShift(length_info_.load(std::memory_order_relaxed),
                       hashed_key[1], &home, &home_shift);
  assert(home_shift > 0);

  // The full Lookup algorithm however is not great for hot path efficiency,
  // because of the extra careful tracking described above. Overwhelmingly,
  // we can find what we're looking for with a naive linked list traversal
  // of the chain. Even if we "fall off our chain" to another, we don't
  // violate memory safety. We just won't match the key we're looking for.
  // And we would eventually reach an end state, possibly even experiencing a
  // cycle as an entry is freed and reused during our traversal (though at
  // any point in time the structure doesn't have cycles).
  //
  // So for hot path efficiency, we start with a naive Lookup attempt, and
  // then fall back on full Lookup if we don't find the correct entry. To
  // cap how much we invest into the naive Lookup, we simply cap the traversal
  // length before falling back. Also, when we do fall back on full Lookup,
  // we aren't paying much penalty by starting over. Much or most of the cost
  // of Lookup is memory latency in following the chain pointers, and the
  // naive Lookup has warmed the CPU cache for these entries, using as tight
  // of a loop as possible.

  HandleImpl* const arr = array_.Get();
  uint64_t next_with_shift = arr[home].head_next_with_shift;
  for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) {
    HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)];
    // Attempt cheap key match without acquiring a read ref. This could give a
    // false positive, which is re-checked after acquiring read ref, or false
    // negative, which is re-checked in the full Lookup. Also, this is a
    // technical UB data race according to TSAN, but we don't need to read
    // a "correct" value here for correct overall behavior.
#ifdef __SANITIZE_THREAD__
    bool probably_equal = Random::GetTLSInstance()->OneIn(2);
#else
    bool probably_equal = h->hashed_key == hashed_key;
#endif
    if (probably_equal) {
      // Increment acquire counter for definitive check
      uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                            std::memory_order_acquire);
      // Check if it's a referencable (sharable) entry
      if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                             << ClockHandle::kStateShift))) {
        assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
        if (LIKELY(h->hashed_key == hashed_key) &&
            LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
                               << ClockHandle::kStateShift))) {
          return h;
        } else {
          Unref(*h);
        }
      } else {
        // For non-sharable states, incrementing the acquire counter has no
        // effect so we don't need to undo it. Furthermore, we cannot safely
        // undo it because we did not acquire a read reference to lock the entry
        // in a Shareable state.
      }
    }

    next_with_shift = h->chain_next_with_shift.load(std::memory_order_relaxed);
  }

  // If we get here, falling back on full Lookup algorithm.
  HandleImpl* h = nullptr;
  HandleImpl* read_ref_on_chain = nullptr;

  for (size_t i = 0;; ++i) {
    CHECK_TOO_MANY_ITERATIONS(i);
    // Read head or chain pointer
    next_with_shift =
        h ? h->chain_next_with_shift : arr[home].head_next_with_shift;
    int shift = GetShiftFromNextWithShift(next_with_shift);

    // Make sure it's usable
    size_t effective_home = home;
    if (UNLIKELY(shift != home_shift)) {
      // We have potentially gone awry somehow, but it's possible we're just
      // hitting old data that is not yet completed Grow.
      // NOTE: shift bits goes up with table growth.
      if (shift < home_shift) {
        // To avoid waiting on Grow in progress, an old shift amount needs
        // to be processed as if we were still using it and (potentially
        // different or the same) the old home.
        // We can assert it's not too old, because each generation of Grow
        // waits on its ancestor in the previous generation.
        assert(shift + 1 == home_shift);
        effective_home = GetHomeIndex(home, shift);
      } else if (h == read_ref_on_chain) {
        assert(shift > home_shift);
        // At head or coming from an entry on our chain where we're holding
        // a read reference. Thus, we know the newer shift applies to us.
        // Newer shift might not yet be reflected in length_info_ (an atomicity
        // gap in Grow), so operate as if it is. Note that other insertions
        // could happen using this shift before length_info_ is updated, and
        // it's possible (though unlikely) that multiple generations of Grow
        // have occurred. If shift is more than one generation ahead of
        // home_shift, it's possible that not all descendent homes have
        // reached the `shift` generation. Thus, we need to advance only one
        // shift at a time looking for a home+head with a matching shift
        // amount.
        home_shift++;
        // Update home in case it has changed
        home = GetHomeIndex(hashed_key[1], home_shift);
        // This should be rare enough occurrence that it's simplest just
        // to restart (TODO: improve in some cases?)
        h = nullptr;
        if (read_ref_on_chain) {
          Unref(*read_ref_on_chain);
          read_ref_on_chain = nullptr;
        }
        // Didn't make progress & retry
        continue;
      } else {
        assert(shift > home_shift);
        assert(h != nullptr);
        // An "under (de)construction" entry has a new shift amount, which
        // means we have either gotten off our chain or our home shift is out
        // of date. If we revert back to saved ref, we will get updated info.
        h = read_ref_on_chain;
        // Didn't make progress & retry
        continue;
      }
    }

    // Check for end marker
    if (HandleImpl::IsEnd(next_with_shift)) {
      // To ensure we didn't miss anything in the chain, the end marker must
      // point back to the correct home.
      if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) {
        // Complete, clean iteration of the chain, not found.
        // Clean up.
        if (read_ref_on_chain) {
          Unref(*read_ref_on_chain);
        }
        return nullptr;
      } else {
        // Something went awry. Revert back to a safe point (if we have it)
        h = read_ref_on_chain;
        // Didn't make progress & retry
        continue;
      }
    }

    // Follow the next and check for full key match, home match, or neither
    h = &arr[GetNextFromNextWithShift(next_with_shift)];
    bool full_match_or_unknown = false;
    if (MatchAndRef(&hashed_key, *h, shift, effective_home,
                    &full_match_or_unknown)) {
      // Got a read ref on next (h).
      //
      // There is a very small chance that between getting the next pointer
      // (now h) and doing MatchAndRef on it, another thread erased/evicted it
      // reinserted it into the same chain, causing us to cycle back in the
      // same chain and potentially see some entries again if we keep walking.
      // Newly-inserted entries are inserted before older ones, so we are at
      // least guaranteed not to miss anything. Here in Lookup, it's just a
      // transient, slight hiccup in performance.

      if (full_match_or_unknown) {
        // Full match.
        // Release old read ref on chain if applicable
        if (read_ref_on_chain) {
          // Pretend we never took the reference.
          Unref(*read_ref_on_chain);
        }
        // Update the hit bit
        if (eviction_callback_) {
          h->meta.fetch_or(uint64_t{1} << ClockHandle::kHitBitShift,
                           std::memory_order_relaxed);
        }
        // All done.
        return h;
      } else if (UNLIKELY(shift != home_shift) &&
                 home != BottomNBits(h->hashed_key[1], home_shift)) {
        // This chain is in a Grow operation and we've landed on an entry
        // that belongs to the wrong destination chain. We can keep going, but
        // there's a chance we'll need to backtrack back *before* this entry,
        // if the Grow finishes before this Lookup. We cannot save this entry
        // for backtracking because it might soon or already be on the wrong
        // chain.
        // NOTE: if we simply backtrack rather than continuing, we would
        // be in a wait loop (not allowed in Lookup!) until the other thread
        // finishes its Grow.
        Unref(*h);
      } else {
        // Correct home location, so we are on the right chain.
        // With new usable read ref, can release old one (if applicable).
        if (read_ref_on_chain) {
          // Pretend we never took the reference.
          Unref(*read_ref_on_chain);
        }
        // And keep the new one.
        read_ref_on_chain = h;
      }
    } else {
      if (full_match_or_unknown) {
        // Must have been an "under construction" entry. Can safely skip it,
        // but there's a chance we'll have to backtrack later
      } else {
        // Home mismatch! Revert back to a safe point (if we have it)
        h = read_ref_on_chain;
        // Didn't make progress & retry
      }
    }
  }
}

void AutoHyperClockTable::Remove(HandleImpl* h) {
  assert((h->meta.load() >> ClockHandle::kStateShift) ==
         ClockHandle::kStateConstruction);

  const HandleImpl& c_h = *h;
  PurgeImpl(&c_h.hashed_key);
}

bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref,
                                         bool mark_invisible) {
  uint64_t meta;
  if (mark_invisible) {
    // Set invisible
    meta = h->meta.fetch_and(
        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift),
        std::memory_order_acq_rel);
    // To local variable also
    meta &=
        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift);
  } else {
    meta = h->meta.load(std::memory_order_acquire);
  }

  // Take ownership if no other refs
  do {
    if (GetRefcount(meta) != uint64_t{holding_ref}) {
      // Not last ref at some point in time during this call
      return false;
    }
    if ((meta & (uint64_t{ClockHandle::kStateShareableBit}
                 << ClockHandle::kStateShift)) == 0) {
      // Someone else took ownership
      return false;
    }
    // Note that if !holding_ref, there's a small chance that we release,
    // another thread replaces this entry with another, reaches zero refs, and
    // then we end up erasing that other entry. That's an acceptable risk /
    // imprecision.
  } while (!h->meta.compare_exchange_weak(
      meta,
      uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
      std::memory_order_acquire));
  // Took ownership
  // TODO? Delay freeing?
  h->FreeData(allocator_);
  size_t total_charge = h->total_charge;
  if (UNLIKELY(h->IsStandalone())) {
    // Delete detached handle
    delete h;
    standalone_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
  } else {
    Remove(h);
    MarkEmpty(*h);
    occupancy_.fetch_sub(1U, std::memory_order_release);
  }
  usage_.fetch_sub(total_charge, std::memory_order_relaxed);
  assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
  return true;
}

bool AutoHyperClockTable::Release(HandleImpl* h, bool useful,
                                  bool erase_if_last_ref) {
  // In contrast with LRUCache's Release, this function won't delete the handle
  // when the cache is above capacity and the reference is the last one. Space
  // is only freed up by Evict/PurgeImpl (called by Insert when space
  // is needed) and Erase. We do this to avoid an extra atomic read of the
  // variable usage_.

  uint64_t old_meta;
  if (useful) {
    // Increment release counter to indicate was used
    old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
                                 std::memory_order_release);
    // Correct for possible (but rare) overflow
    CorrectNearOverflow(old_meta, h->meta);
  } else {
    // Decrement acquire counter to pretend it never happened
    old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                 std::memory_order_release);
  }

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  // No underflow
  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
          ClockHandle::kCounterMask) !=
         ((old_meta >> ClockHandle::kReleaseCounterShift) &
          ClockHandle::kCounterMask));

  if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
                                     ClockHandle::kStateInvisible))) {
    // FIXME: There's a chance here that another thread could replace this
    // entry and we end up erasing the wrong one.
    return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false);
  } else {
    return false;
  }
}

#ifndef NDEBUG
void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
  if (n > 0) {
    // Do n-1 simple releases first
    TEST_ReleaseNMinus1(h, n);

    // Then the last release might be more involved
    Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
  }
}
#endif

void AutoHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
  // Don't need to be efficient.
  // Might be one match masking another, so loop.
  while (HandleImpl* h = Lookup(hashed_key)) {
    bool gone =
        TryEraseHandle(h, /*holding_ref=*/true, /*mark_invisible=*/true);
    if (!gone) {
      // Only marked invisible, which is ok.
      // Pretend we never took the reference from Lookup.
      Unref(*h);
    }
  }
}

void AutoHyperClockTable::EraseUnRefEntries() {
  size_t usable_size = GetTableSize();
  for (size_t i = 0; i < usable_size; i++) {
    HandleImpl& h = array_[i];

    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                    << ClockHandle::kStateShift) &&
        GetRefcount(old_meta) == 0 &&
        h.meta.compare_exchange_strong(old_meta,
                                       uint64_t{ClockHandle::kStateConstruction}
                                           << ClockHandle::kStateShift,
                                       std::memory_order_acquire)) {
      // Took ownership
      h.FreeData(allocator_);
      usage_.fetch_sub(h.total_charge, std::memory_order_relaxed);
      // NOTE: could be more efficient with a dedicated variant of
      // PurgeImpl, but this is not a common operation
      Remove(&h);
      MarkEmpty(h);
      occupancy_.fetch_sub(1U, std::memory_order_release);
    }
  }
}

void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
                                EvictionData* data) {
  // precondition
  assert(requested_charge > 0);

  // We need the clock pointer to seemlessly "wrap around" at the end of the
  // table, and to be reasonably stable under Grow operations. This is
  // challenging when the linear hashing progressively opens additional
  // most-significant-hash-bits in determining home locations.

  // TODO: make a tuning parameter?
  // Up to 2x this number of homes will be evicted per step. In very rare
  // cases, possibly more, as homes of an out-of-date generation will be
  // resolved to multiple in a newer generation.
  constexpr size_t step_size = 4;

  // A clock_pointer_mask_ field separate from length_info_ enables us to use
  // the same mask (way of dividing up the space among evicting threads) for
  // iterating over the whole structure before considering changing the mask
  // at the beginning of each pass. This ensures we do not have a large portion
  // of the space that receives redundant or missed clock updates. However,
  // with two variables, for each update to clock_pointer_mask (< 64 ever in
  // the life of the cache), there will be a brief period where concurrent
  // eviction threads could use the old mask value, possibly causing redundant
  // or missed clock updates for a *small* portion of the table.
  size_t clock_pointer_mask =
      clock_pointer_mask_.load(std::memory_order_relaxed);

  uint64_t max_clock_pointer = 0;  // unset

  // TODO: consider updating during a long eviction
  size_t used_length = LengthInfoToUsedLength(state.saved_length_info);

  autovector<HandleImpl*> to_finish_eviction;

  // Loop until enough freed, or limit reached (see bottom of loop)
  for (;;) {
    // First (concurrent) increment clock pointer
    uint64_t old_clock_pointer =
        clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);

    if (UNLIKELY((old_clock_pointer & clock_pointer_mask) == 0)) {
      // Back at the beginning. See if clock_pointer_mask should be updated.
      uint64_t mask = BottomNBits(
          UINT64_MAX, LengthInfoToMinShift(state.saved_length_info));
      if (clock_pointer_mask != mask) {
        clock_pointer_mask = static_cast<size_t>(mask);
        clock_pointer_mask_.store(clock_pointer_mask,
                                  std::memory_order_relaxed);
      }
    }

    size_t major_step = clock_pointer_mask + 1;
    assert((major_step & clock_pointer_mask) == 0);

    for (size_t base_home = old_clock_pointer & clock_pointer_mask;
         base_home < used_length; base_home += major_step) {
      for (size_t i = 0; i < step_size; i++) {
        size_t home = base_home + i;
        if (home >= used_length) {
          break;
        }
        PurgeImpl(&to_finish_eviction, home);
      }
    }

    for (HandleImpl* h : to_finish_eviction) {
      TrackAndReleaseEvictedEntry(h, data);
      // NOTE: setting likely_empty_slot here can cause us to reduce the
      // portion of "at home" entries, probably because an evicted entry
      // is more likely to come back than a random new entry and would be
      // unable to go into its home slot.
    }
    to_finish_eviction.clear();

    // Loop exit conditions
    if (data->freed_charge >= requested_charge) {
      return;
    }

    if (max_clock_pointer == 0) {
      // Cap the eviction effort at this thread (along with those operating in
      // parallel) circling through the whole structure kMaxCountdown times.
      // In other words, this eviction run must find something/anything that is
      // unreferenced at start of and during the eviction run that isn't
      // reclaimed by a concurrent eviction run.
      // TODO: Does HyperClockCache need kMaxCountdown + 1?
      max_clock_pointer =
          old_clock_pointer +
          (uint64_t{ClockHandle::kMaxCountdown + 1} * major_step);
    }

    if (old_clock_pointer + step_size >= max_clock_pointer) {
      return;
    }
  }
}

size_t AutoHyperClockTable::CalcMaxUsableLength(
    size_t capacity, size_t min_avg_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
  double min_avg_slot_charge = min_avg_value_size * kMaxLoadFactor;
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    min_avg_slot_charge += sizeof(HandleImpl);
  }
  assert(min_avg_slot_charge > 0.0);
  size_t num_slots =
      static_cast<size_t>(capacity / min_avg_slot_charge + 0.999999);

  const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl);

  // Round up to page size
  return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page;
}

namespace {
bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) {
  return !AutoHyperClockTable::HandleImpl::IsEnd(
      h.head_next_with_shift.load(std::memory_order_relaxed));
}
bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift,
                   size_t home) {
  if (MatchAndRef(nullptr, h, shift, home)) {
    Unref(h);
    return true;
  } else {
    return false;
  }
}
}  // namespace

void AutoHyperClockCache::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  BaseHyperClockCache::ReportProblems(info_log);

  if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
    LoadVarianceStats head_stats;
    size_t entry_at_home_count = 0;
    uint64_t yield_count = 0;
    this->ForEachShard([&](const Shard* shard) {
      size_t count = shard->GetTableAddressCount();
      uint64_t length_info = UsedLengthToLengthInfo(count);
      for (size_t i = 0; i < count; ++i) {
        const auto& h = *shard->GetTable().HandlePtr(i);
        head_stats.Add(IsHeadNonempty(h));
        int shift;
        size_t home;
        GetHomeIndexAndShift(length_info, i, &home, &shift);
        assert(home == i);
        entry_at_home_count += IsEntryAtHome(h, shift, home);
      }
      yield_count += shard->GetTable().GetYieldCount();
    });
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Head occupancy stats: %s", head_stats.Report().c_str());
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Entries at home count: %zu", entry_at_home_count);
    ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL,
                       "Yield count: %" PRIu64, yield_count);
  }
}

}  // namespace clock_cache

// DEPRECATED (see public API)
std::shared_ptr<Cache> NewClockCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy) {
  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
                     /* high_pri_pool_ratio */ 0.5, nullptr,
                     kDefaultToAdaptiveMutex, metadata_charge_policy,
                     /* low_pri_pool_ratio */ 0.0);
}

std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
  // For sanitized options
  HyperClockCacheOptions opts = *this;
  if (opts.num_shard_bits >= 20) {
    return nullptr;  // The cache cannot be sharded into too many fine pieces.
  }
  if (opts.num_shard_bits < 0) {
    // Use larger shard size to reduce risk of large entries clustering
    // or skewing individual shards.
    constexpr size_t min_shard_size = 32U * 1024U * 1024U;
    opts.num_shard_bits =
        GetDefaultCacheShardBits(opts.capacity, min_shard_size);
  }
  std::shared_ptr<Cache> cache;
  if (opts.estimated_entry_charge == 0) {
    cache = std::make_shared<clock_cache::AutoHyperClockCache>(opts);
  } else {
    cache = std::make_shared<clock_cache::FixedHyperClockCache>(opts);
  }
  if (opts.secondary_cache) {
    cache = std::make_shared<CacheWithSecondaryAdapter>(cache,
                                                        opts.secondary_cache);
  }
  return cache;
}

}  // namespace ROCKSDB_NAMESPACE