Refactor ShardedCache for more sharing, static polymorphism (#10801)

Summary:
The motivations for this change include
* Free up space in ClockHandle so that we can add data for secondary cache handling while still keeping within a single cache line (64 bytes).
  * This change frees up space by eliminating the need for the `hash` field: the fixed-size key itself becomes the hash, via a 128-bit bijective (lossless) hash.
* Generally more customizability of ShardedCache (such as hashing) without worrying about virtual call overheads
  * ShardedCache now uses static polymorphism (a class template) instead of dynamic polymorphism (virtual overrides) for the CacheShard; a sketch of the pattern follows this list. No obvious performance benefit is seen from the change (as mostly expected; most calls to virtual functions in CacheShard could already be optimized to static calls), but it offers more flexibility without the runtime cost of adhering to a common interface (one without type parameters or static callbacks).
  * You'll also notice less `reinterpret_cast`ing and other boilerplate in the Cache implementations, as this can go in ShardedCache.
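
To make the pattern concrete, here is a minimal sketch of the idea (`ToyShard`, `ToyShardedCache`, and the FNV-style hash are illustrative stand-ins, not the actual RocksDB classes): the shard type exports its handle and hash types plus a static `ComputeHash`, and the sharded wrapper is a class template over the shard, so all dispatch is static.

```
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Illustrative stand-in for a shard type: it exposes its handle and hash
// types plus a static hash function, which is all the sharded wrapper needs.
struct ToyShard {
  using HandleImpl = int;    // stand-in for the shard's handle struct
  using HashVal = uint64_t;  // the real ClockCacheShard uses a 128-bit UniqueId64x2
  static HashVal ComputeHash(const std::string& key) {
    // FNV-1a, purely illustrative; the real shards use their own hash functions
    HashVal h = 0xcbf29ce484222325ull;
    for (unsigned char c : key) {
      h = (h ^ c) * 0x100000001b3ull;
    }
    return h;
  }
  HandleImpl* Lookup(const std::string& /*key*/, HashVal /*hash*/) {
    return nullptr;  // a real shard would probe its table here
  }
};

// The sharded wrapper is a class template over the shard type, so every call
// below resolves at compile time; no virtual dispatch is involved.
template <class ShardT>
class ToyShardedCache {
 public:
  using HandleImpl = typename ShardT::HandleImpl;
  explicit ToyShardedCache(std::size_t num_shards) : shards_(num_shards) {}
  HandleImpl* Lookup(const std::string& key) {
    auto hash = ShardT::ComputeHash(key);  // static call
    return GetShard(hash).Lookup(key, hash);
  }

 private:
  ShardT& GetShard(typename ShardT::HashVal hash) {
    return shards_[static_cast<std::size_t>(hash) % shards_.size()];
  }
  std::vector<ShardT> shards_;
};
```

In the actual change, LRUCacheShard and ClockCacheShard each declare `HandleImpl`, `HashVal`, `HashCref`, and `ComputeHash` (see the headers in the diff below), and `ShardedCache<CacheShard>` dispatches to them without a virtual interface.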

More detail:
* Don't have LRUCacheShard maintain `std::shared_ptr<SecondaryCache>` copies (extra refcount) when LRUCache can be in charge of keeping a `shared_ptr`.
* Renamed `capacity_mutex_` to `config_mutex_` to better represent the scope of what it guards.
* Some preparation for 64-bit hash and indexing in LRUCache, but the full change is not included because of a slight performance regression.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10801

Test Plan:
Unit test updates were non-trivial because of major changes to the ClockCacheShard interface in its handling of key vs. hash.

Performance:
Create with `TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=30000000 -disable_wal=1 -bloom_bits=16`

Test with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=readrandom[-X1000] -readonly -num=30000000 -bloom_bits=16 -cache_index_and_filter_blocks=1 -cache_size=610000000 -duration 20 -threads=16
```

Before: `readrandom [AVG 150 runs] : 321147 (± 253) ops/sec`
After: `readrandom [AVG 150 runs] : 321530 (± 326) ops/sec`

So possibly ~0.1% improvement.

And with `-cache_type=hyper_clock_cache`:
Before: `readrandom [AVG 30 runs] : 614126 (± 7978) ops/sec`
After: `readrandom [AVG 30 runs] : 645349 (± 8087) ops/sec`

So roughly 5% improvement!

Reviewed By: anand1976

Differential Revision: D40252236

Pulled By: pdillinger

fbshipit-source-id: ff8fc70ef569585edc95bcbaaa0386f61355ae5b
Author: Peter Dillinger, 2022-10-18 22:06:57 -07:00 (committed by Facebook GitHub Bot)
Parent: e267909ecf
Commit: 7555243bcf
14 changed files with 809 additions and 882 deletions

cache/cache_test.cc

@ -1023,21 +1023,21 @@ TEST_P(CacheTest, DefaultShardBits) {
(GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U;
std::shared_ptr<Cache> cache = NewCache(32U * min_shard_size);
ShardedCache* sc = dynamic_cast<ShardedCache*>(cache.get());
ShardedCacheBase* sc = dynamic_cast<ShardedCacheBase*>(cache.get());
ASSERT_EQ(5, sc->GetNumShardBits());
cache = NewCache(min_shard_size / 1000U * 999U);
sc = dynamic_cast<ShardedCache*>(cache.get());
sc = dynamic_cast<ShardedCacheBase*>(cache.get());
ASSERT_EQ(0, sc->GetNumShardBits());
cache = NewCache(3U * 1024U * 1024U * 1024U);
sc = dynamic_cast<ShardedCache*>(cache.get());
sc = dynamic_cast<ShardedCacheBase*>(cache.get());
// current maximum of 6
ASSERT_EQ(6, sc->GetNumShardBits());
if constexpr (sizeof(size_t) > 4) {
cache = NewCache(128U * min_shard_size);
sc = dynamic_cast<ShardedCache*>(cache.get());
sc = dynamic_cast<ShardedCacheBase*>(cache.get());
// current maximum of 6
ASSERT_EQ(6, sc->GetNumShardBits());
}

cache/clock_cache.cc

@ -12,6 +12,7 @@
#include <cassert>
#include <functional>
#include "cache/cache_key.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/lang.h"
@ -29,16 +30,22 @@ inline uint64_t GetRefcount(uint64_t meta) {
ClockHandle::kCounterMask;
}
void ClockHandleBasicData::FreeData() const {
if (deleter) {
UniqueId64x2 unhashed;
(*deleter)(ClockCacheShard::ReverseHash(hashed_key, &unhashed), value);
}
}
static_assert(sizeof(ClockHandle) == 64U,
"Expecting size / alignment with common cache line size");
ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata)
: length_bits_(hash_bits),
length_bits_mask_(Lower32of64((uint64_t{1} << length_bits_) - 1)),
occupancy_limit_(static_cast<uint32_t>((uint64_t{1} << length_bits_) *
length_bits_mask_((size_t{1} << length_bits_) - 1),
occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
kStrictLoadFactor)),
array_(new ClockHandle[size_t{1} << length_bits_]) {
assert(hash_bits <= 32); // FIXME: ensure no overlap with sharding bits
if (initial_charge_metadata) {
usage_ += size_t{GetTableSize()} * sizeof(ClockHandle);
}
@ -47,7 +54,7 @@ ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata)
ClockHandleTable::~ClockHandleTable() {
// Assumes there are no references or active operations on any slot/element
// in the table.
for (uint32_t i = 0; i < GetTableSize(); i++) {
for (size_t i = 0; i < GetTableSize(); i++) {
ClockHandle& h = array_[i];
switch (h.meta >> ClockHandle::kStateShift) {
case ClockHandle::kStateEmpty:
@ -58,7 +65,7 @@ ClockHandleTable::~ClockHandleTable() {
assert(GetRefcount(h.meta) == 0);
h.FreeData();
#ifndef NDEBUG
Rollback(h.hash, &h);
Rollback(h.hashed_key, &h);
usage_.fetch_sub(h.total_charge, std::memory_order_relaxed);
occupancy_.fetch_sub(1U, std::memory_order_relaxed);
#endif
@ -71,7 +78,7 @@ ClockHandleTable::~ClockHandleTable() {
}
#ifndef NDEBUG
for (uint32_t i = 0; i < GetTableSize(); i++) {
for (size_t i = 0; i < GetTableSize(); i++) {
assert(array_[i].displacements.load() == 0);
}
#endif
@ -154,12 +161,12 @@ inline void CorrectNearOverflow(uint64_t old_meta,
}
}
Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
Status ClockHandleTable::Insert(const ClockHandleBasicData& proto,
ClockHandle** handle, Cache::Priority priority,
size_t capacity, bool strict_capacity_limit) {
// Do we have the available occupancy? Optimistically assume we do
// and deal with it if we don't.
uint32_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
auto revert_occupancy_fn = [&]() {
occupancy_.fetch_sub(1, std::memory_order_relaxed);
};
@ -198,7 +205,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
}
if (request_evict_charge > 0) {
size_t evicted_charge = 0;
uint32_t evicted_count = 0;
size_t evicted_count = 0;
Evict(request_evict_charge, &evicted_charge, &evicted_count);
occupancy_.fetch_sub(evicted_count, std::memory_order_release);
if (LIKELY(evicted_charge > need_evict_charge)) {
@ -263,7 +270,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
need_evict_charge = 1;
}
size_t evicted_charge = 0;
uint32_t evicted_count = 0;
size_t evicted_count = 0;
if (need_evict_charge > 0) {
Evict(need_evict_charge, &evicted_charge, &evicted_count);
// Deal with potential occupancy deficit
@ -323,9 +330,9 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
}
assert(initial_countdown > 0);
uint32_t probe = 0;
size_t probe = 0;
ClockHandle* e = FindSlot(
proto.hash,
proto.hashed_key,
[&](ClockHandle* h) {
// Optimistically transition the slot from "empty" to
// "under construction" (no effect on other states)
@ -338,7 +345,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
if (old_state == ClockHandle::kStateEmpty) {
// We've started inserting into an available slot, and taken
// ownership Save data fields
ClockHandleMoreData* h_alias = h;
ClockHandleBasicData* h_alias = h;
*h_alias = proto;
// Transition from "under construction" state to "visible" state
@ -375,7 +382,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
if ((old_meta >> ClockHandle::kStateShift) ==
ClockHandle::kStateVisible) {
// Acquired a read reference
if (h->key == proto.key) {
if (h->hashed_key == proto.hashed_key) {
// Match. Release in a way that boosts the clock state
old_meta = h->meta.fetch_add(
ClockHandle::kReleaseIncrement * initial_countdown,
@ -431,7 +438,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
return Status::OK();
}
// Roll back table insertion
Rollback(proto.hash, e);
Rollback(proto.hashed_key, e);
revert_occupancy_fn();
// Maybe fall back on detached insert
if (handle == nullptr) {
@ -446,7 +453,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
assert(use_detached_insert);
ClockHandle* h = new ClockHandle();
ClockHandleMoreData* h_alias = h;
ClockHandleBasicData* h_alias = h;
*h_alias = proto;
h->detached = true;
// Single reference (detached entries only created if returning a refed
@ -467,10 +474,10 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto,
return Status::OkOverwritten();
}
ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) {
uint32_t probe = 0;
ClockHandle* ClockHandleTable::Lookup(const UniqueId64x2& hashed_key) {
size_t probe = 0;
ClockHandle* e = FindSlot(
hash,
hashed_key,
[&](ClockHandle* h) {
// Mostly branch-free version (similar performance)
/*
@ -501,7 +508,7 @@ ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) {
if ((old_meta >> ClockHandle::kStateShift) ==
ClockHandle::kStateVisible) {
// Acquired a read reference
if (h->key == key) {
if (h->hashed_key == hashed_key) {
// Match
return true;
} else {
@ -596,7 +603,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
delete h;
detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
} else {
uint32_t hash = h->hash;
UniqueId64x2 hashed_key = h->hashed_key;
#ifndef NDEBUG
// Mark slot as empty, with assertion
old_meta = h->meta.exchange(0, std::memory_order_release);
@ -607,7 +614,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
h->meta.store(0, std::memory_order_release);
#endif
occupancy_.fetch_sub(1U, std::memory_order_release);
Rollback(hash, h);
Rollback(hashed_key, h);
}
usage_.fetch_sub(total_charge, std::memory_order_relaxed);
assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
@ -654,10 +661,10 @@ void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) {
}
}
void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
uint32_t probe = 0;
void ClockHandleTable::Erase(const UniqueId64x2& hashed_key) {
size_t probe = 0;
(void)FindSlot(
hash,
hashed_key,
[&](ClockHandle* h) {
// Could be multiple entries in rare cases. Erase them all.
// Optimistically increment acquire counter
@ -667,7 +674,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
if ((old_meta >> ClockHandle::kStateShift) ==
ClockHandle::kStateVisible) {
// Acquired a read reference
if (h->key == key) {
if (h->hashed_key == hashed_key) {
// Match. Set invisible.
old_meta =
h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
@ -691,7 +698,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
<< ClockHandle::kStateShift,
std::memory_order_acq_rel)) {
// Took ownership
assert(hash == h->hash);
assert(hashed_key == h->hashed_key);
// TODO? Delay freeing?
h->FreeData();
usage_.fetch_sub(h->total_charge, std::memory_order_relaxed);
@ -706,7 +713,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
h->meta.store(0, std::memory_order_release);
#endif
occupancy_.fetch_sub(1U, std::memory_order_release);
Rollback(hash, h);
Rollback(hashed_key, h);
break;
}
}
@ -735,14 +742,14 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
}
void ClockHandleTable::ConstApplyToEntriesRange(
std::function<void(const ClockHandle&)> func, uint32_t index_begin,
uint32_t index_end, bool apply_if_will_be_deleted) const {
std::function<void(const ClockHandle&)> func, size_t index_begin,
size_t index_end, bool apply_if_will_be_deleted) const {
uint64_t check_state_mask = ClockHandle::kStateShareableBit;
if (!apply_if_will_be_deleted) {
check_state_mask |= ClockHandle::kStateVisibleBit;
}
for (uint32_t i = index_begin; i < index_end; i++) {
for (size_t i = index_begin; i < index_end; i++) {
ClockHandle& h = array_[i];
// Note: to avoid using compare_exchange, we have to be extra careful.
@ -776,7 +783,7 @@ void ClockHandleTable::ConstApplyToEntriesRange(
}
void ClockHandleTable::EraseUnRefEntries() {
for (uint32_t i = 0; i <= this->length_bits_mask_; i++) {
for (size_t i = 0; i <= this->length_bits_mask_; i++) {
ClockHandle& h = array_[i];
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
@ -788,7 +795,7 @@ void ClockHandleTable::EraseUnRefEntries() {
<< ClockHandle::kStateShift,
std::memory_order_acquire)) {
// Took ownership
uint32_t hash = h.hash;
UniqueId64x2 hashed_key = h.hashed_key;
h.FreeData();
usage_.fetch_sub(h.total_charge, std::memory_order_relaxed);
#ifndef NDEBUG
@ -801,37 +808,29 @@ void ClockHandleTable::EraseUnRefEntries() {
h.meta.store(0, std::memory_order_release);
#endif
occupancy_.fetch_sub(1U, std::memory_order_release);
Rollback(hash, &h);
Rollback(hashed_key, &h);
}
}
}
namespace {
inline uint32_t Remix1(uint32_t hash) {
return Lower32of64((uint64_t{hash} * 0xbc9f1d35) >> 29);
}
inline uint32_t Remix2(uint32_t hash) {
return Lower32of64((uint64_t{hash} * 0x7a2bb9d5) >> 29);
}
} // namespace
ClockHandle* ClockHandleTable::FindSlot(
uint32_t hash, std::function<bool(ClockHandle*)> match_fn,
const UniqueId64x2& hashed_key, std::function<bool(ClockHandle*)> match_fn,
std::function<bool(ClockHandle*)> abort_fn,
std::function<void(ClockHandle*)> update_fn, uint32_t& probe) {
std::function<void(ClockHandle*)> update_fn, size_t& probe) {
// NOTE: upper 32 bits of hashed_key[0] is used for sharding
//
// We use double-hashing probing. Every probe in the sequence is a
// pseudorandom integer, computed as a linear function of two random hashes,
// which we call base and increment. Specifically, the i-th probe is base + i
// * increment modulo the table size.
uint32_t base = ModTableSize(Remix1(hash));
size_t base = static_cast<size_t>(hashed_key[1]);
// We use an odd increment, which is relatively prime with the power-of-two
// table size. This implies that we cycle back to the first probe only
// after probing every slot exactly once.
// TODO: we could also reconsider linear probing, though locality benefits
// are limited because each slot is a full cache line
uint32_t increment = Remix2(hash) | 1U;
uint32_t current = ModTableSize(base + probe * increment);
size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
size_t current = ModTableSize(base + probe * increment);
while (probe <= length_bits_mask_) {
ClockHandle* h = &array_[current];
if (match_fn(h)) {
@ -849,22 +848,23 @@ ClockHandle* ClockHandleTable::FindSlot(
return nullptr;
}
void ClockHandleTable::Rollback(uint32_t hash, const ClockHandle* h) {
uint32_t current = ModTableSize(Remix1(hash));
uint32_t increment = Remix2(hash) | 1U;
for (uint32_t i = 0; &array_[current] != h; i++) {
void ClockHandleTable::Rollback(const UniqueId64x2& hashed_key,
const ClockHandle* h) {
size_t current = ModTableSize(hashed_key[1]);
size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
for (size_t i = 0; &array_[current] != h; i++) {
array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
current = ModTableSize(current + increment);
}
}
void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
uint32_t* freed_count) {
size_t* freed_count) {
// precondition
assert(requested_charge > 0);
// TODO: make a tuning parameter?
constexpr uint32_t step_size = 4;
constexpr size_t step_size = 4;
// First (concurrent) increment clock pointer
uint64_t old_clock_pointer =
@ -879,7 +879,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);
for (;;) {
for (uint32_t i = 0; i < step_size; i++) {
for (size_t i = 0; i < step_size; i++) {
ClockHandle& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
uint64_t meta = h.meta.load(std::memory_order_relaxed);
@ -920,7 +920,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
<< ClockHandle::kStateShift,
std::memory_order_acquire)) {
// Took ownership
uint32_t hash = h.hash;
const UniqueId64x2& hashed_key = h.hashed_key;
// TODO? Delay freeing?
h.FreeData();
*freed_charge += h.total_charge;
@ -934,7 +934,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
h.meta.store(0, std::memory_order_release);
#endif
*freed_count += 1;
Rollback(hash, &h);
Rollback(hashed_key, &h);
}
}
@ -955,7 +955,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
ClockCacheShard::ClockCacheShard(
size_t capacity, size_t estimated_value_size, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: CacheShard(metadata_charge_policy),
: CacheShardBase(metadata_charge_policy),
table_(
CalcHashBits(capacity, estimated_value_size, metadata_charge_policy),
/*initial_charge_metadata*/ metadata_charge_policy ==
@ -971,31 +971,33 @@ void ClockCacheShard::EraseUnRefEntries() { table_.EraseUnRefEntries(); }
void ClockCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) {
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
// hash bits for table indexes.
uint32_t length_bits = table_.GetLengthBits();
uint32_t length = table_.GetTableSize();
size_t length_bits = table_.GetLengthBits();
size_t length = table_.GetTableSize();
assert(average_entries_per_lock > 0);
// Assuming we are called with same average_entries_per_lock repeatedly,
// this simplifies some logic (index_end will not overflow).
assert(average_entries_per_lock < length || *state == 0);
uint32_t index_begin = *state >> (32 - length_bits);
uint32_t index_end = index_begin + average_entries_per_lock;
size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
size_t index_end = index_begin + average_entries_per_lock;
if (index_end >= length) {
// Going to end.
index_end = length;
*state = UINT32_MAX;
*state = SIZE_MAX;
} else {
*state = index_end << (32 - length_bits);
*state = index_end << (sizeof(size_t) * 8u - length_bits);
}
table_.ConstApplyToEntriesRange(
[callback](const ClockHandle& h) {
callback(h.KeySlice(), h.value, h.total_charge, h.deleter);
UniqueId64x2 unhashed;
callback(ReverseHash(h.hashed_key, &unhashed), h.value, h.total_charge,
h.deleter);
},
index_begin, index_end, false);
}
@ -1011,7 +1013,7 @@ int ClockCacheShard::CalcHashBits(
uint64_t num_slots =
static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
int hash_bits = std::min(FloorLog2((num_slots << 1) - 1), 32);
int hash_bits = FloorLog2((num_slots << 1) - 1);
if (metadata_charge_policy == kFullChargeCacheMetadata) {
// For very small estimated value sizes, it's possible to overshoot
while (hash_bits > 0 &&
@ -1033,17 +1035,16 @@ void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
// next Insert will take care of any necessary evictions
}
Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
Cache::Handle** handle,
Status ClockCacheShard::Insert(const Slice& key, const UniqueId64x2& hashed_key,
void* value, size_t charge,
Cache::DeleterFn deleter, ClockHandle** handle,
Cache::Priority priority) {
if (UNLIKELY(key.size() != kCacheKeySize)) {
return Status::NotSupported("ClockCache only supports key size " +
std::to_string(kCacheKeySize) + "B");
}
ClockHandleMoreData proto;
proto.key = *reinterpret_cast<const CacheKeyBytes*>(key.data());
proto.hash = hash;
ClockHandleBasicData proto;
proto.hashed_key = hashed_key;
proto.value = value;
proto.deleter = deleter;
proto.total_charge = charge;
@ -1054,49 +1055,47 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
return s;
}
Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
ClockHandle* ClockCacheShard::Lookup(const Slice& key,
const UniqueId64x2& hashed_key) {
if (UNLIKELY(key.size() != kCacheKeySize)) {
return nullptr;
}
auto key_bytes = reinterpret_cast<const CacheKeyBytes*>(key.data());
return reinterpret_cast<Cache::Handle*>(table_.Lookup(*key_bytes, hash));
return table_.Lookup(hashed_key);
}
bool ClockCacheShard::Ref(Cache::Handle* h) {
bool ClockCacheShard::Ref(ClockHandle* h) {
if (h == nullptr) {
return false;
}
table_.Ref(*reinterpret_cast<ClockHandle*>(h));
table_.Ref(*h);
return true;
}
bool ClockCacheShard::Release(Cache::Handle* handle, bool useful,
bool ClockCacheShard::Release(ClockHandle* handle, bool useful,
bool erase_if_last_ref) {
if (handle == nullptr) {
return false;
}
return table_.Release(reinterpret_cast<ClockHandle*>(handle), useful,
erase_if_last_ref);
return table_.Release(handle, useful, erase_if_last_ref);
}
void ClockCacheShard::TEST_RefN(Cache::Handle* h, size_t n) {
table_.TEST_RefN(*reinterpret_cast<ClockHandle*>(h), n);
void ClockCacheShard::TEST_RefN(ClockHandle* h, size_t n) {
table_.TEST_RefN(*h, n);
}
void ClockCacheShard::TEST_ReleaseN(Cache::Handle* h, size_t n) {
table_.TEST_ReleaseN(reinterpret_cast<ClockHandle*>(h), n);
void ClockCacheShard::TEST_ReleaseN(ClockHandle* h, size_t n) {
table_.TEST_ReleaseN(h, n);
}
bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
bool ClockCacheShard::Release(ClockHandle* handle, bool erase_if_last_ref) {
return Release(handle, /*useful=*/true, erase_if_last_ref);
}
void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
void ClockCacheShard::Erase(const Slice& key, const UniqueId64x2& hashed_key) {
if (UNLIKELY(key.size() != kCacheKeySize)) {
return;
}
auto key_bytes = reinterpret_cast<const CacheKeyBytes*>(key.data());
table_.Erase(*key_bytes, hash);
table_.Erase(hashed_key);
}
size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); }
@ -1140,39 +1139,19 @@ size_t ClockCacheShard::GetTableAddressCount() const {
HyperClockCache::HyperClockCache(
size_t capacity, size_t estimated_value_size, int num_shard_bits,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit),
num_shards_(1 << num_shard_bits) {
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<MemoryAllocator> memory_allocator)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
std::move(memory_allocator)) {
assert(estimated_value_size > 0 ||
metadata_charge_policy != kDontChargeCacheMetadata);
// TODO: should not need to go through two levels of pointer indirection to
// get to table entries
shards_ = reinterpret_cast<ClockCacheShard*>(
port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_));
size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
for (int i = 0; i < num_shards_; i++) {
new (&shards_[i])
ClockCacheShard(per_shard, estimated_value_size, strict_capacity_limit,
metadata_charge_policy);
}
}
HyperClockCache::~HyperClockCache() {
if (shards_ != nullptr) {
assert(num_shards_ > 0);
for (int i = 0; i < num_shards_; i++) {
shards_[i].~ClockCacheShard();
}
port::cacheline_aligned_free(shards_);
}
}
CacheShard* HyperClockCache::GetShard(uint32_t shard) {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
}
const CacheShard* HyperClockCache::GetShard(uint32_t shard) const {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
size_t per_shard = GetPerShardCapacity();
InitShards([=](ClockCacheShard* cs) {
new (cs) ClockCacheShard(per_shard, estimated_value_size,
strict_capacity_limit, metadata_charge_policy);
});
}
void* HyperClockCache::Value(Handle* handle) {
@ -1188,18 +1167,6 @@ Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
return h->deleter;
}
uint32_t HyperClockCache::GetHash(Handle* handle) const {
return reinterpret_cast<const ClockHandle*>(handle)->hash;
}
void HyperClockCache::DisownData() {
// Leak data only if that won't generate an ASAN/valgrind warning.
if (!kMustFreeHeapAllocations) {
shards_ = nullptr;
num_shards_ = 0;
}
}
} // namespace hyper_clock_cache
// DEPRECATED (see public API)
@ -1225,7 +1192,7 @@ std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
}
return std::make_shared<hyper_clock_cache::HyperClockCache>(
capacity, estimated_entry_charge, my_num_shard_bits,
strict_capacity_limit, metadata_charge_policy);
strict_capacity_limit, metadata_charge_policy, memory_allocator);
}
} // namespace ROCKSDB_NAMESPACE
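
The FindSlot/Rollback changes above keep double-hashing probing but derive the probe base and increment directly from the two 64-bit halves of the hashed key, instead of remixing a 32-bit hash. A minimal standalone sketch of that probing scheme (`FindSlotSketch` and its signature are illustrative, not the RocksDB function):

```
#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <optional>

// Double-hashing probe over a power-of-two table: probe i visits
// (base + i * increment) mod 2^table_bits. Forcing the increment to be odd
// makes it relatively prime to the table size, so the sequence visits every
// slot exactly once before cycling.
std::optional<std::size_t> FindSlotSketch(
    const std::array<uint64_t, 2>& hashed_key, int table_bits,
    const std::function<bool(std::size_t)>& match) {
  const std::size_t mask = (std::size_t{1} << table_bits) - 1;
  const std::size_t base = static_cast<std::size_t>(hashed_key[1]);
  const std::size_t increment = static_cast<std::size_t>(hashed_key[0]) | 1u;
  std::size_t current = base & mask;
  for (std::size_t probe = 0; probe <= mask; ++probe) {
    if (match(current)) {
      return current;
    }
    current = (current + increment) & mask;
  }
  return std::nullopt;  // probed every slot without a match
}
```

Because the probe sequence is fully determined by the hashed key, Rollback() can retrace the same path to decrement displacements, and the handle no longer needs to store a separate 32-bit hash.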

cache/clock_cache.h

@ -303,30 +303,24 @@ constexpr double kLoadFactor = 0.7;
// strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.84;
using CacheKeyBytes = std::array<char, kCacheKeySize>;
struct ClockHandleBasicData {
void* value = nullptr;
Cache::DeleterFn deleter = nullptr;
CacheKeyBytes key = {};
// A lossless, reversible hash of the fixed-size (16 byte) cache key. This
// eliminates the need to store a hash separately.
UniqueId64x2 hashed_key = kNullUniqueId64x2;
size_t total_charge = 0;
Slice KeySlice() const { return Slice(key.data(), kCacheKeySize); }
// Calls deleter (if non-null) on cache key and value
void FreeData() const;
void FreeData() const {
if (deleter) {
(*deleter)(KeySlice(), value);
}
}
};
struct ClockHandleMoreData : public ClockHandleBasicData {
uint32_t hash = 0;
// Required by concept HandleImpl
const UniqueId64x2& GetHash() const { return hashed_key; }
};
// Target size to be exactly a common cache line size (see static_assert in
// clock_cache.cc)
struct ALIGN_AS(64U) ClockHandle : public ClockHandleMoreData {
struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData {
// Constants for handling the atomic `meta` word, which tracks most of the
// state of the handle. The meta word looks like this:
// low bits high bits
@ -391,31 +385,31 @@ class ClockHandleTable {
explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata);
~ClockHandleTable();
Status Insert(const ClockHandleMoreData& proto, ClockHandle** handle,
Status Insert(const ClockHandleBasicData& proto, ClockHandle** handle,
Cache::Priority priority, size_t capacity,
bool strict_capacity_limit);
ClockHandle* Lookup(const CacheKeyBytes& key, uint32_t hash);
ClockHandle* Lookup(const UniqueId64x2& hashed_key);
bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref);
void Ref(ClockHandle& handle);
void Erase(const CacheKeyBytes& key, uint32_t hash);
void Erase(const UniqueId64x2& hashed_key);
void ConstApplyToEntriesRange(std::function<void(const ClockHandle&)> func,
uint32_t index_begin, uint32_t index_end,
size_t index_begin, size_t index_end,
bool apply_if_will_be_deleted) const;
void EraseUnRefEntries();
uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; }
size_t GetTableSize() const { return size_t{1} << length_bits_; }
int GetLengthBits() const { return length_bits_; }
uint32_t GetOccupancyLimit() const { return occupancy_limit_; }
size_t GetOccupancyLimit() const { return occupancy_limit_; }
uint32_t GetOccupancy() const {
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
@ -431,13 +425,15 @@ class ClockHandleTable {
private: // functions
// Returns x mod 2^{length_bits_}.
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
inline size_t ModTableSize(uint64_t x) {
return static_cast<size_t>(x) & length_bits_mask_;
}
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, size_t* freed_charge,
uint32_t* freed_count);
size_t* freed_count);
// Returns the first slot in the probe sequence, starting from the given
// probe number, with a handle e such that match(e) is true. At every
@ -450,15 +446,15 @@ class ClockHandleTable {
// value of probe is one more than the last non-aborting probe during the
// call. This is so that that the variable can be used to keep track of
// progress across consecutive calls to FindSlot.
inline ClockHandle* FindSlot(uint32_t hash,
inline ClockHandle* FindSlot(const UniqueId64x2& hashed_key,
std::function<bool(ClockHandle*)> match,
std::function<bool(ClockHandle*)> stop,
std::function<void(ClockHandle*)> update,
uint32_t& probe);
size_t& probe);
// Re-decrement all displacements in probe path starting from beginning
// until (not including) the given handle
void Rollback(uint32_t hash, const ClockHandle* h);
void Rollback(const UniqueId64x2& hashed_key, const ClockHandle* h);
private: // data
// Number of hash bits used for table index.
@ -466,10 +462,10 @@ class ClockHandleTable {
const int length_bits_;
// For faster computation of ModTableSize.
const uint32_t length_bits_mask_;
const size_t length_bits_mask_;
// Maximum number of elements the user can store in the table.
const uint32_t occupancy_limit_;
const size_t occupancy_limit_;
// Array of slots comprising the hash table.
const std::unique_ptr<ClockHandle[]> array_;
@ -484,7 +480,7 @@ class ClockHandleTable {
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<uint32_t> occupancy_{};
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including detached)
std::atomic<size_t> usage_{};
@ -494,78 +490,107 @@ class ClockHandleTable {
}; // class ClockHandleTable
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
public:
ClockCacheShard(size_t capacity, size_t estimated_value_size,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy);
~ClockCacheShard() override = default;
// TODO: document limitations
void SetCapacity(size_t capacity) override;
// For CacheShard concept
using HandleImpl = ClockHandle;
// Hash is lossless hash of 128-bit key
using HashVal = UniqueId64x2;
using HashCref = const HashVal&;
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Upper32of64(hash[0]);
}
static inline HashVal ComputeHash(const Slice& key) {
assert(key.size() == kCacheKeySize);
HashVal in;
HashVal out;
// NOTE: endian dependence
// TODO: use GetUnaligned?
std::memcpy(&in, key.data(), kCacheKeySize);
BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
return out;
}
void SetStrictCapacityLimit(bool strict_capacity_limit) override;
// For reconstructing key from hashed_key. Requires the caller to provide
// backing storage for the Slice in `unhashed`
static inline Slice ReverseHash(const UniqueId64x2& hashed,
UniqueId64x2* unhashed) {
BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
// NOTE: endian dependence
return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
}
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
Cache::DeleterFn deleter, Cache::Handle** handle,
Cache::Priority priority) override;
// Although capacity is dynamically changeable, the number of table slots is
// not, so growing capacity substantially could lead to hitting occupancy
// limit.
void SetCapacity(size_t capacity);
Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
void SetStrictCapacityLimit(bool strict_capacity_limit);
bool Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref) override;
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
size_t charge, Cache::DeleterFn deleter, ClockHandle** handle,
Cache::Priority priority);
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
bool Ref(Cache::Handle* handle) override;
bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref);
void Erase(const Slice& key, uint32_t hash) override;
bool Release(ClockHandle* handle, bool erase_if_last_ref = false);
size_t GetUsage() const override;
bool Ref(ClockHandle* handle);
size_t GetPinnedUsage() const override;
void Erase(const Slice& key, const UniqueId64x2& hashed_key);
size_t GetOccupancyCount() const override;
size_t GetUsage() const;
size_t GetTableAddressCount() const override;
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) override;
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries() override;
void EraseUnRefEntries();
std::string GetPrintableOptions() const override { return std::string{}; }
std::string GetPrintableOptions() const { return std::string{}; }
// SecondaryCache not yet supported
Status Insert(const Slice& key, uint32_t hash, void* value,
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
Cache::Handle** handle, Cache::Priority priority) override {
return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
ClockHandle** handle, Cache::Priority priority) {
return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
priority);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash,
ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
const Cache::CacheItemHelper* /*helper*/,
const Cache::CreateCallback& /*create_cb*/,
Cache::Priority /*priority*/, bool /*wait*/,
Statistics* /*stats*/) override {
return Lookup(key, hash);
Statistics* /*stats*/) {
return Lookup(key, hashed_key);
}
bool IsReady(Cache::Handle* /*handle*/) override { return true; }
bool IsReady(ClockHandle* /*handle*/) { return true; }
void Wait(Cache::Handle* /*handle*/) override {}
void Wait(ClockHandle* /*handle*/) {}
// Acquire/release N references
void TEST_RefN(Cache::Handle* handle, size_t n);
void TEST_ReleaseN(Cache::Handle* handle, size_t n);
void TEST_RefN(ClockHandle* handle, size_t n);
void TEST_ReleaseN(ClockHandle* handle, size_t n);
private: // functions
friend class ClockCache;
friend class ClockCacheTest;
ClockHandle* DetachedInsert(const ClockHandleMoreData& h);
ClockHandle* DetachedInsert(const ClockHandleBasicData& h);
// Returns the number of bits used to hash an element in the hash
// table.
@ -586,35 +611,20 @@ class HyperClockCache
#ifdef NDEBUG
final
#endif
: public ShardedCache {
: public ShardedCache<ClockCacheShard> {
public:
HyperClockCache(size_t capacity, size_t estimated_value_size,
int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata);
~HyperClockCache() override;
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<MemoryAllocator> memory_allocator);
const char* Name() const override { return "HyperClockCache"; }
CacheShard* GetShard(uint32_t shard) override;
const CacheShard* GetShard(uint32_t shard) const override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
uint32_t GetHash(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void DisownData() override;
private:
ClockCacheShard* shards_ = nullptr;
int num_shards_;
}; // class HyperClockCache
} // namespace hyper_clock_cache
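
ComputeHash/ReverseHash above depend on BijectiveHash2x64 being lossless, which is what lets the handle store only the 128-bit hashed key and still reconstruct the original 16-byte key for the deleter. A toy illustration of that round-trip property (`ToyBijectiveHash` below is a made-up invertible transform for demonstration, not RocksDB's BijectiveHash2x64, and it is a poor hash):

```
#include <array>
#include <cassert>
#include <cstdint>

using Id64x2 = std::array<uint64_t, 2>;

// Made-up invertible 128-bit transform: every step (add a constant, xor,
// rotate, swap halves) has an exact inverse, so no key information is lost.
// It is NOT a good hash; it only demonstrates reversibility.
Id64x2 ToyBijectiveHash(const Id64x2& in) {
  uint64_t a = in[0] + 0x9E3779B97F4A7C15ull;  // invert by subtracting
  uint64_t b = in[1] ^ a;                      // invert by xor-ing again
  a = (a << 21) | (a >> 43);                   // invert by rotating back
  return {b, a};                               // invert by swapping back
}

Id64x2 ToyBijectiveUnhash(const Id64x2& out) {
  uint64_t a = out[1];                         // undo the swap
  uint64_t b = out[0];
  a = (a >> 21) | (a << 43);                   // undo the rotate
  uint64_t in1 = b ^ a;                        // undo the xor
  uint64_t in0 = a - 0x9E3779B97F4A7C15ull;    // undo the add
  return {in0, in1};
}

int main() {
  Id64x2 key = {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull};
  Id64x2 hashed = ToyBijectiveHash(key);
  assert(ToyBijectiveUnhash(hashed) == key);  // the key is fully recoverable
  return 0;
}
```

In the real code, the shard additionally reserves the upper 32 bits of hashed_key[0] for selecting the shard (HashPieceForSharding above).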

cache/fast_lru_cache.cc

@ -173,7 +173,7 @@ inline int LRUHandleTable::FindSlot(const Slice& key,
LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: CacheShard(metadata_charge_policy),
: CacheShardBase(metadata_charge_policy),
capacity_(capacity),
strict_capacity_limit_(strict_capacity_limit),
table_(
@ -211,27 +211,27 @@ void LRUCacheShard::EraseUnRefEntries() {
void LRUCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) {
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
// hash bits for table indexes.
DMutexLock l(mutex_);
uint32_t length_bits = table_.GetLengthBits();
uint32_t length = table_.GetTableSize();
size_t length_bits = table_.GetLengthBits();
size_t length = table_.GetTableSize();
assert(average_entries_per_lock > 0);
// Assuming we are called with same average_entries_per_lock repeatedly,
// this simplifies some logic (index_end will not overflow).
assert(average_entries_per_lock < length || *state == 0);
uint32_t index_begin = *state >> (32 - length_bits);
uint32_t index_end = index_begin + average_entries_per_lock;
size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
size_t index_end = index_begin + average_entries_per_lock;
if (index_end >= length) {
// Going to end
index_end = length;
*state = UINT32_MAX;
*state = SIZE_MAX;
} else {
*state = index_end << (32 - length_bits);
*state = index_end << (sizeof(size_t) * 8u - length_bits);
}
table_.ApplyToEntriesRange(
@ -322,8 +322,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
Cache::Handle** handle,
Cache::Priority /*priority*/) {
LRUHandle** handle, Cache::Priority /*priority*/) {
if (key.size() != kCacheKeySize) {
return Status::NotSupported("FastLRUCache only supports key size " +
std::to_string(kCacheKeySize) + "B");
@ -409,7 +408,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
if (!h->HasRefs()) {
h->Ref();
}
*handle = reinterpret_cast<Cache::Handle*>(h);
*handle = h;
}
}
}
@ -422,7 +421,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
return s;
}
Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
LRUHandle* h = nullptr;
{
DMutexLock l(mutex_);
@ -437,23 +436,21 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
h->Ref();
}
}
return reinterpret_cast<Cache::Handle*>(h);
return h;
}
bool LRUCacheShard::Ref(Cache::Handle* h) {
LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
bool LRUCacheShard::Ref(LRUHandle* h) {
DMutexLock l(mutex_);
// To create another reference - entry must be already externally referenced.
assert(e->HasRefs());
e->Ref();
assert(h->HasRefs());
h->Ref();
return true;
}
bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
if (handle == nullptr) {
bool LRUCacheShard::Release(LRUHandle* h, bool erase_if_last_ref) {
if (h == nullptr) {
return false;
}
LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
LRUHandle copy;
bool last_reference = false;
{
@ -535,41 +532,18 @@ size_t LRUCacheShard::GetTableAddressCount() const {
return table_.GetTableSize();
}
std::string LRUCacheShard::GetPrintableOptions() const { return std::string{}; }
LRUCache::LRUCache(size_t capacity, size_t estimated_value_size,
int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
nullptr /*allocator*/) {
assert(estimated_value_size > 0 ||
metadata_charge_policy != kDontChargeCacheMetadata);
num_shards_ = 1 << num_shard_bits;
shards_ = reinterpret_cast<LRUCacheShard*>(
port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_));
size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
for (int i = 0; i < num_shards_; i++) {
new (&shards_[i])
LRUCacheShard(per_shard, estimated_value_size, strict_capacity_limit,
metadata_charge_policy);
}
}
LRUCache::~LRUCache() {
if (shards_ != nullptr) {
assert(num_shards_ > 0);
for (int i = 0; i < num_shards_; i++) {
shards_[i].~LRUCacheShard();
}
port::cacheline_aligned_free(shards_);
}
}
CacheShard* LRUCache::GetShard(uint32_t shard) {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
}
const CacheShard* LRUCache::GetShard(uint32_t shard) const {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
size_t per_shard = GetPerShardCapacity();
InitShards([=](LRUCacheShard* cs) {
new (cs) LRUCacheShard(per_shard, estimated_value_size,
strict_capacity_limit, metadata_charge_policy);
});
}
void* LRUCache::Value(Handle* handle) {
@ -577,12 +551,8 @@ void* LRUCache::Value(Handle* handle) {
}
size_t LRUCache::GetCharge(Handle* handle) const {
CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata;
if (num_shards_ > 0) {
metadata_charge_policy = shards_[0].metadata_charge_policy_;
}
return reinterpret_cast<const LRUHandle*>(handle)->GetCharge(
metadata_charge_policy);
GetShard(0).metadata_charge_policy_);
}
Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
@ -590,18 +560,6 @@ Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
return h->deleter;
}
uint32_t LRUCache::GetHash(Handle* handle) const {
return reinterpret_cast<const LRUHandle*>(handle)->hash;
}
void LRUCache::DisownData() {
// Leak data only if that won't generate an ASAN/valgrind warning.
if (!kMustFreeHeapAllocations) {
shards_ = nullptr;
num_shards_ = 0;
}
}
} // namespace fast_lru_cache
std::shared_ptr<Cache> NewFastLRUCache(

cache/fast_lru_cache.h

@ -141,6 +141,9 @@ struct LRUHandle {
Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
// For HandleImpl concept
uint32_t GetHash() const { return hash; }
// Increase the reference count by 1.
void Ref() { refs++; }
@ -260,8 +263,8 @@ class LRUHandleTable {
void Assign(int slot, LRUHandle* h);
template <typename T>
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
for (uint32_t i = index_begin; i < index_end; i++) {
void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
for (size_t i = index_begin; i < index_end; i++) {
LRUHandle* h = &array_[i];
if (h->IsVisible()) {
func(h);
@ -316,20 +319,30 @@ class LRUHandleTable {
};
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
public:
LRUCacheShard(size_t capacity, size_t estimated_value_size,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy);
~LRUCacheShard() override = default;
// For CacheShard concept
using HandleImpl = LRUHandle;
// Keep 32-bit hashing for now (FIXME: upgrade to 64-bit)
using HashVal = uint32_t;
using HashCref = uint32_t;
static inline HashVal ComputeHash(const Slice& key) {
return Lower32of64(GetSliceNPHash64(key));
}
static inline uint32_t HashPieceForSharding(HashCref hash) { return hash; }
// Separate from constructor so caller can easily make an array of LRUCache
// if current usage is more than new capacity, the function will attempt to
// free the needed space.
void SetCapacity(size_t capacity) override;
void SetCapacity(size_t capacity);
// Set the flag to reject insertion if cache if full.
void SetStrictCapacityLimit(bool strict_capacity_limit) override;
void SetStrictCapacityLimit(bool strict_capacity_limit);
// Like Cache methods, but with an extra "hash" parameter.
// Insert an item into the hash table and, if handle is null, insert into
@ -337,48 +350,45 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
Cache::DeleterFn deleter, Cache::Handle** handle,
Cache::Priority priority) override;
Cache::DeleterFn deleter, LRUHandle** handle,
Cache::Priority priority);
Status Insert(const Slice& key, uint32_t hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
Cache::Handle** handle, Cache::Priority priority) override {
LRUHandle** handle, Cache::Priority priority) {
return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash,
LRUHandle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* /*helper*/,
const Cache::CreateCallback& /*create_cb*/,
Cache::Priority /*priority*/, bool /*wait*/,
Statistics* /*stats*/) override {
Statistics* /*stats*/) {
return Lookup(key, hash);
}
Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
LRUHandle* Lookup(const Slice& key, uint32_t hash);
bool Release(Cache::Handle* handle, bool /*useful*/,
bool erase_if_last_ref) override {
bool Release(LRUHandle* handle, bool /*useful*/, bool erase_if_last_ref) {
return Release(handle, erase_if_last_ref);
}
bool IsReady(Cache::Handle* /*handle*/) override { return true; }
void Wait(Cache::Handle* /*handle*/) override {}
bool IsReady(LRUHandle* /*handle*/) { return true; }
void Wait(LRUHandle* /*handle*/) {}
bool Ref(Cache::Handle* handle) override;
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
void Erase(const Slice& key, uint32_t hash) override;
bool Ref(LRUHandle* handle);
bool Release(LRUHandle* handle, bool erase_if_last_ref = false);
void Erase(const Slice& key, uint32_t hash);
size_t GetUsage() const override;
size_t GetPinnedUsage() const override;
size_t GetOccupancyCount() const override;
size_t GetTableAddressCount() const override;
size_t GetUsage() const;
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) override;
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries() override;
std::string GetPrintableOptions() const override;
void EraseUnRefEntries();
private:
friend class LRUCache;
@ -446,25 +456,16 @@ class LRUCache
#ifdef NDEBUG
final
#endif
: public ShardedCache {
: public ShardedCache<LRUCacheShard> {
public:
LRUCache(size_t capacity, size_t estimated_value_size, int num_shard_bits,
bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata);
~LRUCache() override;
const char* Name() const override { return "LRUCache"; }
CacheShard* GetShard(uint32_t shard) override;
const CacheShard* GetShard(uint32_t shard) const override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
uint32_t GetHash(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void DisownData() override;
private:
LRUCacheShard* shards_ = nullptr;
int num_shards_ = 0;
};
} // namespace fast_lru_cache

cache/lru_cache.cc

@ -38,7 +38,7 @@ LRUHandleTable::~LRUHandleTable() {
h->Free();
}
},
0, uint32_t{1} << length_bits_);
0, size_t{1} << length_bits_);
}
LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
@ -113,12 +113,13 @@ void LRUHandleTable::Resize() {
length_bits_ = new_length_bits;
}
LRUCacheShard::LRUCacheShard(
size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio,
LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio,
double low_pri_pool_ratio, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy, int max_upper_hash_bits,
const std::shared_ptr<SecondaryCache>& secondary_cache)
: CacheShard(metadata_charge_policy),
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits,
SecondaryCache* secondary_cache)
: CacheShardBase(metadata_charge_policy),
capacity_(0),
high_pri_pool_usage_(0),
low_pri_pool_usage_(0),
@ -165,27 +166,27 @@ void LRUCacheShard::EraseUnRefEntries() {
void LRUCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) {
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
// hash bits for table indexes.
DMutexLock l(mutex_);
uint32_t length_bits = table_.GetLengthBits();
uint32_t length = uint32_t{1} << length_bits;
int length_bits = table_.GetLengthBits();
size_t length = size_t{1} << length_bits;
assert(average_entries_per_lock > 0);
// Assuming we are called with same average_entries_per_lock repeatedly,
// this simplifies some logic (index_end will not overflow).
assert(average_entries_per_lock < length || *state == 0);
uint32_t index_begin = *state >> (32 - length_bits);
uint32_t index_end = index_begin + average_entries_per_lock;
size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
size_t index_end = index_begin + average_entries_per_lock;
if (index_end >= length) {
// Going to end
index_end = length;
*state = UINT32_MAX;
*state = SIZE_MAX;
} else {
*state = index_end << (32 - length_bits);
*state = index_end << (sizeof(size_t) * 8u - length_bits);
}
table_.ApplyToEntriesRange(
@ -364,7 +365,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
strict_capacity_limit_ = strict_capacity_limit;
}
Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
bool free_handle_on_fail) {
Status s = Status::OK();
autovector<LRUHandle*> last_reference_list;
@ -414,7 +415,7 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
if (!e->HasRefs()) {
e->Ref();
}
*handle = reinterpret_cast<Cache::Handle*>(e);
*handle = e;
}
}
}
@ -480,7 +481,7 @@ void LRUCacheShard::Promote(LRUHandle* e) {
priority);
} else {
e->SetInCache(true);
Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(e);
LRUHandle* handle = e;
// This InsertItem() could fail if the cache is over capacity and
// strict_capacity_limit_ is true. In such a case, we don't want
// InsertItem() to free the handle, since the item is already in memory
@ -505,11 +506,11 @@ void LRUCacheShard::Promote(LRUHandle* e) {
}
}
Cache::Handle* LRUCacheShard::Lookup(
const Slice& key, uint32_t hash,
const ShardedCache::CacheItemHelper* helper,
const ShardedCache::CreateCallback& create_cb, Cache::Priority priority,
bool wait, Statistics* stats) {
LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Statistics* stats) {
LRUHandle* e = nullptr;
bool found_dummy_entry{false};
{
@ -607,11 +608,10 @@ Cache::Handle* LRUCacheShard::Lookup(
assert(e == nullptr);
}
}
return reinterpret_cast<Cache::Handle*>(e);
return e;
}
bool LRUCacheShard::Ref(Cache::Handle* h) {
LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
bool LRUCacheShard::Ref(LRUHandle* e) {
DMutexLock l(mutex_);
// To create another reference - entry must be already externally referenced.
assert(e->HasRefs());
@ -635,11 +635,11 @@ void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) {
MaintainPoolSize();
}
bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
if (handle == nullptr) {
bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
bool erase_if_last_ref) {
if (e == nullptr) {
return false;
}
LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
bool last_reference = false;
// Must Wait or WaitAll first on pending handles. Otherwise, would leak
// a secondary cache handle.
@ -679,7 +679,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge,
void (*deleter)(const Slice& key, void* value),
const Cache::CacheItemHelper* helper,
Cache::Handle** handle, Cache::Priority priority) {
LRUHandle** handle, Cache::Priority priority) {
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
@ -738,8 +738,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
}
}
bool LRUCacheShard::IsReady(Cache::Handle* handle) {
LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
bool LRUCacheShard::IsReady(LRUHandle* e) {
bool ready = true;
if (e->IsPending()) {
assert(secondary_cache_);
@ -770,7 +769,7 @@ size_t LRUCacheShard::GetTableAddressCount() const {
return size_t{1} << table_.GetLengthBits();
}
std::string LRUCacheShard::GetPrintableOptions() const {
void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
const int kBufferSize = 200;
char buffer[kBufferSize];
{
@ -780,7 +779,7 @@ std::string LRUCacheShard::GetPrintableOptions() const {
snprintf(buffer + strlen(buffer), kBufferSize - strlen(buffer),
" low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_);
}
return std::string(buffer);
str.append(buffer);
}
LRUCache::LRUCache(size_t capacity, int num_shard_bits,
@ -789,38 +788,18 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
std::shared_ptr<MemoryAllocator> allocator,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
const std::shared_ptr<SecondaryCache>& secondary_cache)
std::shared_ptr<SecondaryCache> _secondary_cache)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
std::move(allocator)) {
num_shards_ = 1 << num_shard_bits;
shards_ = reinterpret_cast<LRUCacheShard*>(
port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_));
size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
for (int i = 0; i < num_shards_; i++) {
new (&shards_[i]) LRUCacheShard(
std::move(allocator)),
secondary_cache_(std::move(_secondary_cache)) {
size_t per_shard = GetPerShardCapacity();
SecondaryCache* secondary_cache = secondary_cache_.get();
InitShards([=](LRUCacheShard* cs) {
new (cs) LRUCacheShard(
per_shard, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
/* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
}
secondary_cache_ = secondary_cache;
}
LRUCache::~LRUCache() {
if (shards_ != nullptr) {
assert(num_shards_ > 0);
for (int i = 0; i < num_shards_; i++) {
shards_[i].~LRUCacheShard();
}
port::cacheline_aligned_free(shards_);
}
}
CacheShard* LRUCache::GetShard(uint32_t shard) {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
}
const CacheShard* LRUCache::GetShard(uint32_t shard) const {
return reinterpret_cast<CacheShard*>(&shards_[shard]);
});
}
void* LRUCache::Value(Handle* handle) {
@ -831,12 +810,8 @@ void* LRUCache::Value(Handle* handle) {
}
size_t LRUCache::GetCharge(Handle* handle) const {
CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata;
if (num_shards_ > 0) {
metadata_charge_policy = shards_[0].metadata_charge_policy_;
}
return reinterpret_cast<const LRUHandle*>(handle)->GetCharge(
metadata_charge_policy);
GetShard(0).metadata_charge_policy_);
}
Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
@ -848,32 +823,12 @@ Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
}
}
uint32_t LRUCache::GetHash(Handle* handle) const {
return reinterpret_cast<const LRUHandle*>(handle)->hash;
}
void LRUCache::DisownData() {
// Leak data only if that won't generate an ASAN/valgrind warning.
if (!kMustFreeHeapAllocations) {
shards_ = nullptr;
num_shards_ = 0;
}
}
size_t LRUCache::TEST_GetLRUSize() {
size_t lru_size_of_all_shards = 0;
for (int i = 0; i < num_shards_; i++) {
lru_size_of_all_shards += shards_[i].TEST_GetLRUSize();
}
return lru_size_of_all_shards;
return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
}
double LRUCache::GetHighPriPoolRatio() {
double result = 0.0;
if (num_shards_ > 0) {
result = shards_[0].GetHighPriPoolRatio();
}
return result;
return GetShard(0).GetHighPriPoolRatio();
}
void LRUCache::WaitAll(std::vector<Handle*>& handles) {
@ -899,22 +854,17 @@ void LRUCache::WaitAll(std::vector<Handle*>& handles) {
if (!lru_handle->IsPending()) {
continue;
}
uint32_t hash = GetHash(handle);
LRUCacheShard* shard = static_cast<LRUCacheShard*>(GetShard(Shard(hash)));
shard->Promote(lru_handle);
GetShard(lru_handle->hash).Promote(lru_handle);
}
}
}
std::string LRUCache::GetPrintableOptions() const {
std::string ret;
ret.reserve(20000);
ret.append(ShardedCache::GetPrintableOptions());
void LRUCache::AppendPrintableOptions(std::string& str) const {
ShardedCache::AppendPrintableOptions(str); // options from shard
if (secondary_cache_) {
ret.append(" secondary_cache:\n");
ret.append(secondary_cache_->GetPrintableOptions());
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
}
return ret;
}
} // namespace lru_cache
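The constructor and accessor changes above lean on helpers now provided by the templated base: InitShards runs a placement-new lambda over a raw shard array, and the destructor tears the shards down explicitly. A minimal sketch of that ownership pattern, using hypothetical ToyShard/ToyCache names and only standard C++ (not the RocksDB API):

```
#include <cstddef>
#include <cstdlib>
#include <new>

struct ToyShard {
  explicit ToyShard(size_t capacity) : capacity_(capacity) {}
  size_t capacity_;
};

class ToyCache {
 public:
  ToyCache(size_t num_shards, size_t per_shard)
      : num_shards_(num_shards),
        shards_(static_cast<ToyShard*>(
            std::malloc(sizeof(ToyShard) * num_shards))) {
    // "InitShards": placement-new each shard into the raw block.
    for (size_t i = 0; i < num_shards_; ++i) {
      new (&shards_[i]) ToyShard(per_shard);
    }
  }
  ~ToyCache() {
    // Destroy shards explicitly, then free the raw allocation.
    for (size_t i = 0; i < num_shards_; ++i) {
      shards_[i].~ToyShard();
    }
    std::free(shards_);
  }

 private:
  size_t num_shards_;
  ToyShard* shards_;
};

int main() { ToyCache cache(/*num_shards=*/4, /*per_shard=*/1024); }
```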

cache/lru_cache.h vendored

@ -53,7 +53,7 @@ struct LRUHandle {
Info() {}
~Info() {}
Cache::DeleterFn deleter;
const ShardedCache::CacheItemHelper* helper;
const Cache::CacheItemHelper* helper;
} info_;
// An entry is not added to the LRUHandleTable until the secondary cache
// lookup is complete, so it's safe to have this union.
@ -108,6 +108,9 @@ struct LRUHandle {
Slice key() const { return Slice(key_data, key_length); }
// For HandleImpl concept
uint32_t GetHash() const { return hash; }
// Increase the reference count by 1.
void Ref() { refs++; }
@ -262,9 +265,6 @@ struct LRUHandle {
// 4.4.3's builtin hashtable.
class LRUHandleTable {
public:
// If the table uses more hash bits than `max_upper_hash_bits`,
// it will eat into the bits used for sharding, which are constant
// for a given LRUHandleTable.
explicit LRUHandleTable(int max_upper_hash_bits);
~LRUHandleTable();
@ -273,8 +273,8 @@ class LRUHandleTable {
LRUHandle* Remove(const Slice& key, uint32_t hash);
template <typename T>
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
for (uint32_t i = index_begin; i < index_end; i++) {
void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
for (size_t i = index_begin; i < index_end; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
auto n = h->next_hash;
@ -313,23 +313,31 @@ class LRUHandleTable {
};
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
public:
LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits,
const std::shared_ptr<SecondaryCache>& secondary_cache);
virtual ~LRUCacheShard() override = default;
int max_upper_hash_bits, SecondaryCache* secondary_cache);
public: // Type definitions expected as parameter to ShardedCache
using HandleImpl = LRUHandle;
using HashVal = uint32_t;
using HashCref = uint32_t;
public: // Function definitions expected as parameter to ShardedCache
static inline HashVal ComputeHash(const Slice& key) {
return Lower32of64(GetSliceNPHash64(key));
}
// Separate from constructor so caller can easily make an array of LRUCache
// if current usage is more than new capacity, the function will attempt to
// free the needed space.
virtual void SetCapacity(size_t capacity) override;
void SetCapacity(size_t capacity);
// Set the flag to reject insertion if cache is full.
virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
void SetStrictCapacityLimit(bool strict_capacity_limit);
// Set percentage of capacity reserved for high-pri cache entries.
void SetHighPriorityPoolRatio(double high_pri_pool_ratio);
@ -338,58 +346,49 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
// Like Cache methods, but with an extra "hash" parameter.
virtual Status Insert(const Slice& key, uint32_t hash, void* value,
inline Status Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
Cache::Handle** handle,
Cache::Priority priority) override {
LRUHandle** handle, Cache::Priority priority) {
return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
}
virtual Status Insert(const Slice& key, uint32_t hash, void* value,
inline Status Insert(const Slice& key, uint32_t hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
Cache::Handle** handle,
Cache::Priority priority) override {
LRUHandle** handle, Cache::Priority priority) {
assert(helper);
return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
}
// If helper_cb is null, the values of the following arguments don't matter.
virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash,
const ShardedCache::CacheItemHelper* helper,
const ShardedCache::CreateCallback& create_cb,
ShardedCache::Priority priority, bool wait,
Statistics* stats) override;
virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override {
LRUHandle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait, Statistics* stats);
inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
nullptr);
}
virtual bool Release(Cache::Handle* handle, bool /*useful*/,
bool erase_if_last_ref) override {
return Release(handle, erase_if_last_ref);
}
virtual bool IsReady(Cache::Handle* /*handle*/) override;
virtual void Wait(Cache::Handle* /*handle*/) override {}
virtual bool Ref(Cache::Handle* handle) override;
virtual bool Release(Cache::Handle* handle,
bool erase_if_last_ref = false) override;
virtual void Erase(const Slice& key, uint32_t hash) override;
bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
bool IsReady(LRUHandle* /*handle*/);
void Wait(LRUHandle* /*handle*/) {}
bool Ref(LRUHandle* handle);
void Erase(const Slice& key, uint32_t hash);
// Although in some platforms the update of size_t is atomic, to make sure
// GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
// protect them with mutex_.
virtual size_t GetUsage() const override;
virtual size_t GetPinnedUsage() const override;
virtual size_t GetOccupancyCount() const override;
virtual size_t GetTableAddressCount() const override;
size_t GetUsage() const;
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
size_t GetTableAddressCount() const;
virtual void ApplyToSomeEntries(
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) override;
size_t average_entries_per_lock, size_t* state);
virtual void EraseUnRefEntries() override;
virtual std::string GetPrintableOptions() const override;
void EraseUnRefEntries();
public: // other function definitions
void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
LRUHandle** lru_bottom_pri);
@ -403,17 +402,19 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// Retrieves low pri pool ratio
double GetLowPriPoolRatio();
void AppendPrintableOptions(std::string& /*str*/) const;
private:
friend class LRUCache;
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status InsertItem(LRUHandle* item, Cache::Handle** handle,
Status InsertItem(LRUHandle* item, LRUHandle** handle,
bool free_handle_on_fail);
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
DeleterFn deleter, const Cache::CacheItemHelper* helper,
Cache::Handle** handle, Cache::Priority priority);
LRUHandle** handle, Cache::Priority priority);
// Promote an item looked up from the secondary cache to the LRU cache.
// The item may be still in the secondary cache.
// It is only inserted into the hash table and not the LRU list, and only
@ -500,14 +501,15 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
std::shared_ptr<SecondaryCache> secondary_cache_;
// Owned by LRUCache
SecondaryCache* secondary_cache_;
};
class LRUCache
#ifdef NDEBUG
final
#endif
: public ShardedCache {
: public ShardedCache<LRUCacheShard> {
public:
LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
@ -515,27 +517,21 @@ class LRUCache
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata,
const std::shared_ptr<SecondaryCache>& secondary_cache = nullptr);
virtual ~LRUCache();
virtual const char* Name() const override { return "LRUCache"; }
virtual CacheShard* GetShard(uint32_t shard) override;
virtual const CacheShard* GetShard(uint32_t shard) const override;
virtual void* Value(Handle* handle) override;
virtual size_t GetCharge(Handle* handle) const override;
virtual uint32_t GetHash(Handle* handle) const override;
virtual DeleterFn GetDeleter(Handle* handle) const override;
virtual void DisownData() override;
virtual void WaitAll(std::vector<Handle*>& handles) override;
std::string GetPrintableOptions() const override;
std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
const char* Name() const override { return "LRUCache"; }
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void WaitAll(std::vector<Handle*>& handles) override;
// Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio.
double GetHighPriPoolRatio();
void AppendPrintableOptions(std::string& str) const override;
private:
LRUCacheShard* shards_ = nullptr;
int num_shards_ = 0;
std::shared_ptr<SecondaryCache> secondary_cache_;
};


@ -67,7 +67,7 @@ class LRUCacheTest : public testing::Test {
bool Lookup(const std::string& key) {
auto handle = cache_->Lookup(key, 0 /*hash*/);
if (handle) {
cache_->Release(handle);
cache_->Release(handle, true /*useful*/, false /*erase*/);
return true;
}
return false;
@ -529,22 +529,27 @@ class ClockCacheTest : public testing::Test {
kDontChargeCacheMetadata);
}
Status Insert(const std::string& key,
Status Insert(const UniqueId64x2& hashed_key,
Cache::Priority priority = Cache::Priority::LOW) {
return shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
nullptr /*deleter*/, nullptr /*handle*/, priority);
return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/,
1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
priority);
}
Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
return Insert(std::string(kCacheKeySize, key), priority);
return Insert(TestHashedKey(key), priority);
}
Status InsertWithLen(char key, size_t len) {
return Insert(std::string(len, key));
std::string skey(len, key);
return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/,
1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
Cache::Priority::LOW);
}
bool Lookup(const std::string& key, bool useful = true) {
auto handle = shard_->Lookup(key, 0 /*hash*/);
bool Lookup(const Slice& key, const UniqueId64x2& hashed_key,
bool useful = true) {
auto handle = shard_->Lookup(key, hashed_key);
if (handle) {
shard_->Release(handle, useful, /*erase_if_last_ref=*/false);
return true;
@ -552,44 +557,29 @@ class ClockCacheTest : public testing::Test {
return false;
}
bool Lookup(const UniqueId64x2& hashed_key, bool useful = true) {
return Lookup(TestKey(hashed_key), hashed_key, useful);
}
bool Lookup(char key, bool useful = true) {
return Lookup(std::string(kCacheKeySize, key), useful);
return Lookup(TestHashedKey(key), useful);
}
void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); }
#if 0 // FIXME
size_t CalcEstimatedHandleChargeWrapper(
size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
return ClockCacheShard::CalcEstimatedHandleCharge(estimated_value_size,
metadata_charge_policy);
void Erase(char key) {
UniqueId64x2 hashed_key = TestHashedKey(key);
shard_->Erase(TestKey(hashed_key), hashed_key);
}
int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
return ClockCacheShard::CalcHashBits(capacity, estimated_value_size,
metadata_charge_policy);
static inline Slice TestKey(const UniqueId64x2& hashed_key) {
return Slice(reinterpret_cast<const char*>(&hashed_key), 16U);
}
// Maximum number of items that a shard can hold.
double CalcMaxOccupancy(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy) {
size_t handle_charge = ClockCacheShard::CalcEstimatedHandleCharge(
estimated_value_size, metadata_charge_policy);
return capacity / (kLoadFactor * handle_charge);
static inline UniqueId64x2 TestHashedKey(char key) {
// For testing hash near-collision behavior, put the variance in
// hashed_key in bits that are unlikely to be used as hash bits.
return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};
}
bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) {
if (hash_bits == 0) {
return max_occupancy <= 1;
} else {
return (1 << hash_bits >= max_occupancy) &&
(1 << (hash_bits - 1) <= max_occupancy);
}
}
#endif
ClockCacheShard* shard_ = nullptr;
};
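The test helpers above reflect the new key scheme in ClockCacheShard: the 128-bit hashed key doubles as the 16-byte cache key, so the handle no longer needs a separate `hash` field. A small sketch of that identity, with std::array standing in for UniqueId64x2 (standard C++ only):

```
#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>

using UniqueId64x2 = std::array<uint64_t, 2>;

int main() {
  // As in TestHashedKey above: the variance lives in high bits of word 0.
  UniqueId64x2 hashed_key = {(uint64_t{'x'} << 56) + 1234U, 5678U};
  // As in TestKey above: the key is just the 16 raw bytes of the hash.
  char key_bytes[16];
  std::memcpy(key_bytes, hashed_key.data(), sizeof(key_bytes));
  std::cout << "key length = " << sizeof(key_bytes)
            << ", hash words = " << hashed_key[0] << ", " << hashed_key[1]
            << "\n";
}
```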
@ -607,10 +597,10 @@ TEST_F(ClockCacheTest, Misc) {
// Some of this is motivated by code coverage
std::string wrong_size_key(15, 'x');
EXPECT_FALSE(Lookup(wrong_size_key));
EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x')));
EXPECT_FALSE(shard_->Ref(nullptr));
EXPECT_FALSE(shard_->Release(nullptr));
shard_->Erase(wrong_size_key, /*hash*/ 42); // no-op
shard_->Erase(wrong_size_key, TestHashedKey('x')); // no-op
}
TEST_F(ClockCacheTest, Limits) {
@ -622,11 +612,11 @@ TEST_F(ClockCacheTest, Limits) {
// Also tests switching between strict limit and not
shard_->SetStrictCapacityLimit(strict_capacity_limit);
std::string key(16, 'x');
UniqueId64x2 hkey = TestHashedKey('x');
// Single entry charge beyond capacity
{
Status s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/,
Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
5 /*charge*/, nullptr /*deleter*/,
nullptr /*handle*/, Cache::Priority::LOW);
if (strict_capacity_limit) {
@ -638,9 +628,10 @@ TEST_F(ClockCacheTest, Limits) {
// Single entry fills capacity
{
Cache::Handle* h;
ASSERT_OK(shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 3 /*charge*/,
nullptr /*deleter*/, &h, Cache::Priority::LOW));
ClockHandle* h;
ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
3 /*charge*/, nullptr /*deleter*/, &h,
Cache::Priority::LOW));
// Try to insert more
Status s = Insert('a');
if (strict_capacity_limit) {
@ -657,11 +648,11 @@ TEST_F(ClockCacheTest, Limits) {
// entries) to exceed occupancy limit.
{
size_t n = shard_->GetTableAddressCount() + 1;
std::unique_ptr<Cache::Handle* []> ha { new Cache::Handle* [n] {} };
std::unique_ptr<ClockHandle* []> ha { new ClockHandle* [n] {} };
Status s;
for (size_t i = 0; i < n && s.ok(); ++i) {
EncodeFixed64(&key[0], i);
s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 0 /*charge*/,
hkey[1] = i;
s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, 0 /*charge*/,
nullptr /*deleter*/, &ha[i], Cache::Priority::LOW);
if (i == 0) {
EXPECT_OK(s);
@ -807,12 +798,11 @@ void IncrementIntDeleter(const Slice& /*key*/, void* value) {
// Testing calls to CorrectNearOverflow in Release
TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
NewShard(6, /*strict_capacity_limit*/ false);
Cache::Handle* h;
ClockHandle* h;
int deleted = 0;
std::string my_key(kCacheKeySize, 'x');
uint32_t my_hash = 42;
ASSERT_OK(shard_->Insert(my_key, my_hash, &deleted, 1, IncrementIntDeleter,
&h, Cache::Priority::HIGH));
UniqueId64x2 hkey = TestHashedKey('x');
ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1,
IncrementIntDeleter, &h, Cache::Priority::HIGH));
// Some large number outstanding
shard_->TEST_RefN(h, 123456789);
@ -822,7 +812,7 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
shard_->TEST_ReleaseN(h, 1234567);
}
// Mark it invisible (to reach a different CorrectNearOverflow() in Release)
shard_->Erase(my_key, my_hash);
shard_->Erase(TestKey(hkey), hkey);
// Simulate many more lookup/ref + release (one-by-one would be too
// expensive for unit test)
for (int i = 0; i < 10000; ++i) {
@ -844,63 +834,65 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
NewShard(6, /*strict_capacity_limit*/ false);
int deleted = 0;
std::string key1(kCacheKeySize, 'x');
std::string key2(kCacheKeySize, 'y');
std::string key3(kCacheKeySize, 'z');
uint32_t my_hash = 42;
Cache::Handle* h1;
ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, &h1,
UniqueId64x2 hkey1 = TestHashedKey('x');
Slice key1 = TestKey(hkey1);
UniqueId64x2 hkey2 = TestHashedKey('y');
Slice key2 = TestKey(hkey2);
UniqueId64x2 hkey3 = TestHashedKey('z');
Slice key3 = TestKey(hkey3);
ClockHandle* h1;
ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1,
Cache::Priority::HIGH));
Cache::Handle* h2;
ASSERT_OK(shard_->Insert(key2, my_hash, &deleted, 1, IncrementIntDeleter, &h2,
ClockHandle* h2;
ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2,
Cache::Priority::HIGH));
Cache::Handle* h3;
ASSERT_OK(shard_->Insert(key3, my_hash, &deleted, 1, IncrementIntDeleter, &h3,
ClockHandle* h3;
ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3,
Cache::Priority::HIGH));
// Can repeatedly lookup+release despite the hash collision
Cache::Handle* tmp_h;
ClockHandle* tmp_h;
for (bool erase_if_last_ref : {true, false}) { // but not last ref
tmp_h = shard_->Lookup(key1, my_hash);
tmp_h = shard_->Lookup(key1, hkey1);
ASSERT_EQ(h1, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
tmp_h = shard_->Lookup(key2, my_hash);
tmp_h = shard_->Lookup(key2, hkey2);
ASSERT_EQ(h2, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
tmp_h = shard_->Lookup(key3, my_hash);
tmp_h = shard_->Lookup(key3, hkey3);
ASSERT_EQ(h3, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
}
// Make h1 invisible
shard_->Erase(key1, my_hash);
shard_->Erase(key1, hkey1);
// Redundant erase
shard_->Erase(key1, my_hash);
shard_->Erase(key1, hkey1);
// All still alive
ASSERT_EQ(deleted, 0);
// Invisible to Lookup
tmp_h = shard_->Lookup(key1, my_hash);
tmp_h = shard_->Lookup(key1, hkey1);
ASSERT_EQ(nullptr, tmp_h);
// Can still find h2, h3
for (bool erase_if_last_ref : {true, false}) { // but not last ref
tmp_h = shard_->Lookup(key2, my_hash);
tmp_h = shard_->Lookup(key2, hkey2);
ASSERT_EQ(h2, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
tmp_h = shard_->Lookup(key3, my_hash);
tmp_h = shard_->Lookup(key3, hkey3);
ASSERT_EQ(h3, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
}
// Also Insert with invisible entry there
ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter,
ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter,
nullptr, Cache::Priority::HIGH));
tmp_h = shard_->Lookup(key1, my_hash);
tmp_h = shard_->Lookup(key1, hkey1);
// Found but distinct handle
ASSERT_NE(nullptr, tmp_h);
ASSERT_NE(h1, tmp_h);
@ -918,11 +910,11 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
// Can still find h2, h3
for (bool erase_if_last_ref : {true, false}) { // but not last ref
tmp_h = shard_->Lookup(key2, my_hash);
tmp_h = shard_->Lookup(key2, hkey2);
ASSERT_EQ(h2, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
tmp_h = shard_->Lookup(key3, my_hash);
tmp_h = shard_->Lookup(key3, hkey3);
ASSERT_EQ(h3, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
}
@ -934,7 +926,7 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
ASSERT_EQ(deleted, 0);
// Can still find it
tmp_h = shard_->Lookup(key2, my_hash);
tmp_h = shard_->Lookup(key2, hkey2);
ASSERT_EQ(h2, tmp_h);
// Release last ref on h2, with erase
@ -942,12 +934,12 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
// h2 deleted
ASSERT_EQ(deleted--, 1);
tmp_h = shard_->Lookup(key2, my_hash);
tmp_h = shard_->Lookup(key2, hkey2);
ASSERT_EQ(nullptr, tmp_h);
// Can still find h3
for (bool erase_if_last_ref : {true, false}) { // but not last ref
tmp_h = shard_->Lookup(key3, my_hash);
tmp_h = shard_->Lookup(key3, hkey3);
ASSERT_EQ(h3, tmp_h);
ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
}
@ -959,11 +951,11 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
ASSERT_EQ(deleted, 0);
// Explicit erase
shard_->Erase(key3, my_hash);
shard_->Erase(key3, hkey3);
// h3 deleted
ASSERT_EQ(deleted--, 1);
tmp_h = shard_->Lookup(key3, my_hash);
tmp_h = shard_->Lookup(key3, hkey3);
ASSERT_EQ(nullptr, tmp_h);
}
@ -1371,9 +1363,11 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) {
std::string str2 = rnd.RandomString(1020);
TestItem* item2 = new TestItem(str2.data(), str2.length());
// k1 should be demoted to NVM
ASSERT_EQ(secondary_cache->num_inserts(), 0u);
ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
&LRUCacheSecondaryCacheTest::helper_fail_,
str2.length()));
ASSERT_EQ(secondary_cache->num_inserts(), 1u);
Cache::Handle* handle;
handle =

cache/sharded_cache.cc vendored

@ -19,184 +19,49 @@
namespace ROCKSDB_NAMESPACE {
namespace {
inline uint32_t HashSlice(const Slice& s) {
return Lower32of64(GetSliceNPHash64(s));
}
} // namespace
ShardedCache::ShardedCache(size_t capacity, int num_shard_bits,
ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: Cache(std::move(allocator)),
last_id_(1),
shard_mask_((uint32_t{1} << num_shard_bits) - 1),
capacity_(capacity),
strict_capacity_limit_(strict_capacity_limit),
last_id_(1) {}
capacity_(capacity) {}
void ShardedCache::SetCapacity(size_t capacity) {
size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
uint32_t num_shards = GetNumShards();
const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
MutexLock l(&capacity_mutex_);
for (uint32_t s = 0; s < num_shards; s++) {
GetShard(s)->SetCapacity(per_shard);
}
capacity_ = capacity;
return (capacity + (num_shards - 1)) / num_shards;
}
void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) {
uint32_t num_shards = GetNumShards();
MutexLock l(&capacity_mutex_);
for (uint32_t s = 0; s < num_shards; s++) {
GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit);
}
strict_capacity_limit_ = strict_capacity_limit;
size_t ShardedCacheBase::GetPerShardCapacity() const {
return ComputePerShardCapacity(GetCapacity());
}
Status ShardedCache::Insert(const Slice& key, void* value, size_t charge,
DeleterFn deleter, Handle** handle,
Priority priority) {
uint32_t hash = HashSlice(key);
return GetShard(Shard(hash))
->Insert(key, hash, value, charge, deleter, handle, priority);
}
Status ShardedCache::Insert(const Slice& key, void* value,
const CacheItemHelper* helper, size_t charge,
Handle** handle, Priority priority) {
uint32_t hash = HashSlice(key);
if (!helper) {
return Status::InvalidArgument();
}
return GetShard(Shard(hash))
->Insert(key, hash, value, helper, charge, handle, priority);
}
Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) {
uint32_t hash = HashSlice(key);
return GetShard(Shard(hash))->Lookup(key, hash);
}
Cache::Handle* ShardedCache::Lookup(const Slice& key,
const CacheItemHelper* helper,
const CreateCallback& create_cb,
Priority priority, bool wait,
Statistics* stats) {
uint32_t hash = HashSlice(key);
return GetShard(Shard(hash))
->Lookup(key, hash, helper, create_cb, priority, wait, stats);
}
bool ShardedCache::IsReady(Handle* handle) {
uint32_t hash = GetHash(handle);
return GetShard(Shard(hash))->IsReady(handle);
}
void ShardedCache::Wait(Handle* handle) {
uint32_t hash = GetHash(handle);
GetShard(Shard(hash))->Wait(handle);
}
bool ShardedCache::Ref(Handle* handle) {
uint32_t hash = GetHash(handle);
return GetShard(Shard(hash))->Ref(handle);
}
bool ShardedCache::Release(Handle* handle, bool erase_if_last_ref) {
uint32_t hash = GetHash(handle);
return GetShard(Shard(hash))->Release(handle, erase_if_last_ref);
}
bool ShardedCache::Release(Handle* handle, bool useful,
bool erase_if_last_ref) {
uint32_t hash = GetHash(handle);
return GetShard(Shard(hash))->Release(handle, useful, erase_if_last_ref);
}
void ShardedCache::Erase(const Slice& key) {
uint32_t hash = HashSlice(key);
GetShard(Shard(hash))->Erase(key, hash);
}
uint64_t ShardedCache::NewId() {
uint64_t ShardedCacheBase::NewId() {
return last_id_.fetch_add(1, std::memory_order_relaxed);
}
size_t ShardedCache::GetCapacity() const {
MutexLock l(&capacity_mutex_);
size_t ShardedCacheBase::GetCapacity() const {
MutexLock l(&config_mutex_);
return capacity_;
}
bool ShardedCache::HasStrictCapacityLimit() const {
MutexLock l(&capacity_mutex_);
bool ShardedCacheBase::HasStrictCapacityLimit() const {
MutexLock l(&config_mutex_);
return strict_capacity_limit_;
}
size_t ShardedCache::GetUsage() const {
// We will not lock the cache when getting the usage from shards.
uint32_t num_shards = GetNumShards();
size_t usage = 0;
for (uint32_t s = 0; s < num_shards; s++) {
usage += GetShard(s)->GetUsage();
}
return usage;
}
size_t ShardedCache::GetUsage(Handle* handle) const {
size_t ShardedCacheBase::GetUsage(Handle* handle) const {
return GetCharge(handle);
}
size_t ShardedCache::GetPinnedUsage() const {
// We will not lock the cache when getting the usage from shards.
uint32_t num_shards = GetNumShards();
size_t usage = 0;
for (uint32_t s = 0; s < num_shards; s++) {
usage += GetShard(s)->GetPinnedUsage();
}
return usage;
}
void ShardedCache::ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const ApplyToAllEntriesOptions& opts) {
uint32_t num_shards = GetNumShards();
// Iterate over part of each shard, rotating between shards, to
// minimize impact on latency of concurrent operations.
std::unique_ptr<uint32_t[]> states(new uint32_t[num_shards]{});
uint32_t aepl_in_32 = static_cast<uint32_t>(
std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock));
aepl_in_32 = std::min(aepl_in_32, uint32_t{1});
bool remaining_work;
do {
remaining_work = false;
for (uint32_t s = 0; s < num_shards; s++) {
if (states[s] != UINT32_MAX) {
GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]);
remaining_work |= states[s] != UINT32_MAX;
}
}
} while (remaining_work);
}
void ShardedCache::EraseUnRefEntries() {
uint32_t num_shards = GetNumShards();
for (uint32_t s = 0; s < num_shards; s++) {
GetShard(s)->EraseUnRefEntries();
}
}
std::string ShardedCache::GetPrintableOptions() const {
std::string ShardedCacheBase::GetPrintableOptions() const {
std::string ret;
ret.reserve(20000);
const int kBufferSize = 200;
char buffer[kBufferSize];
{
MutexLock l(&capacity_mutex_);
MutexLock l(&config_mutex_);
snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n",
capacity_);
ret.append(buffer);
@ -210,7 +75,7 @@ std::string ShardedCache::GetPrintableOptions() const {
snprintf(buffer, kBufferSize, " memory_allocator : %s\n",
memory_allocator() ? memory_allocator()->Name() : "None");
ret.append(buffer);
ret.append(GetShard(0)->GetPrintableOptions());
AppendPrintableOptions(ret);
return ret;
}
@ -226,25 +91,10 @@ int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) {
return num_shard_bits;
}
int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); }
uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; }
size_t ShardedCache::GetOccupancyCount() const {
size_t oc = 0;
uint32_t num_shards = GetNumShards();
for (uint32_t s = 0; s < num_shards; s++) {
oc += GetShard(s)->GetOccupancyCount();
}
return oc;
}
size_t ShardedCache::GetTableAddressCount() const {
size_t tac = 0;
uint32_t num_shards = GetNumShards();
for (uint32_t s = 0; s < num_shards; s++) {
tac += GetShard(s)->GetTableAddressCount();
}
return tac;
int ShardedCacheBase::GetNumShardBits() const {
return BitsSetToOne(shard_mask_);
}
uint32_t ShardedCacheBase::GetNumShards() const { return shard_mask_ + 1; }
} // namespace ROCKSDB_NAMESPACE
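ComputePerShardCapacity above is a ceiling division, so the per-shard budgets always cover the requested total. A tiny worked sketch with made-up numbers:

```
#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  const size_t capacity = 1000;      // hypothetical total capacity
  const uint32_t num_shards = 16;
  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
  std::cout << per_shard << "\n";    // 63, and 16 * 63 = 1008 >= 1000
}
```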

cache/sharded_cache.h vendored

@ -10,122 +10,309 @@
#pragma once
#include <atomic>
#include <cstdint>
#include <string>
#include "port/lang.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "util/hash.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
// Single cache shard interface.
class CacheShard {
// Optional base class for classes implementing the CacheShard concept
class CacheShardBase {
public:
explicit CacheShard(CacheMetadataChargePolicy metadata_charge_policy)
explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy)
: metadata_charge_policy_(metadata_charge_policy) {}
virtual ~CacheShard() = default;
using DeleterFn = Cache::DeleterFn;
virtual Status Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, DeleterFn deleter,
Cache::Handle** handle, Cache::Priority priority) = 0;
virtual Status Insert(const Slice& key, uint32_t hash, void* value,
// Expected by concept CacheShard (TODO with C++20 support)
// Some Defaults
std::string GetPrintableOptions() const { return ""; }
using HashVal = uint64_t;
using HashCref = uint64_t;
static inline HashVal ComputeHash(const Slice& key) {
return GetSliceNPHash64(key);
}
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Lower32of64(hash);
}
void AppendPrintableOptions(std::string& /*str*/) const {}
// Must be provided for concept CacheShard (TODO with C++20 support)
/*
struct HandleImpl { // for concept HandleImpl
HashVal hash;
HashCref GetHash() const;
...
};
Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
DeleterFn deleter, HandleImpl** handle,
Cache::Priority priority) = 0;
Status Insert(const Slice& key, HashCref hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
Cache::Handle** handle, Cache::Priority priority) = 0;
virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0;
virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash,
HandleImpl** handle, Cache::Priority priority) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Statistics* stats) = 0;
virtual bool Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref) = 0;
virtual bool IsReady(Cache::Handle* handle) = 0;
virtual void Wait(Cache::Handle* handle) = 0;
virtual bool Ref(Cache::Handle* handle) = 0;
virtual bool Release(Cache::Handle* handle, bool erase_if_last_ref) = 0;
virtual void Erase(const Slice& key, uint32_t hash) = 0;
virtual void SetCapacity(size_t capacity) = 0;
virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
virtual size_t GetUsage() const = 0;
virtual size_t GetPinnedUsage() const = 0;
virtual size_t GetOccupancyCount() const = 0;
virtual size_t GetTableAddressCount() const = 0;
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
bool IsReady(HandleImpl* handle) = 0;
void Wait(HandleImpl* handle) = 0;
bool Ref(HandleImpl* handle) = 0;
void Erase(const Slice& key, HashCref hash) = 0;
void SetCapacity(size_t capacity) = 0;
void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
size_t GetUsage() const = 0;
size_t GetPinnedUsage() const = 0;
size_t GetOccupancyCount() const = 0;
size_t GetTableAddressCount() const = 0;
// Handles iterating over roughly `average_entries_per_lock` entries, using
// `state` to somehow record where it last ended up. Caller initially uses
// *state == 0 and implementation sets *state = UINT32_MAX to indicate
// *state == 0 and implementation sets *state = SIZE_MAX to indicate
// completion.
virtual void ApplyToSomeEntries(
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
uint32_t average_entries_per_lock, uint32_t* state) = 0;
virtual void EraseUnRefEntries() = 0;
virtual std::string GetPrintableOptions() const { return ""; }
size_t average_entries_per_lock, size_t* state) = 0;
void EraseUnRefEntries() = 0;
*/
protected:
const CacheMetadataChargePolicy metadata_charge_policy_;
};
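The commented-out block above documents the compile-time CacheShard interface rather than enforcing it. Below is a hedged, heavily reduced sketch of a type meeting a few of those expectations, with std::hash standing in for RocksDB's hashing, to show why calls made through a templated owner involve no virtual dispatch:

```
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

struct ToyShard {
  using HashVal = uint64_t;
  using HashCref = uint64_t;
  struct HandleImpl {
    HashVal hash;
    HashCref GetHash() const { return hash; }
  };
  static HashVal ComputeHash(const std::string& key) {
    return std::hash<std::string>{}(key);  // stand-in for GetSliceNPHash64
  }
  static uint32_t HashPieceForSharding(HashCref hash) {
    return static_cast<uint32_t>(hash);
  }
  size_t GetUsage() const { return usage_; }
  size_t usage_ = 0;
};

// The owner is templated on the shard type, so these are ordinary
// (inlinable) member calls, not virtual ones.
template <class Shard>
size_t TotalUsage(const Shard* shards, uint32_t n) {
  size_t sum = 0;
  for (uint32_t i = 0; i < n; ++i) sum += shards[i].GetUsage();
  return sum;
}

int main() {
  ToyShard shards[4];
  shards[2].usage_ = 42;
  std::cout << TotalUsage(shards, 4) << "\n";  // 42
}
```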
// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits
// shards will be created, with capacity split evenly to each of the shards.
// Keys are sharded by the highest num_shard_bits bits of hash value.
class ShardedCache : public Cache {
// Portions of ShardedCache that do not depend on the template parameter
class ShardedCacheBase : public Cache {
public:
ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr);
virtual ~ShardedCache() = default;
virtual CacheShard* GetShard(uint32_t shard) = 0;
virtual const CacheShard* GetShard(uint32_t shard) const = 0;
virtual uint32_t GetHash(Handle* handle) const = 0;
virtual void SetCapacity(size_t capacity) override;
virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
virtual Status Insert(const Slice& key, void* value, size_t charge,
DeleterFn deleter, Handle** handle,
Priority priority) override;
virtual Status Insert(const Slice& key, void* value,
const CacheItemHelper* helper, size_t charge,
Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
virtual Handle* Lookup(const Slice& key, Statistics* stats) override;
virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
const CreateCallback& create_cb, Priority priority,
bool wait, Statistics* stats = nullptr) override;
virtual bool Release(Handle* handle, bool useful,
bool erase_if_last_ref = false) override;
virtual bool IsReady(Handle* handle) override;
virtual void Wait(Handle* handle) override;
virtual bool Ref(Handle* handle) override;
virtual bool Release(Handle* handle, bool erase_if_last_ref = false) override;
virtual void Erase(const Slice& key) override;
virtual uint64_t NewId() override;
virtual size_t GetCapacity() const override;
virtual bool HasStrictCapacityLimit() const override;
virtual size_t GetUsage() const override;
virtual size_t GetUsage(Handle* handle) const override;
virtual size_t GetPinnedUsage() const override;
virtual size_t GetOccupancyCount() const override;
virtual size_t GetTableAddressCount() const override;
virtual void ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const ApplyToAllEntriesOptions& opts) override;
virtual void EraseUnRefEntries() override;
virtual std::string GetPrintableOptions() const override;
ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> memory_allocator);
virtual ~ShardedCacheBase() = default;
int GetNumShardBits() const;
uint32_t GetNumShards() const;
uint64_t NewId() override;
bool HasStrictCapacityLimit() const override;
size_t GetCapacity() const override;
using Cache::GetUsage;
size_t GetUsage(Handle* handle) const override;
std::string GetPrintableOptions() const override;
protected: // fns
virtual void AppendPrintableOptions(std::string& str) const = 0;
size_t GetPerShardCapacity() const;
size_t ComputePerShardCapacity(size_t capacity) const;
protected: // data
std::atomic<uint64_t> last_id_; // For NewId
const uint32_t shard_mask_;
// Dynamic configuration parameters, guarded by config_mutex_
bool strict_capacity_limit_;
size_t capacity_;
mutable port::Mutex config_mutex_;
};
// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits
// shards will be created, with capacity split evenly to each of the shards.
// Keys are typically sharded by the lowest num_shard_bits bits of hash value
// so that the upper bits of the hash value can keep a stable ordering of
// table entries even as the table grows (using more upper hash bits).
// See CacheShardBase above for what is expected of the CacheShard parameter.
template <class CacheShard>
class ShardedCache : public ShardedCacheBase {
public:
using HashVal = typename CacheShard::HashVal;
using HashCref = typename CacheShard::HashCref;
using HandleImpl = typename CacheShard::HandleImpl;
ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
allocator),
shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
sizeof(CacheShard) * GetNumShards()))),
destroy_shards_in_dtor_(false) {}
virtual ~ShardedCache() {
if (destroy_shards_in_dtor_) {
ForEachShard([](CacheShard* cs) { cs->~CacheShard(); });
}
port::cacheline_aligned_free(shards_);
}
CacheShard& GetShard(HashCref hash) {
return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
}
const CacheShard& GetShard(HashCref hash) const {
return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
}
void SetCapacity(size_t capacity) override {
MutexLock l(&config_mutex_);
capacity_ = capacity;
auto per_shard = ComputePerShardCapacity(capacity);
ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); });
}
void SetStrictCapacityLimit(bool s_c_l) override {
MutexLock l(&config_mutex_);
strict_capacity_limit_ = s_c_l;
ForEachShard(
[s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
}
Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
Handle** handle, Priority priority) override {
HashVal hash = CacheShard::ComputeHash(key);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
priority);
}
Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override {
if (!helper) {
return Status::InvalidArgument();
}
HashVal hash = CacheShard::ComputeHash(key);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
priority);
}
Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash);
return reinterpret_cast<Handle*>(result);
}
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
const CreateCallback& create_cb, Priority priority, bool wait,
Statistics* stats = nullptr) override {
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
priority, wait, stats);
return reinterpret_cast<Handle*>(result);
}
void Erase(const Slice& key) override {
HashVal hash = CacheShard::ComputeHash(key);
GetShard(hash).Erase(key, hash);
}
bool Release(Handle* handle, bool useful,
bool erase_if_last_ref = false) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
}
bool IsReady(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).IsReady(h);
}
void Wait(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
GetShard(h->GetHash()).Wait(h);
}
bool Ref(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Ref(h);
}
bool Release(Handle* handle, bool erase_if_last_ref = false) override {
return Release(handle, true /*useful*/, erase_if_last_ref);
}
using ShardedCacheBase::GetUsage;
size_t GetUsage() const override {
return SumOverShards2(&CacheShard::GetUsage);
}
size_t GetPinnedUsage() const override {
return SumOverShards2(&CacheShard::GetPinnedUsage);
}
size_t GetOccupancyCount() const override {
return SumOverShards2(&CacheShard::GetOccupancyCount);
}
size_t GetTableAddressCount() const override {
return SumOverShards2(&CacheShard::GetTableAddressCount);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const ApplyToAllEntriesOptions& opts) override {
uint32_t num_shards = GetNumShards();
// Iterate over part of each shard, rotating between shards, to
// minimize impact on latency of concurrent operations.
std::unique_ptr<size_t[]> states(new size_t[num_shards]{});
size_t aepl = opts.average_entries_per_lock;
aepl = std::min(aepl, size_t{1});
bool remaining_work;
do {
remaining_work = false;
for (uint32_t i = 0; i < num_shards; i++) {
if (states[i] != SIZE_MAX) {
shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
remaining_work |= states[i] != SIZE_MAX;
}
}
} while (remaining_work);
}
virtual void EraseUnRefEntries() override {
ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); });
}
void DisownData() override {
// Leak data only if that won't generate an ASAN/valgrind warning.
if (!kMustFreeHeapAllocations) {
destroy_shards_in_dtor_ = false;
}
}
protected:
inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; }
inline void ForEachShard(const std::function<void(CacheShard*)>& fn) {
uint32_t num_shards = GetNumShards();
for (uint32_t i = 0; i < num_shards; i++) {
fn(shards_ + i);
}
}
inline size_t SumOverShards(
const std::function<size_t(CacheShard&)>& fn) const {
uint32_t num_shards = GetNumShards();
size_t result = 0;
for (uint32_t i = 0; i < num_shards; i++) {
result += fn(shards_[i]);
}
return result;
}
inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const {
return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); });
}
// Must be called exactly once by derived class constructor
void InitShards(const std::function<void(CacheShard*)>& placement_new) {
ForEachShard(placement_new);
destroy_shards_in_dtor_ = true;
}
void AppendPrintableOptions(std::string& str) const override {
shards_[0].AppendPrintableOptions(str);
}
private:
const uint32_t shard_mask_;
mutable port::Mutex capacity_mutex_;
size_t capacity_;
bool strict_capacity_limit_;
std::atomic<uint64_t> last_id_;
CacheShard* const shards_;
bool destroy_shards_in_dtor_;
};
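As the class comment above says, the shard is picked from the lowest num_shard_bits bits of the hash (HashPieceForSharding masked with shard_mask_), leaving the upper bits for the in-shard table. A small sketch of that arithmetic with hypothetical values:

```
#include <cstdint>
#include <iostream>

int main() {
  const int num_shard_bits = 4;                          // 2^4 = 16 shards
  const uint32_t shard_mask = (uint32_t{1} << num_shard_bits) - 1;
  const uint64_t hash = 0x123456789abcdef0ULL;           // ComputeHash(key)
  const uint32_t lower32 = static_cast<uint32_t>(hash);  // HashPieceForSharding
  const uint32_t shard = lower32 & shard_mask;           // index into shards_
  std::cout << "shard " << shard << " of " << (shard_mask + 1) << "\n";
}
```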
// 512KB is traditional minimum shard size.


@ -613,7 +613,7 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {
&new_cf_opt));
ASSERT_NE(new_cf_opt.blob_cache, nullptr);
ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL);
ASSERT_EQ(static_cast<ShardedCache*>(new_cf_opt.blob_cache.get())
ASSERT_EQ(static_cast<ShardedCacheBase*>(new_cf_opt.blob_cache.get())
->GetNumShardBits(),
4);
ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true);
@ -1064,15 +1064,18 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
&new_opt));
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache_compressed)->GetHighPriPoolRatio(),
@ -1088,8 +1091,8 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL);
// Default values
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(),
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity()));
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
@ -1098,10 +1101,11 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL);
// Default values
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(),
GetDefaultCacheShardBits(
new_opt.block_cache_compressed->GetCapacity()));
ASSERT_EQ(
std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity()));
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),
@ -1115,15 +1119,18 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
"high_pri_pool_ratio=0.0;}",
&new_opt));
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 5);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
5);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 5);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
5);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),
@ -1139,16 +1146,19 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
&new_opt));
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
->GetHighPriPoolRatio(),
0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),
@ -2790,7 +2800,7 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) {
&new_cf_opt));
ASSERT_NE(new_cf_opt.blob_cache, nullptr);
ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL);
ASSERT_EQ(static_cast<ShardedCache*>(new_cf_opt.blob_cache.get())
ASSERT_EQ(static_cast<ShardedCacheBase*>(new_cf_opt.blob_cache.get())
->GetNumShardBits(),
4);
ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true);
@ -2970,15 +2980,18 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
&new_opt));
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache_compressed)->GetHighPriPoolRatio(),
@ -2993,8 +3006,8 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL);
// Default values
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(),
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity()));
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
@ -3003,10 +3016,11 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL);
// Default values
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(),
GetDefaultCacheShardBits(
new_opt.block_cache_compressed->GetCapacity()));
ASSERT_EQ(
std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity()));
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),
@ -3020,15 +3034,18 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
"high_pri_pool_ratio=0.0;}",
&new_opt));
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 5);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
5);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 5);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
5);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),
@ -3043,16 +3060,19 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
&new_opt));
ASSERT_TRUE(new_opt.block_cache != nullptr);
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
->GetHighPriPoolRatio(),
0.5);
ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCache>(
new_opt.block_cache_compressed)->GetNumShardBits(), 4);
ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
new_opt.block_cache_compressed)
->GetNumShardBits(),
4);
ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
->GetHighPriPoolRatio(),


@ -246,13 +246,8 @@ inline void cacheline_aligned_free(void *memblock) {
extern const size_t kPageSize;
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52991 for MINGW32
// could not be worked around with by -mno-ms-bitfields
#ifndef __MINGW32__
#define ALIGN_AS(n) __declspec(align(n))
#else
#define ALIGN_AS(n)
#endif
// Part of C++11
#define ALIGN_AS(n) alignas(n)
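With ALIGN_AS now mapped to standard alignas, structs such as the shards can be padded to cache-line boundaries portably. A minimal sketch; the 64-byte line size is an assumption for illustration, not taken from the port layer:

```
#include <cstddef>

#define ALIGN_AS(n) alignas(n)

// Hypothetical cache-line-sized struct to avoid false sharing.
struct ALIGN_AS(64) PaddedShardStub {
  long value = 0;
};
static_assert(alignof(PaddedShardStub) == 64, "expected cache-line alignment");

int main() { PaddedShardStub s; return static_cast<int>(s.value); }
```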
static inline void AsmVolatilePause() {
#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM)


@ -524,32 +524,32 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
// More complex test of shared key space, in case the instances are wrappers
// for some shared underlying cache.
std::string sentinel_key(size_t{1}, '\0');
CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime();
static char kRegularBlockCacheMarker = 'b';
static char kCompressedBlockCacheMarker = 'c';
static char kPersistentCacheMarker = 'p';
if (bbto.block_cache) {
bbto.block_cache
->Insert(Slice(sentinel_key), &kRegularBlockCacheMarker, 1,
->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1,
GetNoopDeleterForRole<CacheEntryRole::kMisc>())
.PermitUncheckedError();
}
if (bbto.block_cache_compressed) {
bbto.block_cache_compressed
->Insert(Slice(sentinel_key), &kCompressedBlockCacheMarker, 1,
->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1,
GetNoopDeleterForRole<CacheEntryRole::kMisc>())
.PermitUncheckedError();
}
if (bbto.persistent_cache) {
// Note: persistent cache copies the data, not keeping the pointer
bbto.persistent_cache
->Insert(Slice(sentinel_key), &kPersistentCacheMarker, 1)
->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1)
.PermitUncheckedError();
}
// If we get something different from what we inserted, that indicates
// dangerously overlapping key spaces.
if (bbto.block_cache) {
auto handle = bbto.block_cache->Lookup(Slice(sentinel_key));
auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice());
if (handle) {
auto v = static_cast<char*>(bbto.block_cache->Value(handle));
char c = *v;
@ -568,7 +568,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
}
}
if (bbto.block_cache_compressed) {
auto handle = bbto.block_cache_compressed->Lookup(Slice(sentinel_key));
auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice());
if (handle) {
auto v = static_cast<char*>(bbto.block_cache_compressed->Value(handle));
char c = *v;
@ -591,7 +591,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
if (bbto.persistent_cache) {
std::unique_ptr<char[]> data;
size_t size = 0;
bbto.persistent_cache->Lookup(Slice(sentinel_key), &data, &size)
bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size)
.PermitUncheckedError();
if (data && size > 0) {
if (data[0] == kRegularBlockCacheMarker) {
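The sentinel-key probe above inserts a distinct marker under one process-unique key into each cache instance and reads it back; getting a different marker than what was inserted reveals that two option slots share one underlying key space. A hedged illustration of the idea with std::map standing in for the caches:

```
#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<std::string, char> block_cache, compressed_cache;
  const std::string sentinel = "process-unique-sentinel";  // made-up key
  block_cache[sentinel] = 'b';       // kRegularBlockCacheMarker
  compressed_cache[sentinel] = 'c';  // kCompressedBlockCacheMarker
  // Distinct key spaces: each cache returns the marker it was given.
  assert(block_cache[sentinel] == 'b');
  assert(compressed_cache[sentinel] == 'c');
  return 0;
}
```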


@ -21,7 +21,6 @@
#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/sharded_cache.h"
#include "db/compaction/compaction_picker.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"