From 7555243bcfb7086e8bad38d43a518ff4c53dc17a Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Tue, 18 Oct 2022 22:06:57 -0700
Subject: [PATCH] Refactor ShardedCache for more sharing, static polymorphism
 (#10801)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The motivations for this change include

* Free up space in ClockHandle so that we can add data for secondary cache
  handling while still keeping within a single cache line (64 byte) size.
  * This change frees up space by eliminating the need for the `hash` field:
    the fixed-size key is itself stored as a 128-bit bijective (lossless)
    hash.
* Generally more customizability of ShardedCache (such as hashing) without
  worrying about virtual call overheads.
  * ShardedCache now uses static polymorphism (template) instead of dynamic
    polymorphism (virtual overrides) for the CacheShard. No obvious
    performance benefit is seen from the change (as mostly expected; most
    calls to virtual functions in CacheShard could already be optimized to
    static calls), but it offers more flexibility without incurring the
    runtime cost of adhering to a common interface (without type parameters
    or static callbacks).
  * You'll also notice less `reinterpret_cast`ing and other boilerplate in
    the Cache implementations, as this can go in ShardedCache.

More detail:
* Don't have LRUCacheShard maintain `std::shared_ptr` copies (extra
  refcount) when LRUCache can be in charge of keeping a `shared_ptr`.
* Renamed `capacity_mutex_` to `config_mutex_` to better represent the scope
  of what it guards.
* Some preparation for 64-bit hash and indexing in LRUCache, but didn't
  include the full change because of a slight performance regression.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10801

Test Plan:
Unit test updates were non-trivial because of major changes to the
ClockCacheShard interface in handling of key vs. hash.

Performance:

Create with
`TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=30000000 -disable_wal=1 -bloom_bits=16`

Test with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=readrandom[-X1000] -readonly -num=30000000 -bloom_bits=16 -cache_index_and_filter_blocks=1 -cache_size=610000000 -duration 20 -threads=16
```

Before: `readrandom [AVG 150 runs] : 321147 (± 253) ops/sec`
After: `readrandom [AVG 150 runs] : 321530 (± 326) ops/sec`

So possibly ~0.1% improvement.

And with `-cache_type=hyper_clock_cache`:

Before: `readrandom [AVG 30 runs] : 614126 (± 7978) ops/sec`
After: `readrandom [AVG 30 runs] : 645349 (± 8087) ops/sec`

So roughly 5% improvement!
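To make the static-polymorphism point above concrete, here is a minimal,
self-contained C++ sketch of the pattern (not the actual RocksDB classes;
`ShardedCacheSketch` and `ToyShard` are hypothetical stand-ins for the
`ShardedCache<ClockCacheShard>` / `ShardedCache<LRUCacheShard>` types in the
diff below). The sharded layer is a class template over its shard type, and
the shard supplies its own `HashVal`, `ComputeHash`, and
`HashPieceForSharding`, so shard calls resolve at compile time instead of
through virtual dispatch:

```cpp
// Minimal sketch of static polymorphism for a sharded cache.
// Hypothetical names; not RocksDB code.
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

template <class ShardT>
class ShardedCacheSketch {
 public:
  explicit ShardedCacheSketch(int num_shards) : shards_(num_shards) {}

  void Insert(const std::string& key, int value) {
    // Hash type and hash function come from the shard type itself.
    typename ShardT::HashVal h = ShardT::ComputeHash(key);
    GetShard(h).Insert(key, h, value);  // non-virtual, inlinable call
  }

  int* Lookup(const std::string& key) {
    typename ShardT::HashVal h = ShardT::ComputeHash(key);
    return GetShard(h).Lookup(key, h);
  }

 private:
  ShardT& GetShard(typename ShardT::HashVal h) {
    // A dedicated piece of the hash selects the shard.
    return shards_[ShardT::HashPieceForSharding(h) % shards_.size()];
  }
  std::vector<ShardT> shards_;
};

// A toy shard satisfying the implicit "CacheShard concept": it supplies
// HashVal, ComputeHash, HashPieceForSharding, Insert, and Lookup.
struct ToyShard {
  using HashVal = uint64_t;
  static HashVal ComputeHash(const std::string& key) {
    return std::hash<std::string>{}(key);
  }
  static uint32_t HashPieceForSharding(HashVal h) {
    return static_cast<uint32_t>(h >> 32);
  }
  void Insert(const std::string& key, HashVal, int value) { map_[key] = value; }
  int* Lookup(const std::string& key, HashVal) {
    auto it = map_.find(key);
    return it == map_.end() ? nullptr : &it->second;
  }
  std::unordered_map<std::string, int> map_;
};

int main() {
  ShardedCacheSketch<ToyShard> cache(/*num_shards=*/4);
  cache.Insert("k1", 42);
  return cache.Lookup("k1") ? 0 : 1;
}
```

In the actual change below, the hyper clock shard's `HashVal` is the 128-bit
`UniqueId64x2` produced by `BijectiveHash2x64`, so the original 16-byte cache
key can be recovered via `ReverseHash` and no separate `hash` field is needed
in the handle.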
Reviewed By: anand1976 Differential Revision: D40252236 Pulled By: pdillinger fbshipit-source-id: ff8fc70ef569585edc95bcbaaa0386f61355ae5b --- cache/cache_test.cc | 8 +- cache/clock_cache.cc | 233 +++++------ cache/clock_cache.h | 174 +++++---- cache/fast_lru_cache.cc | 92 ++--- cache/fast_lru_cache.h | 81 ++-- cache/lru_cache.cc | 146 +++---- cache/lru_cache.h | 122 +++--- cache/lru_cache_test.cc | 156 ++++---- cache/sharded_cache.cc | 194 ++-------- cache/sharded_cache.h | 361 +++++++++++++----- options/options_test.cc | 100 +++-- port/win/port_win.h | 9 +- .../block_based/block_based_table_factory.cc | 14 +- table/block_based/block_based_table_reader.cc | 1 - 14 files changed, 809 insertions(+), 882 deletions(-) diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 1a8bae4df5..75c28c2b8b 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -1023,21 +1023,21 @@ TEST_P(CacheTest, DefaultShardBits) { (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U; std::shared_ptr cache = NewCache(32U * min_shard_size); - ShardedCache* sc = dynamic_cast(cache.get()); + ShardedCacheBase* sc = dynamic_cast(cache.get()); ASSERT_EQ(5, sc->GetNumShardBits()); cache = NewCache(min_shard_size / 1000U * 999U); - sc = dynamic_cast(cache.get()); + sc = dynamic_cast(cache.get()); ASSERT_EQ(0, sc->GetNumShardBits()); cache = NewCache(3U * 1024U * 1024U * 1024U); - sc = dynamic_cast(cache.get()); + sc = dynamic_cast(cache.get()); // current maximum of 6 ASSERT_EQ(6, sc->GetNumShardBits()); if constexpr (sizeof(size_t) > 4) { cache = NewCache(128U * min_shard_size); - sc = dynamic_cast(cache.get()); + sc = dynamic_cast(cache.get()); // current maximum of 6 ASSERT_EQ(6, sc->GetNumShardBits()); } diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 58a7f94bb3..d353c9966c 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -12,6 +12,7 @@ #include #include +#include "cache/cache_key.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/lang.h" @@ -29,16 +30,22 @@ inline uint64_t GetRefcount(uint64_t meta) { ClockHandle::kCounterMask; } +void ClockHandleBasicData::FreeData() const { + if (deleter) { + UniqueId64x2 unhashed; + (*deleter)(ClockCacheShard::ReverseHash(hashed_key, &unhashed), value); + } +} + static_assert(sizeof(ClockHandle) == 64U, "Expecting size / alignment with common cache line size"); ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata) : length_bits_(hash_bits), - length_bits_mask_(Lower32of64((uint64_t{1} << length_bits_) - 1)), - occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * - kStrictLoadFactor)), + length_bits_mask_((size_t{1} << length_bits_) - 1), + occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * + kStrictLoadFactor)), array_(new ClockHandle[size_t{1} << length_bits_]) { - assert(hash_bits <= 32); // FIXME: ensure no overlap with sharding bits if (initial_charge_metadata) { usage_ += size_t{GetTableSize()} * sizeof(ClockHandle); } @@ -47,7 +54,7 @@ ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata) ClockHandleTable::~ClockHandleTable() { // Assumes there are no references or active operations on any slot/element // in the table. 
- for (uint32_t i = 0; i < GetTableSize(); i++) { + for (size_t i = 0; i < GetTableSize(); i++) { ClockHandle& h = array_[i]; switch (h.meta >> ClockHandle::kStateShift) { case ClockHandle::kStateEmpty: @@ -58,7 +65,7 @@ ClockHandleTable::~ClockHandleTable() { assert(GetRefcount(h.meta) == 0); h.FreeData(); #ifndef NDEBUG - Rollback(h.hash, &h); + Rollback(h.hashed_key, &h); usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); occupancy_.fetch_sub(1U, std::memory_order_relaxed); #endif @@ -71,7 +78,7 @@ ClockHandleTable::~ClockHandleTable() { } #ifndef NDEBUG - for (uint32_t i = 0; i < GetTableSize(); i++) { + for (size_t i = 0; i < GetTableSize(); i++) { assert(array_[i].displacements.load() == 0); } #endif @@ -154,12 +161,12 @@ inline void CorrectNearOverflow(uint64_t old_meta, } } -Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, +Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, ClockHandle** handle, Cache::Priority priority, size_t capacity, bool strict_capacity_limit) { // Do we have the available occupancy? Optimistically assume we do // and deal with it if we don't. - uint32_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); + size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); auto revert_occupancy_fn = [&]() { occupancy_.fetch_sub(1, std::memory_order_relaxed); }; @@ -198,7 +205,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, } if (request_evict_charge > 0) { size_t evicted_charge = 0; - uint32_t evicted_count = 0; + size_t evicted_count = 0; Evict(request_evict_charge, &evicted_charge, &evicted_count); occupancy_.fetch_sub(evicted_count, std::memory_order_release); if (LIKELY(evicted_charge > need_evict_charge)) { @@ -263,7 +270,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, need_evict_charge = 1; } size_t evicted_charge = 0; - uint32_t evicted_count = 0; + size_t evicted_count = 0; if (need_evict_charge > 0) { Evict(need_evict_charge, &evicted_charge, &evicted_count); // Deal with potential occupancy deficit @@ -323,9 +330,9 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, } assert(initial_countdown > 0); - uint32_t probe = 0; + size_t probe = 0; ClockHandle* e = FindSlot( - proto.hash, + proto.hashed_key, [&](ClockHandle* h) { // Optimistically transition the slot from "empty" to // "under construction" (no effect on other states) @@ -338,7 +345,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, if (old_state == ClockHandle::kStateEmpty) { // We've started inserting into an available slot, and taken // ownership Save data fields - ClockHandleMoreData* h_alias = h; + ClockHandleBasicData* h_alias = h; *h_alias = proto; // Transition from "under construction" state to "visible" state @@ -375,7 +382,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference - if (h->key == proto.key) { + if (h->hashed_key == proto.hashed_key) { // Match. 
Release in a way that boosts the clock state old_meta = h->meta.fetch_add( ClockHandle::kReleaseIncrement * initial_countdown, @@ -431,7 +438,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, return Status::OK(); } // Roll back table insertion - Rollback(proto.hash, e); + Rollback(proto.hashed_key, e); revert_occupancy_fn(); // Maybe fall back on detached insert if (handle == nullptr) { @@ -446,7 +453,7 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, assert(use_detached_insert); ClockHandle* h = new ClockHandle(); - ClockHandleMoreData* h_alias = h; + ClockHandleBasicData* h_alias = h; *h_alias = proto; h->detached = true; // Single reference (detached entries only created if returning a refed @@ -467,10 +474,10 @@ Status ClockHandleTable::Insert(const ClockHandleMoreData& proto, return Status::OkOverwritten(); } -ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) { - uint32_t probe = 0; +ClockHandle* ClockHandleTable::Lookup(const UniqueId64x2& hashed_key) { + size_t probe = 0; ClockHandle* e = FindSlot( - hash, + hashed_key, [&](ClockHandle* h) { // Mostly branch-free version (similar performance) /* @@ -501,7 +508,7 @@ ClockHandle* ClockHandleTable::Lookup(const CacheKeyBytes& key, uint32_t hash) { if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference - if (h->key == key) { + if (h->hashed_key == hashed_key) { // Match return true; } else { @@ -596,7 +603,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful, delete h; detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); } else { - uint32_t hash = h->hash; + UniqueId64x2 hashed_key = h->hashed_key; #ifndef NDEBUG // Mark slot as empty, with assertion old_meta = h->meta.exchange(0, std::memory_order_release); @@ -607,7 +614,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful, h->meta.store(0, std::memory_order_release); #endif occupancy_.fetch_sub(1U, std::memory_order_release); - Rollback(hash, h); + Rollback(hashed_key, h); } usage_.fetch_sub(total_charge, std::memory_order_relaxed); assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); @@ -654,10 +661,10 @@ void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) { } } -void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { - uint32_t probe = 0; +void ClockHandleTable::Erase(const UniqueId64x2& hashed_key) { + size_t probe = 0; (void)FindSlot( - hash, + hashed_key, [&](ClockHandle* h) { // Could be multiple entries in rare cases. Erase them all. // Optimistically increment acquire counter @@ -667,7 +674,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { // Acquired a read reference - if (h->key == key) { + if (h->hashed_key == hashed_key) { // Match. Set invisible. old_meta = h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} @@ -691,7 +698,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { << ClockHandle::kStateShift, std::memory_order_acq_rel)) { // Took ownership - assert(hash == h->hash); + assert(hashed_key == h->hashed_key); // TODO? Delay freeing? 
h->FreeData(); usage_.fetch_sub(h->total_charge, std::memory_order_relaxed); @@ -706,7 +713,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { h->meta.store(0, std::memory_order_release); #endif occupancy_.fetch_sub(1U, std::memory_order_release); - Rollback(hash, h); + Rollback(hashed_key, h); break; } } @@ -735,14 +742,14 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) { } void ClockHandleTable::ConstApplyToEntriesRange( - std::function func, uint32_t index_begin, - uint32_t index_end, bool apply_if_will_be_deleted) const { + std::function func, size_t index_begin, + size_t index_end, bool apply_if_will_be_deleted) const { uint64_t check_state_mask = ClockHandle::kStateShareableBit; if (!apply_if_will_be_deleted) { check_state_mask |= ClockHandle::kStateVisibleBit; } - for (uint32_t i = index_begin; i < index_end; i++) { + for (size_t i = index_begin; i < index_end; i++) { ClockHandle& h = array_[i]; // Note: to avoid using compare_exchange, we have to be extra careful. @@ -776,7 +783,7 @@ void ClockHandleTable::ConstApplyToEntriesRange( } void ClockHandleTable::EraseUnRefEntries() { - for (uint32_t i = 0; i <= this->length_bits_mask_; i++) { + for (size_t i = 0; i <= this->length_bits_mask_; i++) { ClockHandle& h = array_[i]; uint64_t old_meta = h.meta.load(std::memory_order_relaxed); @@ -788,7 +795,7 @@ void ClockHandleTable::EraseUnRefEntries() { << ClockHandle::kStateShift, std::memory_order_acquire)) { // Took ownership - uint32_t hash = h.hash; + UniqueId64x2 hashed_key = h.hashed_key; h.FreeData(); usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); #ifndef NDEBUG @@ -801,37 +808,29 @@ void ClockHandleTable::EraseUnRefEntries() { h.meta.store(0, std::memory_order_release); #endif occupancy_.fetch_sub(1U, std::memory_order_release); - Rollback(hash, &h); + Rollback(hashed_key, &h); } } } -namespace { -inline uint32_t Remix1(uint32_t hash) { - return Lower32of64((uint64_t{hash} * 0xbc9f1d35) >> 29); -} - -inline uint32_t Remix2(uint32_t hash) { - return Lower32of64((uint64_t{hash} * 0x7a2bb9d5) >> 29); -} -} // namespace - ClockHandle* ClockHandleTable::FindSlot( - uint32_t hash, std::function match_fn, + const UniqueId64x2& hashed_key, std::function match_fn, std::function abort_fn, - std::function update_fn, uint32_t& probe) { + std::function update_fn, size_t& probe) { + // NOTE: upper 32 bits of hashed_key[0] is used for sharding + // // We use double-hashing probing. Every probe in the sequence is a // pseudorandom integer, computed as a linear function of two random hashes, // which we call base and increment. Specifically, the i-th probe is base + i // * increment modulo the table size. - uint32_t base = ModTableSize(Remix1(hash)); + size_t base = static_cast(hashed_key[1]); // We use an odd increment, which is relatively prime with the power-of-two // table size. This implies that we cycle back to the first probe only // after probing every slot exactly once. 
// TODO: we could also reconsider linear probing, though locality benefits // are limited because each slot is a full cache line - uint32_t increment = Remix2(hash) | 1U; - uint32_t current = ModTableSize(base + probe * increment); + size_t increment = static_cast(hashed_key[0]) | 1U; + size_t current = ModTableSize(base + probe * increment); while (probe <= length_bits_mask_) { ClockHandle* h = &array_[current]; if (match_fn(h)) { @@ -849,22 +848,23 @@ ClockHandle* ClockHandleTable::FindSlot( return nullptr; } -void ClockHandleTable::Rollback(uint32_t hash, const ClockHandle* h) { - uint32_t current = ModTableSize(Remix1(hash)); - uint32_t increment = Remix2(hash) | 1U; - for (uint32_t i = 0; &array_[current] != h; i++) { +void ClockHandleTable::Rollback(const UniqueId64x2& hashed_key, + const ClockHandle* h) { + size_t current = ModTableSize(hashed_key[1]); + size_t increment = static_cast(hashed_key[0]) | 1U; + for (size_t i = 0; &array_[current] != h; i++) { array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); current = ModTableSize(current + increment); } } void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, - uint32_t* freed_count) { + size_t* freed_count) { // precondition assert(requested_charge > 0); // TODO: make a tuning parameter? - constexpr uint32_t step_size = 4; + constexpr size_t step_size = 4; // First (concurrent) increment clock pointer uint64_t old_clock_pointer = @@ -879,7 +879,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_); for (;;) { - for (uint32_t i = 0; i < step_size; i++) { + for (size_t i = 0; i < step_size; i++) { ClockHandle& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; uint64_t meta = h.meta.load(std::memory_order_relaxed); @@ -920,7 +920,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, << ClockHandle::kStateShift, std::memory_order_acquire)) { // Took ownership - uint32_t hash = h.hash; + const UniqueId64x2& hashed_key = h.hashed_key; // TODO? Delay freeing? h.FreeData(); *freed_charge += h.total_charge; @@ -934,7 +934,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, h.meta.store(0, std::memory_order_release); #endif *freed_count += 1; - Rollback(hash, &h); + Rollback(hashed_key, &h); } } @@ -955,7 +955,7 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, ClockCacheShard::ClockCacheShard( size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : CacheShard(metadata_charge_policy), + : CacheShardBase(metadata_charge_policy), table_( CalcHashBits(capacity, estimated_value_size, metadata_charge_policy), /*initial_charge_metadata*/ metadata_charge_policy == @@ -971,31 +971,33 @@ void ClockCacheShard::EraseUnRefEntries() { table_.EraseUnRefEntries(); } void ClockCacheShard::ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) { + size_t average_entries_per_lock, size_t* state) { // The state is essentially going to be the starting hash, which works // nicely even if we resize between calls because we use upper-most // hash bits for table indexes. 
- uint32_t length_bits = table_.GetLengthBits(); - uint32_t length = table_.GetTableSize(); + size_t length_bits = table_.GetLengthBits(); + size_t length = table_.GetTableSize(); assert(average_entries_per_lock > 0); // Assuming we are called with same average_entries_per_lock repeatedly, // this simplifies some logic (index_end will not overflow). assert(average_entries_per_lock < length || *state == 0); - uint32_t index_begin = *state >> (32 - length_bits); - uint32_t index_end = index_begin + average_entries_per_lock; + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; if (index_end >= length) { // Going to end. index_end = length; - *state = UINT32_MAX; + *state = SIZE_MAX; } else { - *state = index_end << (32 - length_bits); + *state = index_end << (sizeof(size_t) * 8u - length_bits); } table_.ConstApplyToEntriesRange( [callback](const ClockHandle& h) { - callback(h.KeySlice(), h.value, h.total_charge, h.deleter); + UniqueId64x2 unhashed; + callback(ReverseHash(h.hashed_key, &unhashed), h.value, h.total_charge, + h.deleter); }, index_begin, index_end, false); } @@ -1011,7 +1013,7 @@ int ClockCacheShard::CalcHashBits( uint64_t num_slots = static_cast(capacity / average_slot_charge + 0.999999); - int hash_bits = std::min(FloorLog2((num_slots << 1) - 1), 32); + int hash_bits = FloorLog2((num_slots << 1) - 1); if (metadata_charge_policy == kFullChargeCacheMetadata) { // For very small estimated value sizes, it's possible to overshoot while (hash_bits > 0 && @@ -1033,17 +1035,16 @@ void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { // next Insert will take care of any necessary evictions } -Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, Cache::DeleterFn deleter, - Cache::Handle** handle, +Status ClockCacheShard::Insert(const Slice& key, const UniqueId64x2& hashed_key, + void* value, size_t charge, + Cache::DeleterFn deleter, ClockHandle** handle, Cache::Priority priority) { if (UNLIKELY(key.size() != kCacheKeySize)) { return Status::NotSupported("ClockCache only supports key size " + std::to_string(kCacheKeySize) + "B"); } - ClockHandleMoreData proto; - proto.key = *reinterpret_cast(key.data()); - proto.hash = hash; + ClockHandleBasicData proto; + proto.hashed_key = hashed_key; proto.value = value; proto.deleter = deleter; proto.total_charge = charge; @@ -1054,49 +1055,47 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, return s; } -Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { +ClockHandle* ClockCacheShard::Lookup(const Slice& key, + const UniqueId64x2& hashed_key) { if (UNLIKELY(key.size() != kCacheKeySize)) { return nullptr; } - auto key_bytes = reinterpret_cast(key.data()); - return reinterpret_cast(table_.Lookup(*key_bytes, hash)); + return table_.Lookup(hashed_key); } -bool ClockCacheShard::Ref(Cache::Handle* h) { +bool ClockCacheShard::Ref(ClockHandle* h) { if (h == nullptr) { return false; } - table_.Ref(*reinterpret_cast(h)); + table_.Ref(*h); return true; } -bool ClockCacheShard::Release(Cache::Handle* handle, bool useful, +bool ClockCacheShard::Release(ClockHandle* handle, bool useful, bool erase_if_last_ref) { if (handle == nullptr) { return false; } - return table_.Release(reinterpret_cast(handle), useful, - erase_if_last_ref); + return table_.Release(handle, useful, erase_if_last_ref); } -void ClockCacheShard::TEST_RefN(Cache::Handle* h, size_t n) { - 
table_.TEST_RefN(*reinterpret_cast(h), n); +void ClockCacheShard::TEST_RefN(ClockHandle* h, size_t n) { + table_.TEST_RefN(*h, n); } -void ClockCacheShard::TEST_ReleaseN(Cache::Handle* h, size_t n) { - table_.TEST_ReleaseN(reinterpret_cast(h), n); +void ClockCacheShard::TEST_ReleaseN(ClockHandle* h, size_t n) { + table_.TEST_ReleaseN(h, n); } -bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { +bool ClockCacheShard::Release(ClockHandle* handle, bool erase_if_last_ref) { return Release(handle, /*useful=*/true, erase_if_last_ref); } -void ClockCacheShard::Erase(const Slice& key, uint32_t hash) { +void ClockCacheShard::Erase(const Slice& key, const UniqueId64x2& hashed_key) { if (UNLIKELY(key.size() != kCacheKeySize)) { return; } - auto key_bytes = reinterpret_cast(key.data()); - table_.Erase(*key_bytes, hash); + table_.Erase(hashed_key); } size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); } @@ -1140,39 +1139,19 @@ size_t ClockCacheShard::GetTableAddressCount() const { HyperClockCache::HyperClockCache( size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), - num_shards_(1 << num_shard_bits) { + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr memory_allocator) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + std::move(memory_allocator)) { assert(estimated_value_size > 0 || metadata_charge_policy != kDontChargeCacheMetadata); // TODO: should not need to go through two levels of pointer indirection to // get to table entries - shards_ = reinterpret_cast( - port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_)); - size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; - for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - ClockCacheShard(per_shard, estimated_value_size, strict_capacity_limit, - metadata_charge_policy); - } -} - -HyperClockCache::~HyperClockCache() { - if (shards_ != nullptr) { - assert(num_shards_ > 0); - for (int i = 0; i < num_shards_; i++) { - shards_[i].~ClockCacheShard(); - } - port::cacheline_aligned_free(shards_); - } -} - -CacheShard* HyperClockCache::GetShard(uint32_t shard) { - return reinterpret_cast(&shards_[shard]); -} - -const CacheShard* HyperClockCache::GetShard(uint32_t shard) const { - return reinterpret_cast(&shards_[shard]); + size_t per_shard = GetPerShardCapacity(); + InitShards([=](ClockCacheShard* cs) { + new (cs) ClockCacheShard(per_shard, estimated_value_size, + strict_capacity_limit, metadata_charge_policy); + }); } void* HyperClockCache::Value(Handle* handle) { @@ -1188,18 +1167,6 @@ Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const { return h->deleter; } -uint32_t HyperClockCache::GetHash(Handle* handle) const { - return reinterpret_cast(handle)->hash; -} - -void HyperClockCache::DisownData() { - // Leak data only if that won't generate an ASAN/valgrind warning. 
- if (!kMustFreeHeapAllocations) { - shards_ = nullptr; - num_shards_ = 0; - } -} - } // namespace hyper_clock_cache // DEPRECATED (see public API) @@ -1225,7 +1192,7 @@ std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { } return std::make_shared( capacity, estimated_entry_charge, my_num_shard_bits, - strict_capacity_limit, metadata_charge_policy); + strict_capacity_limit, metadata_charge_policy, memory_allocator); } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/clock_cache.h b/cache/clock_cache.h index a68514e36f..53a9de5f0a 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -303,30 +303,24 @@ constexpr double kLoadFactor = 0.7; // strict upper bound on the load factor. constexpr double kStrictLoadFactor = 0.84; -using CacheKeyBytes = std::array; - struct ClockHandleBasicData { void* value = nullptr; Cache::DeleterFn deleter = nullptr; - CacheKeyBytes key = {}; + // A lossless, reversible hash of the fixed-size (16 byte) cache key. This + // eliminates the need to store a hash separately. + UniqueId64x2 hashed_key = kNullUniqueId64x2; size_t total_charge = 0; - Slice KeySlice() const { return Slice(key.data(), kCacheKeySize); } + // Calls deleter (if non-null) on cache key and value + void FreeData() const; - void FreeData() const { - if (deleter) { - (*deleter)(KeySlice(), value); - } - } -}; - -struct ClockHandleMoreData : public ClockHandleBasicData { - uint32_t hash = 0; + // Required by concept HandleImpl + const UniqueId64x2& GetHash() const { return hashed_key; } }; // Target size to be exactly a common cache line size (see static_assert in // clock_cache.cc) -struct ALIGN_AS(64U) ClockHandle : public ClockHandleMoreData { +struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData { // Constants for handling the atomic `meta` word, which tracks most of the // state of the handle. The meta word looks like this: // low bits high bits @@ -391,31 +385,31 @@ class ClockHandleTable { explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata); ~ClockHandleTable(); - Status Insert(const ClockHandleMoreData& proto, ClockHandle** handle, + Status Insert(const ClockHandleBasicData& proto, ClockHandle** handle, Cache::Priority priority, size_t capacity, bool strict_capacity_limit); - ClockHandle* Lookup(const CacheKeyBytes& key, uint32_t hash); + ClockHandle* Lookup(const UniqueId64x2& hashed_key); bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref); void Ref(ClockHandle& handle); - void Erase(const CacheKeyBytes& key, uint32_t hash); + void Erase(const UniqueId64x2& hashed_key); void ConstApplyToEntriesRange(std::function func, - uint32_t index_begin, uint32_t index_end, + size_t index_begin, size_t index_end, bool apply_if_will_be_deleted) const; void EraseUnRefEntries(); - uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; } + size_t GetTableSize() const { return size_t{1} << length_bits_; } int GetLengthBits() const { return length_bits_; } - uint32_t GetOccupancyLimit() const { return occupancy_limit_; } + size_t GetOccupancyLimit() const { return occupancy_limit_; } - uint32_t GetOccupancy() const { + size_t GetOccupancy() const { return occupancy_.load(std::memory_order_relaxed); } @@ -431,13 +425,15 @@ class ClockHandleTable { private: // functions // Returns x mod 2^{length_bits_}. 
- uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; } + inline size_t ModTableSize(uint64_t x) { + return static_cast(x) & length_bits_mask_; + } // Runs the clock eviction algorithm trying to reclaim at least // requested_charge. Returns how much is evicted, which could be less // if it appears impossible to evict the requested amount without blocking. void Evict(size_t requested_charge, size_t* freed_charge, - uint32_t* freed_count); + size_t* freed_count); // Returns the first slot in the probe sequence, starting from the given // probe number, with a handle e such that match(e) is true. At every @@ -450,15 +446,15 @@ class ClockHandleTable { // value of probe is one more than the last non-aborting probe during the // call. This is so that that the variable can be used to keep track of // progress across consecutive calls to FindSlot. - inline ClockHandle* FindSlot(uint32_t hash, + inline ClockHandle* FindSlot(const UniqueId64x2& hashed_key, std::function match, std::function stop, std::function update, - uint32_t& probe); + size_t& probe); // Re-decrement all displacements in probe path starting from beginning // until (not including) the given handle - void Rollback(uint32_t hash, const ClockHandle* h); + void Rollback(const UniqueId64x2& hashed_key, const ClockHandle* h); private: // data // Number of hash bits used for table index. @@ -466,10 +462,10 @@ class ClockHandleTable { const int length_bits_; // For faster computation of ModTableSize. - const uint32_t length_bits_mask_; + const size_t length_bits_mask_; // Maximum number of elements the user can store in the table. - const uint32_t occupancy_limit_; + const size_t occupancy_limit_; // Array of slots comprising the hash table. const std::unique_ptr array_; @@ -484,7 +480,7 @@ class ClockHandleTable { ALIGN_AS(CACHE_LINE_SIZE) // Number of elements in the table. - std::atomic occupancy_{}; + std::atomic occupancy_{}; // Memory usage by entries tracked by the cache (including detached) std::atomic usage_{}; @@ -494,78 +490,107 @@ class ClockHandleTable { }; // class ClockHandleTable // A single shard of sharded cache. -class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard { +class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { public: ClockCacheShard(size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy); - ~ClockCacheShard() override = default; - // TODO: document limitations - void SetCapacity(size_t capacity) override; + // For CacheShard concept + using HandleImpl = ClockHandle; + // Hash is lossless hash of 128-bit key + using HashVal = UniqueId64x2; + using HashCref = const HashVal&; + static inline uint32_t HashPieceForSharding(HashCref hash) { + return Upper32of64(hash[0]); + } + static inline HashVal ComputeHash(const Slice& key) { + assert(key.size() == kCacheKeySize); + HashVal in; + HashVal out; + // NOTE: endian dependence + // TODO: use GetUnaligned? + std::memcpy(&in, key.data(), kCacheKeySize); + BijectiveHash2x64(in[1], in[0], &out[1], &out[0]); + return out; + } - void SetStrictCapacityLimit(bool strict_capacity_limit) override; + // For reconstructing key from hashed_key. 
Requires the caller to provide + // backing storage for the Slice in `unhashed` + static inline Slice ReverseHash(const UniqueId64x2& hashed, + UniqueId64x2* unhashed) { + BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]); + // NOTE: endian dependence + return Slice(reinterpret_cast(unhashed), kCacheKeySize); + } - Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, - Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; + // Although capacity is dynamically changeable, the number of table slots is + // not, so growing capacity substantially could lead to hitting occupancy + // limit. + void SetCapacity(size_t capacity); - Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + void SetStrictCapacityLimit(bool strict_capacity_limit); - bool Release(Cache::Handle* handle, bool useful, - bool erase_if_last_ref) override; + Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value, + size_t charge, Cache::DeleterFn deleter, ClockHandle** handle, + Cache::Priority priority); - bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; + ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key); - bool Ref(Cache::Handle* handle) override; + bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref); - void Erase(const Slice& key, uint32_t hash) override; + bool Release(ClockHandle* handle, bool erase_if_last_ref = false); - size_t GetUsage() const override; + bool Ref(ClockHandle* handle); - size_t GetPinnedUsage() const override; + void Erase(const Slice& key, const UniqueId64x2& hashed_key); - size_t GetOccupancyCount() const override; + size_t GetUsage() const; - size_t GetTableAddressCount() const override; + size_t GetPinnedUsage() const; + + size_t GetOccupancyCount() const; + + size_t GetTableAddressCount() const; void ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) override; + size_t average_entries_per_lock, size_t* state); - void EraseUnRefEntries() override; + void EraseUnRefEntries(); - std::string GetPrintableOptions() const override { return std::string{}; } + std::string GetPrintableOptions() const { return std::string{}; } // SecondaryCache not yet supported - Status Insert(const Slice& key, uint32_t hash, void* value, + Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value, const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) override { - return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + ClockHandle** handle, Cache::Priority priority) { + return Insert(key, hashed_key, value, charge, helper->del_cb, handle, + priority); } - Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* /*helper*/, - const Cache::CreateCallback& /*create_cb*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) override { - return Lookup(key, hash); + ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) { + return Lookup(key, hashed_key); } - bool IsReady(Cache::Handle* /*handle*/) override { return true; } + bool IsReady(ClockHandle* /*handle*/) { return true; } - void Wait(Cache::Handle* /*handle*/) override {} + void Wait(ClockHandle* /*handle*/) {} // 
Acquire/release N references - void TEST_RefN(Cache::Handle* handle, size_t n); - void TEST_ReleaseN(Cache::Handle* handle, size_t n); + void TEST_RefN(ClockHandle* handle, size_t n); + void TEST_ReleaseN(ClockHandle* handle, size_t n); private: // functions friend class ClockCache; friend class ClockCacheTest; - ClockHandle* DetachedInsert(const ClockHandleMoreData& h); + ClockHandle* DetachedInsert(const ClockHandleBasicData& h); // Returns the number of bits used to hash an element in the hash // table. @@ -586,35 +611,20 @@ class HyperClockCache #ifdef NDEBUG final #endif - : public ShardedCache { + : public ShardedCache { public: HyperClockCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); - - ~HyperClockCache() override; + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr memory_allocator); const char* Name() const override { return "HyperClockCache"; } - CacheShard* GetShard(uint32_t shard) override; - - const CacheShard* GetShard(uint32_t shard) const override; - void* Value(Handle* handle) override; size_t GetCharge(Handle* handle) const override; - uint32_t GetHash(Handle* handle) const override; - DeleterFn GetDeleter(Handle* handle) const override; - - void DisownData() override; - - private: - ClockCacheShard* shards_ = nullptr; - - int num_shards_; }; // class HyperClockCache } // namespace hyper_clock_cache diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc index f5f93800d5..3a540f139b 100644 --- a/cache/fast_lru_cache.cc +++ b/cache/fast_lru_cache.cc @@ -173,7 +173,7 @@ inline int LRUHandleTable::FindSlot(const Slice& key, LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : CacheShard(metadata_charge_policy), + : CacheShardBase(metadata_charge_policy), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), table_( @@ -211,27 +211,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) { + size_t average_entries_per_lock, size_t* state) { // The state is essentially going to be the starting hash, which works // nicely even if we resize between calls because we use upper-most // hash bits for table indexes. DMutexLock l(mutex_); - uint32_t length_bits = table_.GetLengthBits(); - uint32_t length = table_.GetTableSize(); + size_t length_bits = table_.GetLengthBits(); + size_t length = table_.GetTableSize(); assert(average_entries_per_lock > 0); // Assuming we are called with same average_entries_per_lock repeatedly, // this simplifies some logic (index_end will not overflow). 
assert(average_entries_per_lock < length || *state == 0); - uint32_t index_begin = *state >> (32 - length_bits); - uint32_t index_end = index_begin + average_entries_per_lock; + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; if (index_end >= length) { // Going to end index_end = length; - *state = UINT32_MAX; + *state = SIZE_MAX; } else { - *state = index_end << (32 - length_bits); + *state = index_end << (sizeof(size_t) * 8u - length_bits); } table_.ApplyToEntriesRange( @@ -322,8 +322,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, Cache::DeleterFn deleter, - Cache::Handle** handle, - Cache::Priority /*priority*/) { + LRUHandle** handle, Cache::Priority /*priority*/) { if (key.size() != kCacheKeySize) { return Status::NotSupported("FastLRUCache only supports key size " + std::to_string(kCacheKeySize) + "B"); @@ -409,7 +408,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, if (!h->HasRefs()) { h->Ref(); } - *handle = reinterpret_cast(h); + *handle = h; } } } @@ -422,7 +421,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, return s; } -Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { +LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRUHandle* h = nullptr; { DMutexLock l(mutex_); @@ -437,23 +436,21 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { h->Ref(); } } - return reinterpret_cast(h); + return h; } -bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* e = reinterpret_cast(h); +bool LRUCacheShard::Ref(LRUHandle* h) { DMutexLock l(mutex_); // To create another reference - entry must be already externally referenced. 
- assert(e->HasRefs()); - e->Ref(); + assert(h->HasRefs()); + h->Ref(); return true; } -bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { - if (handle == nullptr) { +bool LRUCacheShard::Release(LRUHandle* h, bool erase_if_last_ref) { + if (h == nullptr) { return false; } - LRUHandle* h = reinterpret_cast(handle); LRUHandle copy; bool last_reference = false; { @@ -535,41 +532,18 @@ size_t LRUCacheShard::GetTableAddressCount() const { return table_.GetTableSize(); } -std::string LRUCacheShard::GetPrintableOptions() const { return std::string{}; } - LRUCache::LRUCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + nullptr /*allocator*/) { assert(estimated_value_size > 0 || metadata_charge_policy != kDontChargeCacheMetadata); - num_shards_ = 1 << num_shard_bits; - shards_ = reinterpret_cast( - port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); - size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; - for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - LRUCacheShard(per_shard, estimated_value_size, strict_capacity_limit, - metadata_charge_policy); - } -} - -LRUCache::~LRUCache() { - if (shards_ != nullptr) { - assert(num_shards_ > 0); - for (int i = 0; i < num_shards_; i++) { - shards_[i].~LRUCacheShard(); - } - port::cacheline_aligned_free(shards_); - } -} - -CacheShard* LRUCache::GetShard(uint32_t shard) { - return reinterpret_cast(&shards_[shard]); -} - -const CacheShard* LRUCache::GetShard(uint32_t shard) const { - return reinterpret_cast(&shards_[shard]); + size_t per_shard = GetPerShardCapacity(); + InitShards([=](LRUCacheShard* cs) { + new (cs) LRUCacheShard(per_shard, estimated_value_size, + strict_capacity_limit, metadata_charge_policy); + }); } void* LRUCache::Value(Handle* handle) { @@ -577,12 +551,8 @@ void* LRUCache::Value(Handle* handle) { } size_t LRUCache::GetCharge(Handle* handle) const { - CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata; - if (num_shards_ > 0) { - metadata_charge_policy = shards_[0].metadata_charge_policy_; - } return reinterpret_cast(handle)->GetCharge( - metadata_charge_policy); + GetShard(0).metadata_charge_policy_); } Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { @@ -590,18 +560,6 @@ Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { return h->deleter; } -uint32_t LRUCache::GetHash(Handle* handle) const { - return reinterpret_cast(handle)->hash; -} - -void LRUCache::DisownData() { - // Leak data only if that won't generate an ASAN/valgrind warning. - if (!kMustFreeHeapAllocations) { - shards_ = nullptr; - num_shards_ = 0; - } -} - } // namespace fast_lru_cache std::shared_ptr NewFastLRUCache( diff --git a/cache/fast_lru_cache.h b/cache/fast_lru_cache.h index 77aff8babc..3cd55ca869 100644 --- a/cache/fast_lru_cache.h +++ b/cache/fast_lru_cache.h @@ -141,6 +141,9 @@ struct LRUHandle { Slice key() const { return Slice(key_data.data(), kCacheKeySize); } + // For HandleImpl concept + uint32_t GetHash() const { return hash; } + // Increase the reference count by 1. 
void Ref() { refs++; } @@ -260,8 +263,8 @@ class LRUHandleTable { void Assign(int slot, LRUHandle* h); template - void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { - for (uint32_t i = index_begin; i < index_end; i++) { + void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) { + for (size_t i = index_begin; i < index_end; i++) { LRUHandle* h = &array_[i]; if (h->IsVisible()) { func(h); @@ -316,20 +319,30 @@ class LRUHandleTable { }; // A single shard of sharded cache. -class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { +class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { public: LRUCacheShard(size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy); - ~LRUCacheShard() override = default; + + // For CacheShard concept + using HandleImpl = LRUHandle; + + // Keep 32-bit hashing for now (FIXME: upgrade to 64-bit) + using HashVal = uint32_t; + using HashCref = uint32_t; + static inline HashVal ComputeHash(const Slice& key) { + return Lower32of64(GetSliceNPHash64(key)); + } + static inline uint32_t HashPieceForSharding(HashCref hash) { return hash; } // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to // free the needed space. - void SetCapacity(size_t capacity) override; + void SetCapacity(size_t capacity); // Set the flag to reject insertion if cache if full. - void SetStrictCapacityLimit(bool strict_capacity_limit) override; + void SetStrictCapacityLimit(bool strict_capacity_limit); // Like Cache methods, but with an extra "hash" parameter. // Insert an item into the hash table and, if handle is null, insert into @@ -337,48 +350,45 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // and free_handle_on_fail is true, the item is deleted and handle is set to // nullptr. 
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, - Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; + Cache::DeleterFn deleter, LRUHandle** handle, + Cache::Priority priority); Status Insert(const Slice& key, uint32_t hash, void* value, const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) override { + LRUHandle** handle, Cache::Priority priority) { return Insert(key, hash, value, charge, helper->del_cb, handle, priority); } - Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* /*helper*/, - const Cache::CreateCallback& /*create_cb*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) override { + LRUHandle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) { return Lookup(key, hash); } - Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + LRUHandle* Lookup(const Slice& key, uint32_t hash); - bool Release(Cache::Handle* handle, bool /*useful*/, - bool erase_if_last_ref) override { + bool Release(LRUHandle* handle, bool /*useful*/, bool erase_if_last_ref) { return Release(handle, erase_if_last_ref); } - bool IsReady(Cache::Handle* /*handle*/) override { return true; } - void Wait(Cache::Handle* /*handle*/) override {} + bool IsReady(LRUHandle* /*handle*/) { return true; } + void Wait(LRUHandle* /*handle*/) {} - bool Ref(Cache::Handle* handle) override; - bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; - void Erase(const Slice& key, uint32_t hash) override; + bool Ref(LRUHandle* handle); + bool Release(LRUHandle* handle, bool erase_if_last_ref = false); + void Erase(const Slice& key, uint32_t hash); - size_t GetUsage() const override; - size_t GetPinnedUsage() const override; - size_t GetOccupancyCount() const override; - size_t GetTableAddressCount() const override; + size_t GetUsage() const; + size_t GetPinnedUsage() const; + size_t GetOccupancyCount() const; + size_t GetTableAddressCount() const; void ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) override; + size_t average_entries_per_lock, size_t* state); - void EraseUnRefEntries() override; - - std::string GetPrintableOptions() const override; + void EraseUnRefEntries(); private: friend class LRUCache; @@ -446,25 +456,16 @@ class LRUCache #ifdef NDEBUG final #endif - : public ShardedCache { + : public ShardedCache { public: LRUCache(size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata); - ~LRUCache() override; const char* Name() const override { return "LRUCache"; } - CacheShard* GetShard(uint32_t shard) override; - const CacheShard* GetShard(uint32_t shard) const override; void* Value(Handle* handle) override; size_t GetCharge(Handle* handle) const override; - uint32_t GetHash(Handle* handle) const override; DeleterFn GetDeleter(Handle* handle) const override; - void DisownData() override; - - private: - LRUCacheShard* shards_ = nullptr; - int num_shards_ = 0; }; } // namespace fast_lru_cache diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index afecedde9b..06d223f3aa 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -38,7 +38,7 @@ LRUHandleTable::~LRUHandleTable() { h->Free(); } }, - 
0, uint32_t{1} << length_bits_); + 0, size_t{1} << length_bits_); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -113,12 +113,13 @@ void LRUHandleTable::Resize() { length_bits_ = new_length_bits; } -LRUCacheShard::LRUCacheShard( - size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, - double low_pri_pool_ratio, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, int max_upper_hash_bits, - const std::shared_ptr& secondary_cache) - : CacheShard(metadata_charge_policy), +LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio, + double low_pri_pool_ratio, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + SecondaryCache* secondary_cache) + : CacheShardBase(metadata_charge_policy), capacity_(0), high_pri_pool_usage_(0), low_pri_pool_usage_(0), @@ -165,27 +166,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) { + size_t average_entries_per_lock, size_t* state) { // The state is essentially going to be the starting hash, which works // nicely even if we resize between calls because we use upper-most // hash bits for table indexes. DMutexLock l(mutex_); - uint32_t length_bits = table_.GetLengthBits(); - uint32_t length = uint32_t{1} << length_bits; + int length_bits = table_.GetLengthBits(); + size_t length = size_t{1} << length_bits; assert(average_entries_per_lock > 0); // Assuming we are called with same average_entries_per_lock repeatedly, // this simplifies some logic (index_end will not overflow). assert(average_entries_per_lock < length || *state == 0); - uint32_t index_begin = *state >> (32 - length_bits); - uint32_t index_end = index_begin + average_entries_per_lock; + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; if (index_end >= length) { // Going to end index_end = length; - *state = UINT32_MAX; + *state = SIZE_MAX; } else { - *state = index_end << (32 - length_bits); + *state = index_end << (sizeof(size_t) * 8u - length_bits); } table_.ApplyToEntriesRange( @@ -364,7 +365,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { strict_capacity_limit_ = strict_capacity_limit; } -Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, +Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle, bool free_handle_on_fail) { Status s = Status::OK(); autovector last_reference_list; @@ -414,7 +415,7 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, if (!e->HasRefs()) { e->Ref(); } - *handle = reinterpret_cast(e); + *handle = e; } } } @@ -480,7 +481,7 @@ void LRUCacheShard::Promote(LRUHandle* e) { priority); } else { e->SetInCache(true); - Cache::Handle* handle = reinterpret_cast(e); + LRUHandle* handle = e; // This InsertItem() could fail if the cache is over capacity and // strict_capacity_limit_ is true. 
In such a case, we don't want // InsertItem() to free the handle, since the item is already in memory @@ -505,11 +506,11 @@ void LRUCacheShard::Promote(LRUHandle* e) { } } -Cache::Handle* LRUCacheShard::Lookup( - const Slice& key, uint32_t hash, - const ShardedCache::CacheItemHelper* helper, - const ShardedCache::CreateCallback& create_cb, Cache::Priority priority, - bool wait, Statistics* stats) { +LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) { LRUHandle* e = nullptr; bool found_dummy_entry{false}; { @@ -607,11 +608,10 @@ Cache::Handle* LRUCacheShard::Lookup( assert(e == nullptr); } } - return reinterpret_cast(e); + return e; } -bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* e = reinterpret_cast(h); +bool LRUCacheShard::Ref(LRUHandle* e) { DMutexLock l(mutex_); // To create another reference - entry must be already externally referenced. assert(e->HasRefs()); @@ -635,11 +635,11 @@ void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) { MaintainPoolSize(); } -bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) { - if (handle == nullptr) { +bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/, + bool erase_if_last_ref) { + if (e == nullptr) { return false; } - LRUHandle* e = reinterpret_cast(handle); bool last_reference = false; // Must Wait or WaitAll first on pending handles. Otherwise, would leak // a secondary cache handle. @@ -679,7 +679,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), const Cache::CacheItemHelper* helper, - Cache::Handle** handle, Cache::Priority priority) { + LRUHandle** handle, Cache::Priority priority) { // Allocate the memory here outside of the mutex. // If the cache is full, we'll have to release it. // It shouldn't happen very often though. 
@@ -738,8 +738,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { } } -bool LRUCacheShard::IsReady(Cache::Handle* handle) { - LRUHandle* e = reinterpret_cast(handle); +bool LRUCacheShard::IsReady(LRUHandle* e) { bool ready = true; if (e->IsPending()) { assert(secondary_cache_); @@ -770,7 +769,7 @@ size_t LRUCacheShard::GetTableAddressCount() const { return size_t{1} << table_.GetLengthBits(); } -std::string LRUCacheShard::GetPrintableOptions() const { +void LRUCacheShard::AppendPrintableOptions(std::string& str) const { const int kBufferSize = 200; char buffer[kBufferSize]; { @@ -780,7 +779,7 @@ std::string LRUCacheShard::GetPrintableOptions() const { snprintf(buffer + strlen(buffer), kBufferSize - strlen(buffer), " low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_); } - return std::string(buffer); + str.append(buffer); } LRUCache::LRUCache(size_t capacity, int num_shard_bits, @@ -789,38 +788,18 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, std::shared_ptr allocator, bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, - const std::shared_ptr& secondary_cache) + std::shared_ptr _secondary_cache) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, - std::move(allocator)) { - num_shards_ = 1 << num_shard_bits; - shards_ = reinterpret_cast( - port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); - size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; - for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) LRUCacheShard( + std::move(allocator)), + secondary_cache_(std::move(_secondary_cache)) { + size_t per_shard = GetPerShardCapacity(); + SecondaryCache* secondary_cache = secondary_cache_.get(); + InitShards([=](LRUCacheShard* cs) { + new (cs) LRUCacheShard( per_shard, strict_capacity_limit, high_pri_pool_ratio, low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy, /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); - } - secondary_cache_ = secondary_cache; -} - -LRUCache::~LRUCache() { - if (shards_ != nullptr) { - assert(num_shards_ > 0); - for (int i = 0; i < num_shards_; i++) { - shards_[i].~LRUCacheShard(); - } - port::cacheline_aligned_free(shards_); - } -} - -CacheShard* LRUCache::GetShard(uint32_t shard) { - return reinterpret_cast(&shards_[shard]); -} - -const CacheShard* LRUCache::GetShard(uint32_t shard) const { - return reinterpret_cast(&shards_[shard]); + }); } void* LRUCache::Value(Handle* handle) { @@ -831,12 +810,8 @@ void* LRUCache::Value(Handle* handle) { } size_t LRUCache::GetCharge(Handle* handle) const { - CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata; - if (num_shards_ > 0) { - metadata_charge_policy = shards_[0].metadata_charge_policy_; - } return reinterpret_cast(handle)->GetCharge( - metadata_charge_policy); + GetShard(0).metadata_charge_policy_); } Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { @@ -848,32 +823,12 @@ Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { } } -uint32_t LRUCache::GetHash(Handle* handle) const { - return reinterpret_cast(handle)->hash; -} - -void LRUCache::DisownData() { - // Leak data only if that won't generate an ASAN/valgrind warning. 
- if (!kMustFreeHeapAllocations) { - shards_ = nullptr; - num_shards_ = 0; - } -} - size_t LRUCache::TEST_GetLRUSize() { - size_t lru_size_of_all_shards = 0; - for (int i = 0; i < num_shards_; i++) { - lru_size_of_all_shards += shards_[i].TEST_GetLRUSize(); - } - return lru_size_of_all_shards; + return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); }); } double LRUCache::GetHighPriPoolRatio() { - double result = 0.0; - if (num_shards_ > 0) { - result = shards_[0].GetHighPriPoolRatio(); - } - return result; + return GetShard(0).GetHighPriPoolRatio(); } void LRUCache::WaitAll(std::vector& handles) { @@ -899,22 +854,17 @@ void LRUCache::WaitAll(std::vector& handles) { if (!lru_handle->IsPending()) { continue; } - uint32_t hash = GetHash(handle); - LRUCacheShard* shard = static_cast(GetShard(Shard(hash))); - shard->Promote(lru_handle); + GetShard(lru_handle->hash).Promote(lru_handle); } } } -std::string LRUCache::GetPrintableOptions() const { - std::string ret; - ret.reserve(20000); - ret.append(ShardedCache::GetPrintableOptions()); +void LRUCache::AppendPrintableOptions(std::string& str) const { + ShardedCache::AppendPrintableOptions(str); // options from shard if (secondary_cache_) { - ret.append(" secondary_cache:\n"); - ret.append(secondary_cache_->GetPrintableOptions()); + str.append(" secondary_cache:\n"); + str.append(secondary_cache_->GetPrintableOptions()); } - return ret; } } // namespace lru_cache diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 062cbcb67f..ff5d364679 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -53,7 +53,7 @@ struct LRUHandle { Info() {} ~Info() {} Cache::DeleterFn deleter; - const ShardedCache::CacheItemHelper* helper; + const Cache::CacheItemHelper* helper; } info_; // An entry is not added to the LRUHandleTable until the secondary cache // lookup is complete, so its safe to have this union. @@ -108,6 +108,9 @@ struct LRUHandle { Slice key() const { return Slice(key_data, key_length); } + // For HandleImpl concept + uint32_t GetHash() const { return hash; } + // Increase the reference count by 1. void Ref() { refs++; } @@ -262,9 +265,6 @@ struct LRUHandle { // 4.4.3's builtin hashtable. class LRUHandleTable { public: - // If the table uses more hash bits than `max_upper_hash_bits`, - // it will eat into the bits used for sharding, which are constant - // for a given LRUHandleTable. explicit LRUHandleTable(int max_upper_hash_bits); ~LRUHandleTable(); @@ -273,8 +273,8 @@ class LRUHandleTable { LRUHandle* Remove(const Slice& key, uint32_t hash); template - void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { - for (uint32_t i = index_begin; i < index_end; i++) { + void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) { + for (size_t i = index_begin; i < index_end; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; @@ -313,23 +313,31 @@ class LRUHandleTable { }; // A single shard of sharded cache. 
-class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { +class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, double low_pri_pool_ratio, bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, - int max_upper_hash_bits, - const std::shared_ptr& secondary_cache); - virtual ~LRUCacheShard() override = default; + int max_upper_hash_bits, SecondaryCache* secondary_cache); + + public: // Type definitions expected as parameter to ShardedCache + using HandleImpl = LRUHandle; + using HashVal = uint32_t; + using HashCref = uint32_t; + + public: // Function definitions expected as parameter to ShardedCache + static inline HashVal ComputeHash(const Slice& key) { + return Lower32of64(GetSliceNPHash64(key)); + } // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to // free the needed space. - virtual void SetCapacity(size_t capacity) override; + void SetCapacity(size_t capacity); // Set the flag to reject insertion if cache if full. - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; + void SetStrictCapacityLimit(bool strict_capacity_limit); // Set percentage of capacity reserved for high-pri cache entries. void SetHighPriorityPoolRatio(double high_pri_pool_ratio); @@ -338,58 +346,49 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { void SetLowPriorityPoolRatio(double low_pri_pool_ratio); // Like Cache methods, but with an extra "hash" parameter. - virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, Cache::DeleterFn deleter, - Cache::Handle** handle, - Cache::Priority priority) override { + inline Status Insert(const Slice& key, uint32_t hash, void* value, + size_t charge, Cache::DeleterFn deleter, + LRUHandle** handle, Cache::Priority priority) { return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); } - virtual Status Insert(const Slice& key, uint32_t hash, void* value, - const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, - Cache::Priority priority) override { + inline Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + LRUHandle** handle, Cache::Priority priority) { assert(helper); return Insert(key, hash, value, charge, nullptr, helper, handle, priority); } // If helper_cb is null, the values of the following arguments don't matter. 
- virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const ShardedCache::CacheItemHelper* helper, - const ShardedCache::CreateCallback& create_cb, - ShardedCache::Priority priority, bool wait, - Statistics* stats) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override { + LRUHandle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, Statistics* stats); + inline LRUHandle* Lookup(const Slice& key, uint32_t hash) { return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true, nullptr); } - virtual bool Release(Cache::Handle* handle, bool /*useful*/, - bool erase_if_last_ref) override { - return Release(handle, erase_if_last_ref); - } - virtual bool IsReady(Cache::Handle* /*handle*/) override; - virtual void Wait(Cache::Handle* /*handle*/) override {} - virtual bool Ref(Cache::Handle* handle) override; - virtual bool Release(Cache::Handle* handle, - bool erase_if_last_ref = false) override; - virtual void Erase(const Slice& key, uint32_t hash) override; + bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref); + bool IsReady(LRUHandle* /*handle*/); + void Wait(LRUHandle* /*handle*/) {} + bool Ref(LRUHandle* handle); + void Erase(const Slice& key, uint32_t hash); // Although in some platforms the update of size_t is atomic, to make sure // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll // protect them with mutex_. - virtual size_t GetUsage() const override; - virtual size_t GetPinnedUsage() const override; - virtual size_t GetOccupancyCount() const override; - virtual size_t GetTableAddressCount() const override; + size_t GetUsage() const; + size_t GetPinnedUsage() const; + size_t GetOccupancyCount() const; + size_t GetTableAddressCount() const; - virtual void ApplyToSomeEntries( + void ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) override; + size_t average_entries_per_lock, size_t* state); - virtual void EraseUnRefEntries() override; - - virtual std::string GetPrintableOptions() const override; + void EraseUnRefEntries(); + public: // other function definitions void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri, LRUHandle** lru_bottom_pri); @@ -403,17 +402,19 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // Retrieves low pri pool ratio double GetLowPriPoolRatio(); + void AppendPrintableOptions(std::string& /*str*/) const; + private: friend class LRUCache; // Insert an item into the hash table and, if handle is null, insert into // the LRU list. Older items are evicted as necessary. If the cache is full // and free_handle_on_fail is true, the item is deleted and handle is set to // nullptr. - Status InsertItem(LRUHandle* item, Cache::Handle** handle, + Status InsertItem(LRUHandle* item, LRUHandle** handle, bool free_handle_on_fail); Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, DeleterFn deleter, const Cache::CacheItemHelper* helper, - Cache::Handle** handle, Cache::Priority priority); + LRUHandle** handle, Cache::Priority priority); // Promote an item looked up from the secondary cache to the LRU cache. // The item may be still in the secondary cache. 
// It is only inserted into the hash table and not the LRU list, and only @@ -500,14 +501,15 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // don't mind mutex_ invoking the non-const actions. mutable DMutex mutex_; - std::shared_ptr secondary_cache_; + // Owned by LRUCache + SecondaryCache* secondary_cache_; }; class LRUCache #ifdef NDEBUG final #endif - : public ShardedCache { + : public ShardedCache { public: LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, double low_pri_pool_ratio, @@ -515,27 +517,21 @@ class LRUCache bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = kDontChargeCacheMetadata, - const std::shared_ptr& secondary_cache = nullptr); - virtual ~LRUCache(); - virtual const char* Name() const override { return "LRUCache"; } - virtual CacheShard* GetShard(uint32_t shard) override; - virtual const CacheShard* GetShard(uint32_t shard) const override; - virtual void* Value(Handle* handle) override; - virtual size_t GetCharge(Handle* handle) const override; - virtual uint32_t GetHash(Handle* handle) const override; - virtual DeleterFn GetDeleter(Handle* handle) const override; - virtual void DisownData() override; - virtual void WaitAll(std::vector& handles) override; - std::string GetPrintableOptions() const override; + std::shared_ptr secondary_cache = nullptr); + const char* Name() const override { return "LRUCache"; } + void* Value(Handle* handle) override; + size_t GetCharge(Handle* handle) const override; + DeleterFn GetDeleter(Handle* handle) const override; + void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only. size_t TEST_GetLRUSize(); // Retrieves high pri pool ratio. 
double GetHighPriPoolRatio(); + void AppendPrintableOptions(std::string& str) const override; + private: - LRUCacheShard* shards_ = nullptr; - int num_shards_ = 0; std::shared_ptr secondary_cache_; }; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 748908d0cb..fbf336f873 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -67,7 +67,7 @@ class LRUCacheTest : public testing::Test { bool Lookup(const std::string& key) { auto handle = cache_->Lookup(key, 0 /*hash*/); if (handle) { - cache_->Release(handle); + cache_->Release(handle, true /*useful*/, false /*erase*/); return true; } return false; @@ -529,22 +529,27 @@ class ClockCacheTest : public testing::Test { kDontChargeCacheMetadata); } - Status Insert(const std::string& key, + Status Insert(const UniqueId64x2& hashed_key, Cache::Priority priority = Cache::Priority::LOW) { - return shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, - nullptr /*deleter*/, nullptr /*handle*/, priority); + return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/, + 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/, + priority); } Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { - return Insert(std::string(kCacheKeySize, key), priority); + return Insert(TestHashedKey(key), priority); } Status InsertWithLen(char key, size_t len) { - return Insert(std::string(len, key)); + std::string skey(len, key); + return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/, + 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/, + Cache::Priority::LOW); } - bool Lookup(const std::string& key, bool useful = true) { - auto handle = shard_->Lookup(key, 0 /*hash*/); + bool Lookup(const Slice& key, const UniqueId64x2& hashed_key, + bool useful = true) { + auto handle = shard_->Lookup(key, hashed_key); if (handle) { shard_->Release(handle, useful, /*erase_if_last_ref=*/false); return true; @@ -552,44 +557,29 @@ class ClockCacheTest : public testing::Test { return false; } + bool Lookup(const UniqueId64x2& hashed_key, bool useful = true) { + return Lookup(TestKey(hashed_key), hashed_key, useful); + } + bool Lookup(char key, bool useful = true) { - return Lookup(std::string(kCacheKeySize, key), useful); + return Lookup(TestHashedKey(key), useful); } - void Erase(const std::string& key) { shard_->Erase(key, 0 /*hash*/); } - -#if 0 // FIXME - size_t CalcEstimatedHandleChargeWrapper( - size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - return ClockCacheShard::CalcEstimatedHandleCharge(estimated_value_size, - metadata_charge_policy); + void Erase(char key) { + UniqueId64x2 hashed_key = TestHashedKey(key); + shard_->Erase(TestKey(hashed_key), hashed_key); } - int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - return ClockCacheShard::CalcHashBits(capacity, estimated_value_size, - metadata_charge_policy); + static inline Slice TestKey(const UniqueId64x2& hashed_key) { + return Slice(reinterpret_cast(&hashed_key), 16U); } - // Maximum number of items that a shard can hold. 
- double CalcMaxOccupancy(size_t capacity, size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - size_t handle_charge = ClockCacheShard::CalcEstimatedHandleCharge( - estimated_value_size, metadata_charge_policy); - return capacity / (kLoadFactor * handle_charge); + static inline UniqueId64x2 TestHashedKey(char key) { + // For testing hash near-collision behavior, put the variance in + // hashed_key in bits that are unlikely to be used as hash bits. + return {(static_cast(key) << 56) + 1234U, 5678U}; } - bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) { - if (hash_bits == 0) { - return max_occupancy <= 1; - } else { - return (1 << hash_bits >= max_occupancy) && - (1 << (hash_bits - 1) <= max_occupancy); - } - } -#endif - ClockCacheShard* shard_ = nullptr; }; @@ -607,10 +597,10 @@ TEST_F(ClockCacheTest, Misc) { // Some of this is motivated by code coverage std::string wrong_size_key(15, 'x'); - EXPECT_FALSE(Lookup(wrong_size_key)); + EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x'))); EXPECT_FALSE(shard_->Ref(nullptr)); EXPECT_FALSE(shard_->Release(nullptr)); - shard_->Erase(wrong_size_key, /*hash*/ 42); // no-op + shard_->Erase(wrong_size_key, TestHashedKey('x')); // no-op } TEST_F(ClockCacheTest, Limits) { @@ -622,11 +612,11 @@ TEST_F(ClockCacheTest, Limits) { // Also tests switching between strict limit and not shard_->SetStrictCapacityLimit(strict_capacity_limit); - std::string key(16, 'x'); + UniqueId64x2 hkey = TestHashedKey('x'); // Single entry charge beyond capacity { - Status s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, + Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, 5 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/, Cache::Priority::LOW); if (strict_capacity_limit) { @@ -638,9 +628,10 @@ TEST_F(ClockCacheTest, Limits) { // Single entry fills capacity { - Cache::Handle* h; - ASSERT_OK(shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 3 /*charge*/, - nullptr /*deleter*/, &h, Cache::Priority::LOW)); + ClockHandle* h; + ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, + 3 /*charge*/, nullptr /*deleter*/, &h, + Cache::Priority::LOW)); // Try to insert more Status s = Insert('a'); if (strict_capacity_limit) { @@ -657,11 +648,11 @@ TEST_F(ClockCacheTest, Limits) { // entries) to exceed occupancy limit. 
{ size_t n = shard_->GetTableAddressCount() + 1; - std::unique_ptr ha { new Cache::Handle* [n] {} }; + std::unique_ptr ha { new ClockHandle* [n] {} }; Status s; for (size_t i = 0; i < n && s.ok(); ++i) { - EncodeFixed64(&key[0], i); - s = shard_->Insert(key, 0 /*hash*/, nullptr /*value*/, 0 /*charge*/, + hkey[1] = i; + s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, 0 /*charge*/, nullptr /*deleter*/, &ha[i], Cache::Priority::LOW); if (i == 0) { EXPECT_OK(s); @@ -807,12 +798,11 @@ void IncrementIntDeleter(const Slice& /*key*/, void* value) { // Testing calls to CorrectNearOverflow in Release TEST_F(ClockCacheTest, ClockCounterOverflowTest) { NewShard(6, /*strict_capacity_limit*/ false); - Cache::Handle* h; + ClockHandle* h; int deleted = 0; - std::string my_key(kCacheKeySize, 'x'); - uint32_t my_hash = 42; - ASSERT_OK(shard_->Insert(my_key, my_hash, &deleted, 1, IncrementIntDeleter, - &h, Cache::Priority::HIGH)); + UniqueId64x2 hkey = TestHashedKey('x'); + ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1, + IncrementIntDeleter, &h, Cache::Priority::HIGH)); // Some large number outstanding shard_->TEST_RefN(h, 123456789); @@ -822,7 +812,7 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) { shard_->TEST_ReleaseN(h, 1234567); } // Mark it invisible (to reach a different CorrectNearOverflow() in Release) - shard_->Erase(my_key, my_hash); + shard_->Erase(TestKey(hkey), hkey); // Simulate many more lookup/ref + release (one-by-one would be too // expensive for unit test) for (int i = 0; i < 10000; ++i) { @@ -844,63 +834,65 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) { TEST_F(ClockCacheTest, CollidingInsertEraseTest) { NewShard(6, /*strict_capacity_limit*/ false); int deleted = 0; - std::string key1(kCacheKeySize, 'x'); - std::string key2(kCacheKeySize, 'y'); - std::string key3(kCacheKeySize, 'z'); - uint32_t my_hash = 42; - Cache::Handle* h1; - ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, &h1, + UniqueId64x2 hkey1 = TestHashedKey('x'); + Slice key1 = TestKey(hkey1); + UniqueId64x2 hkey2 = TestHashedKey('y'); + Slice key2 = TestKey(hkey2); + UniqueId64x2 hkey3 = TestHashedKey('z'); + Slice key3 = TestKey(hkey3); + ClockHandle* h1; + ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1, Cache::Priority::HIGH)); - Cache::Handle* h2; - ASSERT_OK(shard_->Insert(key2, my_hash, &deleted, 1, IncrementIntDeleter, &h2, + ClockHandle* h2; + ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2, Cache::Priority::HIGH)); - Cache::Handle* h3; - ASSERT_OK(shard_->Insert(key3, my_hash, &deleted, 1, IncrementIntDeleter, &h3, + ClockHandle* h3; + ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3, Cache::Priority::HIGH)); // Can repeatedly lookup+release despite the hash collision - Cache::Handle* tmp_h; + ClockHandle* tmp_h; for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key1, my_hash); + tmp_h = shard_->Lookup(key1, hkey1); ASSERT_EQ(h1, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key2, my_hash); + tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, my_hash); + tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); } // Make h1 invisible - shard_->Erase(key1, my_hash); + shard_->Erase(key1, hkey1); // Redundant erase - shard_->Erase(key1, 
my_hash); + shard_->Erase(key1, hkey1); // All still alive ASSERT_EQ(deleted, 0); // Invisible to Lookup - tmp_h = shard_->Lookup(key1, my_hash); + tmp_h = shard_->Lookup(key1, hkey1); ASSERT_EQ(nullptr, tmp_h); // Can still find h2, h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key2, my_hash); + tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, my_hash); + tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); } // Also Insert with invisible entry there - ASSERT_OK(shard_->Insert(key1, my_hash, &deleted, 1, IncrementIntDeleter, + ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, nullptr, Cache::Priority::HIGH)); - tmp_h = shard_->Lookup(key1, my_hash); + tmp_h = shard_->Lookup(key1, hkey1); // Found but distinct handle ASSERT_NE(nullptr, tmp_h); ASSERT_NE(h1, tmp_h); @@ -918,11 +910,11 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { // Can still find h2, h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key2, my_hash); + tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, my_hash); + tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); } @@ -934,7 +926,7 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_EQ(deleted, 0); // Can still find it - tmp_h = shard_->Lookup(key2, my_hash); + tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); // Release last ref on h2, with erase @@ -942,12 +934,12 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { // h2 deleted ASSERT_EQ(deleted--, 1); - tmp_h = shard_->Lookup(key2, my_hash); + tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(nullptr, tmp_h); // Can still find h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key3, my_hash); + tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); } @@ -959,11 +951,11 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_EQ(deleted, 0); // Explicit erase - shard_->Erase(key3, my_hash); + shard_->Erase(key3, hkey3); // h3 deleted ASSERT_EQ(deleted--, 1); - tmp_h = shard_->Lookup(key3, my_hash); + tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(nullptr, tmp_h); } @@ -1371,9 +1363,11 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM + ASSERT_EQ(secondary_cache->num_inserts(), 0u); ASSERT_OK(cache->Insert(k2.AsSlice(), item2, &LRUCacheSecondaryCacheTest::helper_fail_, str2.length())); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); Cache::Handle* handle; handle = diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 3e6d6a4f73..9ebca3ba82 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -19,184 +19,49 @@ namespace ROCKSDB_NAMESPACE { -namespace { - -inline uint32_t HashSlice(const Slice& s) { - return Lower32of64(GetSliceNPHash64(s)); -} - -} // namespace - -ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - std::shared_ptr allocator) +ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits, + bool strict_capacity_limit, + 
std::shared_ptr allocator) : Cache(std::move(allocator)), + last_id_(1), shard_mask_((uint32_t{1} << num_shard_bits) - 1), - capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), - last_id_(1) {} + capacity_(capacity) {} -void ShardedCache::SetCapacity(size_t capacity) { +size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const { uint32_t num_shards = GetNumShards(); - const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; - MutexLock l(&capacity_mutex_); - for (uint32_t s = 0; s < num_shards; s++) { - GetShard(s)->SetCapacity(per_shard); - } - capacity_ = capacity; + return (capacity + (num_shards - 1)) / num_shards; } -void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) { - uint32_t num_shards = GetNumShards(); - MutexLock l(&capacity_mutex_); - for (uint32_t s = 0; s < num_shards; s++) { - GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit); - } - strict_capacity_limit_ = strict_capacity_limit; +size_t ShardedCacheBase::GetPerShardCapacity() const { + return ComputePerShardCapacity(GetCapacity()); } -Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, - DeleterFn deleter, Handle** handle, - Priority priority) { - uint32_t hash = HashSlice(key); - return GetShard(Shard(hash)) - ->Insert(key, hash, value, charge, deleter, handle, priority); -} - -Status ShardedCache::Insert(const Slice& key, void* value, - const CacheItemHelper* helper, size_t charge, - Handle** handle, Priority priority) { - uint32_t hash = HashSlice(key); - if (!helper) { - return Status::InvalidArgument(); - } - return GetShard(Shard(hash)) - ->Insert(key, hash, value, helper, charge, handle, priority); -} - -Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { - uint32_t hash = HashSlice(key); - return GetShard(Shard(hash))->Lookup(key, hash); -} - -Cache::Handle* ShardedCache::Lookup(const Slice& key, - const CacheItemHelper* helper, - const CreateCallback& create_cb, - Priority priority, bool wait, - Statistics* stats) { - uint32_t hash = HashSlice(key); - return GetShard(Shard(hash)) - ->Lookup(key, hash, helper, create_cb, priority, wait, stats); -} - -bool ShardedCache::IsReady(Handle* handle) { - uint32_t hash = GetHash(handle); - return GetShard(Shard(hash))->IsReady(handle); -} - -void ShardedCache::Wait(Handle* handle) { - uint32_t hash = GetHash(handle); - GetShard(Shard(hash))->Wait(handle); -} - -bool ShardedCache::Ref(Handle* handle) { - uint32_t hash = GetHash(handle); - return GetShard(Shard(hash))->Ref(handle); -} - -bool ShardedCache::Release(Handle* handle, bool erase_if_last_ref) { - uint32_t hash = GetHash(handle); - return GetShard(Shard(hash))->Release(handle, erase_if_last_ref); -} - -bool ShardedCache::Release(Handle* handle, bool useful, - bool erase_if_last_ref) { - uint32_t hash = GetHash(handle); - return GetShard(Shard(hash))->Release(handle, useful, erase_if_last_ref); -} - -void ShardedCache::Erase(const Slice& key) { - uint32_t hash = HashSlice(key); - GetShard(Shard(hash))->Erase(key, hash); -} - -uint64_t ShardedCache::NewId() { +uint64_t ShardedCacheBase::NewId() { return last_id_.fetch_add(1, std::memory_order_relaxed); } -size_t ShardedCache::GetCapacity() const { - MutexLock l(&capacity_mutex_); +size_t ShardedCacheBase::GetCapacity() const { + MutexLock l(&config_mutex_); return capacity_; } -bool ShardedCache::HasStrictCapacityLimit() const { - MutexLock l(&capacity_mutex_); +bool ShardedCacheBase::HasStrictCapacityLimit() const { + MutexLock l(&config_mutex_); 
return strict_capacity_limit_; } -size_t ShardedCache::GetUsage() const { - // We will not lock the cache when getting the usage from shards. - uint32_t num_shards = GetNumShards(); - size_t usage = 0; - for (uint32_t s = 0; s < num_shards; s++) { - usage += GetShard(s)->GetUsage(); - } - return usage; -} - -size_t ShardedCache::GetUsage(Handle* handle) const { +size_t ShardedCacheBase::GetUsage(Handle* handle) const { return GetCharge(handle); } -size_t ShardedCache::GetPinnedUsage() const { - // We will not lock the cache when getting the usage from shards. - uint32_t num_shards = GetNumShards(); - size_t usage = 0; - for (uint32_t s = 0; s < num_shards; s++) { - usage += GetShard(s)->GetPinnedUsage(); - } - return usage; -} - -void ShardedCache::ApplyToAllEntries( - const std::function& callback, - const ApplyToAllEntriesOptions& opts) { - uint32_t num_shards = GetNumShards(); - // Iterate over part of each shard, rotating between shards, to - // minimize impact on latency of concurrent operations. - std::unique_ptr states(new uint32_t[num_shards]{}); - - uint32_t aepl_in_32 = static_cast( - std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock)); - aepl_in_32 = std::min(aepl_in_32, uint32_t{1}); - - bool remaining_work; - do { - remaining_work = false; - for (uint32_t s = 0; s < num_shards; s++) { - if (states[s] != UINT32_MAX) { - GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]); - remaining_work |= states[s] != UINT32_MAX; - } - } - } while (remaining_work); -} - -void ShardedCache::EraseUnRefEntries() { - uint32_t num_shards = GetNumShards(); - for (uint32_t s = 0; s < num_shards; s++) { - GetShard(s)->EraseUnRefEntries(); - } -} - -std::string ShardedCache::GetPrintableOptions() const { +std::string ShardedCacheBase::GetPrintableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; char buffer[kBufferSize]; { - MutexLock l(&capacity_mutex_); + MutexLock l(&config_mutex_); snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_); ret.append(buffer); @@ -210,7 +75,7 @@ std::string ShardedCache::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " memory_allocator : %s\n", memory_allocator() ? 
memory_allocator()->Name() : "None"); ret.append(buffer); - ret.append(GetShard(0)->GetPrintableOptions()); + AppendPrintableOptions(ret); return ret; } @@ -226,25 +91,10 @@ int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) { return num_shard_bits; } -int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } - -uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } - -size_t ShardedCache::GetOccupancyCount() const { - size_t oc = 0; - uint32_t num_shards = GetNumShards(); - for (uint32_t s = 0; s < num_shards; s++) { - oc += GetShard(s)->GetOccupancyCount(); - } - return oc; -} -size_t ShardedCache::GetTableAddressCount() const { - size_t tac = 0; - uint32_t num_shards = GetNumShards(); - for (uint32_t s = 0; s < num_shards; s++) { - tac += GetShard(s)->GetTableAddressCount(); - } - return tac; +int ShardedCacheBase::GetNumShardBits() const { + return BitsSetToOne(shard_mask_); } +uint32_t ShardedCacheBase::GetNumShards() const { return shard_mask_ + 1; } + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 8713d1dce9..e3271cc7bd 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -10,122 +10,309 @@ #pragma once #include +#include #include +#include "port/lang.h" #include "port/port.h" #include "rocksdb/cache.h" +#include "util/hash.h" +#include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -// Single cache shard interface. -class CacheShard { +// Optional base class for classes implementing the CacheShard concept +class CacheShardBase { public: - explicit CacheShard(CacheMetadataChargePolicy metadata_charge_policy) + explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy) : metadata_charge_policy_(metadata_charge_policy) {} - virtual ~CacheShard() = default; using DeleterFn = Cache::DeleterFn; - virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, DeleterFn deleter, - Cache::Handle** handle, Cache::Priority priority) = 0; - virtual Status Insert(const Slice& key, uint32_t hash, void* value, - const Cache::CacheItemHelper* helper, size_t charge, - Cache::Handle** handle, Cache::Priority priority) = 0; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* helper, - const Cache::CreateCallback& create_cb, - Cache::Priority priority, bool wait, - Statistics* stats) = 0; - virtual bool Release(Cache::Handle* handle, bool useful, - bool erase_if_last_ref) = 0; - virtual bool IsReady(Cache::Handle* handle) = 0; - virtual void Wait(Cache::Handle* handle) = 0; - virtual bool Ref(Cache::Handle* handle) = 0; - virtual bool Release(Cache::Handle* handle, bool erase_if_last_ref) = 0; - virtual void Erase(const Slice& key, uint32_t hash) = 0; - virtual void SetCapacity(size_t capacity) = 0; - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; - virtual size_t GetUsage() const = 0; - virtual size_t GetPinnedUsage() const = 0; - virtual size_t GetOccupancyCount() const = 0; - virtual size_t GetTableAddressCount() const = 0; + + // Expected by concept CacheShard (TODO with C++20 support) + // Some Defaults + std::string GetPrintableOptions() const { return ""; } + using HashVal = uint64_t; + using HashCref = uint64_t; + static inline HashVal ComputeHash(const Slice& key) { + return GetSliceNPHash64(key); + } + static inline uint32_t HashPieceForSharding(HashCref hash) { + return Lower32of64(hash); + } + void 
AppendPrintableOptions(std::string& /*str*/) const {} + + // Must be provided for concept CacheShard (TODO with C++20 support) + /* + struct HandleImpl { // for concept HandleImpl + HashVal hash; + HashCref GetHash() const; + ... + }; + Status Insert(const Slice& key, HashCref hash, void* value, size_t charge, + DeleterFn deleter, HandleImpl** handle, + Cache::Priority priority) = 0; + Status Insert(const Slice& key, HashCref hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + HandleImpl** handle, Cache::Priority priority) = 0; + HandleImpl* Lookup(const Slice& key, HashCref hash) = 0; + HandleImpl* Lookup(const Slice& key, HashCref hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) = 0; + bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0; + bool IsReady(HandleImpl* handle) = 0; + void Wait(HandleImpl* handle) = 0; + bool Ref(HandleImpl* handle) = 0; + void Erase(const Slice& key, HashCref hash) = 0; + void SetCapacity(size_t capacity) = 0; + void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + size_t GetUsage() const = 0; + size_t GetPinnedUsage() const = 0; + size_t GetOccupancyCount() const = 0; + size_t GetTableAddressCount() const = 0; // Handles iterating over roughly `average_entries_per_lock` entries, using // `state` to somehow record where it last ended up. Caller initially uses - // *state == 0 and implementation sets *state = UINT32_MAX to indicate + // *state == 0 and implementation sets *state = SIZE_MAX to indicate // completion. - virtual void ApplyToSomeEntries( + void ApplyToSomeEntries( const std::function& callback, - uint32_t average_entries_per_lock, uint32_t* state) = 0; - virtual void EraseUnRefEntries() = 0; - virtual std::string GetPrintableOptions() const { return ""; } + size_t average_entries_per_lock, size_t* state) = 0; + void EraseUnRefEntries() = 0; + */ protected: const CacheMetadataChargePolicy metadata_charge_policy_; }; -// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits -// shards will be created, with capacity split evenly to each of the shards. -// Keys are sharded by the highest num_shard_bits bits of hash value. 
-class ShardedCache : public Cache { +// Portions of ShardedCache that do not depend on the template parameter +class ShardedCacheBase : public Cache { public: - ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - std::shared_ptr memory_allocator = nullptr); - virtual ~ShardedCache() = default; - virtual CacheShard* GetShard(uint32_t shard) = 0; - virtual const CacheShard* GetShard(uint32_t shard) const = 0; - - virtual uint32_t GetHash(Handle* handle) const = 0; - - virtual void SetCapacity(size_t capacity) override; - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; - - virtual Status Insert(const Slice& key, void* value, size_t charge, - DeleterFn deleter, Handle** handle, - Priority priority) override; - virtual Status Insert(const Slice& key, void* value, - const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override; - virtual Handle* Lookup(const Slice& key, Statistics* stats) override; - virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - const CreateCallback& create_cb, Priority priority, - bool wait, Statistics* stats = nullptr) override; - virtual bool Release(Handle* handle, bool useful, - bool erase_if_last_ref = false) override; - virtual bool IsReady(Handle* handle) override; - virtual void Wait(Handle* handle) override; - virtual bool Ref(Handle* handle) override; - virtual bool Release(Handle* handle, bool erase_if_last_ref = false) override; - virtual void Erase(const Slice& key) override; - virtual uint64_t NewId() override; - virtual size_t GetCapacity() const override; - virtual bool HasStrictCapacityLimit() const override; - virtual size_t GetUsage() const override; - virtual size_t GetUsage(Handle* handle) const override; - virtual size_t GetPinnedUsage() const override; - virtual size_t GetOccupancyCount() const override; - virtual size_t GetTableAddressCount() const override; - virtual void ApplyToAllEntries( - const std::function& callback, - const ApplyToAllEntriesOptions& opts) override; - virtual void EraseUnRefEntries() override; - virtual std::string GetPrintableOptions() const override; + ShardedCacheBase(size_t capacity, int num_shard_bits, + bool strict_capacity_limit, + std::shared_ptr memory_allocator); + virtual ~ShardedCacheBase() = default; int GetNumShardBits() const; uint32_t GetNumShards() const; + uint64_t NewId() override; + + bool HasStrictCapacityLimit() const override; + size_t GetCapacity() const override; + + using Cache::GetUsage; + size_t GetUsage(Handle* handle) const override; + std::string GetPrintableOptions() const override; + + protected: // fns + virtual void AppendPrintableOptions(std::string& str) const = 0; + size_t GetPerShardCapacity() const; + size_t ComputePerShardCapacity(size_t capacity) const; + + protected: // data + std::atomic last_id_; // For NewId + const uint32_t shard_mask_; + + // Dynamic configuration parameters, guarded by config_mutex_ + bool strict_capacity_limit_; + size_t capacity_; + mutable port::Mutex config_mutex_; +}; + +// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits +// shards will be created, with capacity split evenly to each of the shards. +// Keys are typically sharded by the lowest num_shard_bits bits of hash value +// so that the upper bits of the hash value can keep a stable ordering of +// table entries even as the table grows (using more upper hash bits). 
+// See CacheShardBase above for what is expected of the CacheShard parameter.
+template <class CacheShard>
+class ShardedCache : public ShardedCacheBase {
+ public:
+  using HashVal = typename CacheShard::HashVal;
+  using HashCref = typename CacheShard::HashCref;
+  using HandleImpl = typename CacheShard::HandleImpl;
+
+  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+               std::shared_ptr<MemoryAllocator> allocator)
+      : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
+                         allocator),
+        shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
+            sizeof(CacheShard) * GetNumShards()))),
+        destroy_shards_in_dtor_(false) {}
+
+  virtual ~ShardedCache() {
+    if (destroy_shards_in_dtor_) {
+      ForEachShard([](CacheShard* cs) { cs->~CacheShard(); });
+    }
+    port::cacheline_aligned_free(shards_);
+  }
+
+  CacheShard& GetShard(HashCref hash) {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  const CacheShard& GetShard(HashCref hash) const {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  void SetCapacity(size_t capacity) override {
+    MutexLock l(&config_mutex_);
+    capacity_ = capacity;
+    auto per_shard = ComputePerShardCapacity(capacity);
+    ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); });
+  }
+
+  void SetStrictCapacityLimit(bool s_c_l) override {
+    MutexLock l(&config_mutex_);
+    strict_capacity_limit_ = s_c_l;
+    ForEachShard(
+        [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
+  }
+
+  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+                Handle** handle, Priority priority) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
+                                 priority);
+  }
+  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+                size_t charge, Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    if (!helper) {
+      return Status::InvalidArgument();
+    }
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
+                                 priority);
+  }
+
+  Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash);
+    return reinterpret_cast<Handle*>(result);
+  }
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                 const CreateCallback& create_cb, Priority priority, bool wait,
+                 Statistics* stats = nullptr) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
+                                               priority, wait, stats);
+    return reinterpret_cast<Handle*>(result);
+  }
+
+  void Erase(const Slice& key) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    GetShard(hash).Erase(key, hash);
+  }
+
+  bool Release(Handle* handle, bool useful,
+               bool erase_if_last_ref = false) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
+  }
+  bool IsReady(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).IsReady(h);
+  }
+  void Wait(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    GetShard(h->GetHash()).Wait(h);
+  }
+  bool Ref(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Ref(h);
+  }
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    return Release(handle, true /*useful*/, erase_if_last_ref);
+  }
+  using ShardedCacheBase::GetUsage;
+  size_t GetUsage() const override {
+    return SumOverShards2(&CacheShard::GetUsage);
+  }
+  size_t GetPinnedUsage() const override {
+    return SumOverShards2(&CacheShard::GetPinnedUsage);
+  }
+  size_t GetOccupancyCount() const override {
+    return SumOverShards2(&CacheShard::GetOccupancyCount);
+  }
+  size_t GetTableAddressCount() const override {
+    return SumOverShards2(&CacheShard::GetTableAddressCount);
+  }
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) override {
+    uint32_t num_shards = GetNumShards();
+    // Iterate over part of each shard, rotating between shards, to
+    // minimize impact on latency of concurrent operations.
+    std::unique_ptr<size_t[]> states(new size_t[num_shards]{});
+
+    size_t aepl = opts.average_entries_per_lock;
+    aepl = std::min(aepl, size_t{1});
+
+    bool remaining_work;
+    do {
+      remaining_work = false;
+      for (uint32_t i = 0; i < num_shards; i++) {
+        if (states[i] != SIZE_MAX) {
+          shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
+          remaining_work |= states[i] != SIZE_MAX;
+        }
+      }
+    } while (remaining_work);
+  }
+
+  virtual void EraseUnRefEntries() override {
+    ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); });
+  }
+
+  void DisownData() override {
+    // Leak data only if that won't generate an ASAN/valgrind warning.
+    if (!kMustFreeHeapAllocations) {
+      destroy_shards_in_dtor_ = false;
+    }
+  }
+
 protected:
-  inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; }
+  inline void ForEachShard(const std::function<void(CacheShard*)>& fn) {
+    uint32_t num_shards = GetNumShards();
+    for (uint32_t i = 0; i < num_shards; i++) {
+      fn(shards_ + i);
+    }
+  }
+
+  inline size_t SumOverShards(
+      const std::function<size_t(CacheShard&)>& fn) const {
+    uint32_t num_shards = GetNumShards();
+    size_t result = 0;
+    for (uint32_t i = 0; i < num_shards; i++) {
+      result += fn(shards_[i]);
+    }
+    return result;
+  }
+
+  inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const {
+    return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); });
+  }
+
+  // Must be called exactly once by derived class constructor
+  void InitShards(const std::function<void(CacheShard*)>& placement_new) {
+    ForEachShard(placement_new);
+    destroy_shards_in_dtor_ = true;
+  }
+
+  void AppendPrintableOptions(std::string& str) const override {
+    shards_[0].AppendPrintableOptions(str);
+  }
+
 private:
-  const uint32_t shard_mask_;
-  mutable port::Mutex capacity_mutex_;
-  size_t capacity_;
-  bool strict_capacity_limit_;
-  std::atomic<uint64_t> last_id_;
+  CacheShard* const shards_;
+  bool destroy_shards_in_dtor_;
 };
 
 // 512KB is traditional minimum shard size.
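The commented-out CacheShard concept in CacheShardBase and the GetShard() logic above boil down to: the shard type supplies HandleImpl, HashVal, ComputeHash, and HashPieceForSharding, and the template picks a shard with HashPieceForSharding(hash) & shard_mask_. A self-contained sketch under stated assumptions follows; MiniShard and MiniShardedCache are invented for illustration, std::hash stands in for GetSliceNPHash64, and only the hashing and shard-selection plumbing is modeled (no eviction, capacity, or reference counting).

```
// Minimal sketch of the CacheShard "concept" and static-polymorphic sharding.
// Invented names; not RocksDB code.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct MiniShard {
  using HashVal = uint64_t;
  using HashCref = uint64_t;
  struct HandleImpl {
    HashVal hash;
    HashCref GetHash() const { return hash; }
  };
  // std::hash stands in for GetSliceNPHash64.
  static HashVal ComputeHash(const std::string& key) {
    return std::hash<std::string>{}(key);
  }
  static uint32_t HashPieceForSharding(HashCref hash) {
    return static_cast<uint32_t>(hash);  // lower 32 bits
  }
  void Insert(const std::string& key, HashCref hash) {
    std::cout << "shard stores " << key << " (hash=" << hash << ")\n";
  }
};

template <class CacheShard>
class MiniShardedCache {
 public:
  explicit MiniShardedCache(int num_shard_bits)
      : shard_mask_((uint32_t{1} << num_shard_bits) - 1),
        shards_(shard_mask_ + 1) {}

  void Insert(const std::string& key) {
    typename CacheShard::HashVal hash = CacheShard::ComputeHash(key);
    GetShard(hash).Insert(key, hash);  // no virtual dispatch involved
  }

 private:
  CacheShard& GetShard(typename CacheShard::HashCref hash) {
    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
  }
  const uint32_t shard_mask_;
  std::vector<CacheShard> shards_;
};

int main() {
  MiniShardedCache<MiniShard> cache(/*num_shard_bits=*/2);  // 4 shards
  cache.Insert("block:0001");
  cache.Insert("block:0002");
  return 0;
}
```

The same shape lets LRUCacheShard keep a 32-bit hash while ClockCacheShard uses a 128-bit hashed key: each shard type fixes its own HashVal and HandleImpl, and the template adapts at compile time.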
diff --git a/options/options_test.cc b/options/options_test.cc index b89434c3ae..37001379a5 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -613,7 +613,7 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { &new_cf_opt)); ASSERT_NE(new_cf_opt.blob_cache, nullptr); ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL); - ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) ->GetNumShardBits(), 4); ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true); @@ -1064,15 +1064,18 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetHighPriPoolRatio(), @@ -1088,9 +1091,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); // Default values - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), - GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), @@ -1098,10 +1101,11 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), - GetDefaultCacheShardBits( - new_opt.block_cache_compressed->GetCapacity())); + ASSERT_EQ( + std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), @@ -1115,15 +1119,18 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { "high_pri_pool_ratio=0.0;}", &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 5); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); 
ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 5); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), @@ -1139,16 +1146,19 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), @@ -2790,7 +2800,7 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { &new_cf_opt)); ASSERT_NE(new_cf_opt.blob_cache, nullptr); ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL); - ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) ->GetNumShardBits(), 4); ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true); @@ -2970,15 +2980,18 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetHighPriPoolRatio(), @@ -2993,9 +3006,9 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); // Default values - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), - GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), @@ 
-3003,10 +3016,11 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), - GetDefaultCacheShardBits( - new_opt.block_cache_compressed->GetCapacity())); + ASSERT_EQ( + std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), @@ -3020,15 +3034,18 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { "high_pri_pool_ratio=0.0;}", &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 5); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 5); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), @@ -3043,16 +3060,19 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed) + ->GetNumShardBits(), + 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) ->GetHighPriPoolRatio(), diff --git a/port/win/port_win.h b/port/win/port_win.h index 5a8f660516..9ac8d045de 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -246,13 +246,8 @@ inline void cacheline_aligned_free(void *memblock) { extern const size_t kPageSize; -// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52991 for MINGW32 -// could not be worked around with by -mno-ms-bitfields -#ifndef __MINGW32__ -#define ALIGN_AS(n) __declspec(align(n)) -#else -#define ALIGN_AS(n) -#endif +// Part of C++11 +#define ALIGN_AS(n) alignas(n) static inline void AsmVolatilePause() { #if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 1a11096cf2..53b59e3d89 
100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -524,32 +524,32 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { // More complex test of shared key space, in case the instances are wrappers // for some shared underlying cache. - std::string sentinel_key(size_t{1}, '\0'); + CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime(); static char kRegularBlockCacheMarker = 'b'; static char kCompressedBlockCacheMarker = 'c'; static char kPersistentCacheMarker = 'p'; if (bbto.block_cache) { bbto.block_cache - ->Insert(Slice(sentinel_key), &kRegularBlockCacheMarker, 1, + ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1, GetNoopDeleterForRole()) .PermitUncheckedError(); } if (bbto.block_cache_compressed) { bbto.block_cache_compressed - ->Insert(Slice(sentinel_key), &kCompressedBlockCacheMarker, 1, + ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1, GetNoopDeleterForRole()) .PermitUncheckedError(); } if (bbto.persistent_cache) { // Note: persistent cache copies the data, not keeping the pointer bbto.persistent_cache - ->Insert(Slice(sentinel_key), &kPersistentCacheMarker, 1) + ->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1) .PermitUncheckedError(); } // If we get something different from what we inserted, that indicates // dangerously overlapping key spaces. if (bbto.block_cache) { - auto handle = bbto.block_cache->Lookup(Slice(sentinel_key)); + auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice()); if (handle) { auto v = static_cast(bbto.block_cache->Value(handle)); char c = *v; @@ -568,7 +568,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { } } if (bbto.block_cache_compressed) { - auto handle = bbto.block_cache_compressed->Lookup(Slice(sentinel_key)); + auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice()); if (handle) { auto v = static_cast(bbto.block_cache_compressed->Value(handle)); char c = *v; @@ -591,7 +591,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { if (bbto.persistent_cache) { std::unique_ptr data; size_t size = 0; - bbto.persistent_cache->Lookup(Slice(sentinel_key), &data, &size) + bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size) .PermitUncheckedError(); if (data && size > 0) { if (data[0] == kRegularBlockCacheMarker) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 40dcd6e1fe..b331cb4e52 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -21,7 +21,6 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_key.h" -#include "cache/sharded_cache.h" #include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h"
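The CheckCacheOptionCompatibility change above probes for dangerously shared key spaces by inserting a distinct role marker under one process-unique sentinel key into each cache and checking which marker a later lookup returns. A hedged, self-contained sketch of that idea, with std::map standing in for Cache and a plain string standing in for CacheKey::CreateUniqueForProcessLifetime():

```
// Toy sketch of the shared-key-space probe; not RocksDB code.
#include <iostream>
#include <map>
#include <string>

using ToyCache = std::map<std::string, char>;  // key -> role marker

static const char kRegularBlockCacheMarker = 'b';
static const char kCompressedBlockCacheMarker = 'c';

// Insert distinct markers under the same sentinel key, then check whether
// the regular cache now reports the compressed cache's marker.
bool DetectSharedKeySpace(ToyCache& block_cache, ToyCache& compressed_cache,
                          const std::string& sentinel_key) {
  block_cache[sentinel_key] = kRegularBlockCacheMarker;
  compressed_cache[sentinel_key] = kCompressedBlockCacheMarker;
  auto it = block_cache.find(sentinel_key);
  return it != block_cache.end() && it->second != kRegularBlockCacheMarker;
}

int main() {
  ToyCache a, b;
  std::cout << "separate caches share key space? "
            << DetectSharedKeySpace(a, b, "sentinel-0001") << "\n";  // 0
  ToyCache shared;
  std::cout << "aliased caches share key space? "
            << DetectSharedKeySpace(shared, shared, "sentinel-0001")
            << "\n";  // 1
  return 0;
}
```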