diff --git a/.gitignore b/.gitignore
index 6364dfdc40..180fb4c500 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ ldb
 manifest_dump
 sst_dump
 blob_dump
+block_cache_trace_analyzer
 column_aware_encoding_exp
 util/build_version.cc
 build_tools/VALGRIND_LOGS/
diff --git a/cache/cache_test.cc b/cache/cache_test.cc
index 46ce78db68..b728c67c7d 100644
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@@ -562,6 +562,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) {
     ASSERT_OK(s);
     ASSERT_NE(nullptr, handles[i]);
   }
+  ASSERT_EQ(10, cache->GetUsage());
 
   // test2: set the flag to true. Insert and check if it fails.
   std::string extra_key = "extra";
@@ -571,6 +572,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) {
   s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
   ASSERT_TRUE(s.IsIncomplete());
   ASSERT_EQ(nullptr, handle);
+  ASSERT_EQ(10, cache->GetUsage());
 
   for (size_t i = 0; i < 10; i++) {
     cache->Release(handles[i]);
@@ -591,7 +593,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) {
   s = cache2->Insert(extra_key, extra_value, 1, &deleter);
   // AS if the key have been inserted into cache but get evicted immediately.
   ASSERT_OK(s);
-  ASSERT_EQ(5, cache->GetUsage());
+  ASSERT_EQ(5, cache2->GetUsage());
   ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
 
   for (size_t i = 0; i < 5; i++) {
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index 676bed3051..7c04cb909d 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -24,7 +24,7 @@ LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) {
 
 LRUHandleTable::~LRUHandleTable() {
   ApplyToAllCacheEntries([](LRUHandle* h) {
-    if (h->refs == 1) {
+    if (!h->HasRefs()) {
       h->Free();
     }
   });
@@ -113,29 +113,17 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
   SetCapacity(capacity);
 }
 
-LRUCacheShard::~LRUCacheShard() {}
-
-bool LRUCacheShard::Unref(LRUHandle* e) {
-  assert(e->refs > 0);
-  e->refs--;
-  return e->refs == 0;
-}
-
-// Call deleter and free
-
 void LRUCacheShard::EraseUnRefEntries() {
   autovector<LRUHandle*> last_reference_list;
   {
     MutexLock l(&mutex_);
     while (lru_.next != &lru_) {
       LRUHandle* old = lru_.next;
-      assert(old->InCache());
-      assert(old->refs ==
-             1);  // LRU list contains elements which may be evicted
+      // LRU list contains only elements which can be evicted
+      assert(old->InCache() && !old->HasRefs());
       LRU_Remove(old);
       table_.Remove(old->key(), old->hash);
       old->SetInCache(false);
-      Unref(old);
       usage_ -= old->charge;
       last_reference_list.push_back(old);
     }
@@ -148,22 +136,27 @@ void LRUCacheShard::EraseUnRefEntries() {
 
 void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                            bool thread_safe) {
+  const auto applyCallback = [&]() {
+    table_.ApplyToAllCacheEntries(
+        [callback](LRUHandle* h) { callback(h->value, h->charge); });
+  };
+
   if (thread_safe) {
-    mutex_.Lock();
-  }
-  table_.ApplyToAllCacheEntries(
-      [callback](LRUHandle* h) { callback(h->value, h->charge); });
-  if (thread_safe) {
-    mutex_.Unlock();
+    MutexLock l(&mutex_);
+    applyCallback();
+  } else {
+    applyCallback();
   }
 }
 
 void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) {
+  MutexLock l(&mutex_);
   *lru = &lru_;
   *lru_low_pri = lru_low_pri_;
 }
 
 size_t LRUCacheShard::TEST_GetLRUSize() {
+  MutexLock l(&mutex_);
   LRUHandle* lru_handle = lru_.next;
   size_t lru_size = 0;
   while (lru_handle != &lru_) {
@@ -231,14 +224,13 @@ void LRUCacheShard::MaintainPoolSize() {
 
 void LRUCacheShard::EvictFromLRU(size_t charge,
                                  autovector<LRUHandle*>* deleted) {
-  while (usage_ + charge > capacity_ && lru_.next != &lru_) {
+  while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
     LRUHandle* old = lru_.next;
-    assert(old->InCache());
-    assert(old->refs == 1);  // LRU list contains elements which may be evicted
+    // LRU list contains only elements which can be evicted
+    assert(old->InCache() && !old->HasRefs());
     LRU_Remove(old);
     table_.Remove(old->key(), old->hash);
     old->SetInCache(false);
-    Unref(old);
     usage_ -= old->charge;
     deleted->push_back(old);
   }
@@ -252,8 +244,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
     high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
     EvictFromLRU(0, &last_reference_list);
   }
-  // we free the entries here outside of mutex for
-  // performance reasons
+
+  // Free the entries outside of mutex for performance reasons
   for (auto entry : last_reference_list) {
     entry->Free();
   }
@@ -269,22 +261,22 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
   LRUHandle* e = table_.Lookup(key, hash);
   if (e != nullptr) {
     assert(e->InCache());
-    if (e->refs == 1) {
+    if (!e->HasRefs()) {
+      // The entry is in LRU since it's in hash and has no external references
       LRU_Remove(e);
     }
-    e->refs++;
+    e->Ref();
     e->SetHit();
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }
 
 bool LRUCacheShard::Ref(Cache::Handle* h) {
-  LRUHandle* handle = reinterpret_cast<LRUHandle*>(h);
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
   MutexLock l(&mutex_);
-  if (handle->InCache() && handle->refs == 1) {
-    LRU_Remove(handle);
-  }
-  handle->refs++;
+  // To create another reference - entry must be already externally referenced
+  assert(e->HasRefs());
+  e->Ref();
   return true;
 }
 
@@ -303,30 +295,27 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) {
   bool last_reference = false;
   {
     MutexLock l(&mutex_);
-    last_reference = Unref(e);
+    last_reference = e->Unref();
+    if (last_reference && e->InCache()) {
+      // The item is still in cache, and nobody else holds a reference to it
+      if (usage_ > capacity_ || force_erase) {
+        // The LRU list must be empty since the cache is full
+        assert(lru_.next == &lru_ || force_erase);
+        // Take this opportunity and remove the item
+        table_.Remove(e->key(), e->hash);
+        e->SetInCache(false);
+      } else {
+        // Put the item back on the LRU list, and don't free it
+        LRU_Insert(e);
+        last_reference = false;
+      }
+    }
     if (last_reference) {
       usage_ -= e->charge;
     }
-    if (e->refs == 1 && e->InCache()) {
-      // The item is still in cache, and nobody else holds a reference to it
-      if (usage_ > capacity_ || force_erase) {
-        // the cache is full
-        // The LRU list must be empty since the cache is full
-        assert(!(usage_ > capacity_) || lru_.next == &lru_);
-        // take this opportunity and remove the item
-        table_.Remove(e->key(), e->hash);
-        e->SetInCache(false);
-        Unref(e);
-        usage_ -= e->charge;
-        last_reference = true;
-      } else {
-        // put the item on the list to be potentially freed
-        LRU_Insert(e);
-      }
-    }
   }
 
-  // free outside of mutex
+  // Free the entry here outside of mutex for performance reasons
   if (last_reference) {
     e->Free();
   }
@@ -342,7 +331,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
   // It shouldn't happen very often though.
   LRUHandle* e = reinterpret_cast<LRUHandle*>(
       new char[sizeof(LRUHandle) - 1 + key.size()]);
-  Status s;
+  Status s = Status::OK();
   autovector<LRUHandle*> last_reference_list;
 
   e->value = value;
@@ -351,9 +340,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
   e->key_length = key.size();
   e->flags = 0;
   e->hash = hash;
-  e->refs = (handle == nullptr
-                 ? 1
-                 : 2);  // One from LRUCache, one for the returned handle
+  e->refs = 0;
   e->next = e->prev = nullptr;
   e->SetInCache(true);
   e->SetPriority(priority);
@@ -366,11 +353,12 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
     // is freed or the lru list is empty
     EvictFromLRU(charge, &last_reference_list);
 
-    if (usage_ - lru_usage_ + charge > capacity_ &&
+    if ((usage_ + charge) > capacity_ &&
         (strict_capacity_limit_ || handle == nullptr)) {
       if (handle == nullptr) {
         // Don't insert the entry but still return ok, as if the entry inserted
         // into cache and get evicted immediately.
+        e->SetInCache(false);
         last_reference_list.push_back(e);
       } else {
         delete[] reinterpret_cast<char*>(e);
@@ -378,32 +366,30 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
         s = Status::Incomplete("Insert failed due to LRU cache being full.");
       }
     } else {
-      // insert into the cache
-      // note that the cache might get larger than its capacity if not enough
-      // space was freed
+      // Insert into the cache. Note that the cache might get larger than its
+      // capacity if not enough space was freed up.
       LRUHandle* old = table_.Insert(e);
       usage_ += e->charge;
       if (old != nullptr) {
+        assert(old->InCache());
         old->SetInCache(false);
-        if (Unref(old)) {
-          usage_ -= old->charge;
-          // old is on LRU because it's in cache and its reference count
-          // was just 1 (Unref returned 0)
+        if (!old->HasRefs()) {
+          // old is on LRU because it's in cache and its reference count is 0
           LRU_Remove(old);
+          usage_ -= old->charge;
           last_reference_list.push_back(old);
         }
       }
       if (handle == nullptr) {
         LRU_Insert(e);
       } else {
+        e->Ref();
         *handle = reinterpret_cast<Cache::Handle*>(e);
       }
-      s = Status::OK();
     }
   }
 
-  // we free the entries here outside of mutex for
-  // performance reasons
+  // Free the entries here outside of mutex for performance reasons
   for (auto entry : last_reference_list) {
     entry->Free();
   }
@@ -418,18 +404,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
     MutexLock l(&mutex_);
     e = table_.Remove(key, hash);
     if (e != nullptr) {
-      last_reference = Unref(e);
-      if (last_reference) {
-        usage_ -= e->charge;
-      }
-      if (last_reference && e->InCache()) {
-        LRU_Remove(e);
-      }
+      assert(e->InCache());
       e->SetInCache(false);
+      if (!e->HasRefs()) {
+        // The entry is in LRU since it's in hash and has no external references
+        LRU_Remove(e);
+        usage_ -= e->charge;
+        last_reference = true;
+      }
     }
   }
 
-  // mutex not held here
+  // Free the entry here outside of mutex for performance reasons
   // last_reference will only be true if e != nullptr
   if (last_reference) {
     e->Free();
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 0d9a317486..1ff765d159 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -17,31 +17,34 @@
 
 namespace rocksdb {
 
-// LRU cache implementation
+// LRU cache implementation. This class is not thread-safe.
 
 // An entry is a variable length heap-allocated structure.
 // Entries are referenced by cache and/or by any external entity.
-// The cache keeps all its entries in table. Some elements
+// The cache keeps all its entries in a hash table. Some elements
 // are also stored on LRU list.
 //
 // LRUHandle can be in these states:
 // 1. Referenced externally AND in hash table.
-//  In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
-// 2. Not referenced externally and in hash table. In that case the entry is
-// in the LRU and can be freed. (refs == 1 && in_cache == true)
-// 3. Referenced externally and not in hash table. In that case the entry is
-// in not on LRU and not in table. (refs >= 1 && in_cache == false)
+//    In that case the entry is *not* in the LRU list
+//    (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+//    In that case the entry is in the LRU list and can be freed.
+//    (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+//    In that case the entry is not in the LRU list and not in hash table.
+//    The entry can be freed when refs becomes 0.
+//    (refs >= 1 && in_cache == false)
 //
 // All newly created LRUHandles are in state 1. If you call
-// LRUCacheShard::Release
-// on entry in state 1, it will go into state 2. To move from state 1 to
-// state 3, either call LRUCacheShard::Erase or LRUCacheShard::Insert with the
-// same key.
+// LRUCacheShard::Release on entry in state 1, it will go into state 2.
+// To move from state 1 to state 3, either call LRUCacheShard::Erase or
+// LRUCacheShard::Insert with the same key (but possibly different value).
 // To move from state 2 to state 1, use LRUCacheShard::Lookup.
 // Before destruction, make sure that no handles are in state 1. This means
 // that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a
-// matching
-// RUCache::Release (to move into state 2) or LRUCacheShard::Erase (for state 3)
+// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase
+// (to move into state 3).
 
 struct LRUHandle {
   void* value;
@@ -51,37 +54,42 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;  // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  uint32_t refs;     // a number of refs to this entry
-                     // cache itself is counted as 1
+  // The hash of key(). Used for fast sharding and comparisons.
+  uint32_t hash;
+  // The number of external refs to this entry. The cache itself is not counted.
+  uint32_t refs;
 
-  // Include the following flags:
-  //   IN_CACHE:         whether this entry is referenced by the hash table.
-  //   IS_HIGH_PRI:      whether this entry is high priority entry.
-  //   IN_HIGH_PRI_POOL: whether this entry is in high-pri pool.
-  //   HAS_HIT:          whether this entry has had any lookups (hits).
   enum Flags : uint8_t {
+    // Whether this entry is referenced by the hash table.
     IN_CACHE = (1 << 0),
+    // Whether this entry is high priority entry.
     IS_HIGH_PRI = (1 << 1),
+    // Whether this entry is in high-pri pool.
     IN_HIGH_PRI_POOL = (1 << 2),
+    // Wwhether this entry has had any lookups (hits).
     HAS_HIT = (1 << 3),
   };
 
   uint8_t flags;
 
-  uint32_t hash;     // Hash of key(); used for fast sharding and comparisons
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
 
-  char key_data[1];  // Beginning of key
+  Slice key() const { return Slice(key_data, key_length); }
 
-  Slice key() const {
-    // For cheaper lookups, we allow a temporary Handle object
-    // to store a pointer to a key in "value".
-    if (next == this) {
-      return *(reinterpret_cast<Slice*>(value));
-    } else {
-      return Slice(key_data, key_length);
-    }
+  // Increase the reference count by 1.
+  void Ref() { refs++; }
+
+  // Just reduce the reference count by 1. Return true if it was last reference.
+  bool Unref() {
+    assert(refs > 0);
+    refs--;
+    return refs == 0;
   }
 
+  // Return true if there are external refs, false otherwise.
+  bool HasRefs() const { return refs > 0; }
+
   bool InCache() const { return flags & IN_CACHE; }
   bool IsHighPri() const { return flags & IS_HIGH_PRI; }
   bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; }
@@ -114,7 +122,7 @@ struct LRUHandle {
   void SetHit() { flags |= HAS_HIT; }
 
   void Free() {
-    assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
+    assert(refs == 0);
     if (deleter) {
       (*deleter)(key(), value);
     }
@@ -169,7 +177,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  public:
   LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                 double high_pri_pool_ratio, bool use_adaptive_mutex);
-  virtual ~LRUCacheShard();
+  virtual ~LRUCacheShard() override = default;
 
   // Separate from constructor so caller can easily make an array of LRUCache
   // if current usage is more than new capacity, the function will attempt to
@@ -225,10 +233,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
   // high-pri pool is no larger than the size specify by high_pri_pool_pct.
   void MaintainPoolSize();
 
-  // Just reduce the reference count by 1.
-  // Return true if last reference
-  bool Unref(LRUHandle* e);
-
   // Free some space following strict LRU policy until enough space
   // to hold (usage_ + charge) is freed or the lru list is empty
   // This function is not thread safe - it needs to be executed while