Use optimized folly DistributedMutex in LRUCache when available (#10179)

Summary: folly DistributedMutex is faster than standard mutexes though imposes some static obligations on usage. See https://github.com/facebook/folly/blob/main/folly/synchronization/DistributedMutex.h for details. Here we use this alternative for our Cache implementations (especially LRUCache) for better locking performance, when RocksDB is compiled with folly. Also added information about which distributed mutex implementation is being used to cache_bench output and to DB LOG. Intended follow-up: * Use DMutex in more places, perhaps improving API to support non-scoped locking * Fix linking with fbcode compiler (needs ROCKSDB_NO_FBCODE=1 currently) Credit: Thanks Siying for reminding me about this line of work that was previously left unfinished. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10179 Test Plan: for correctness, existing tests. CircleCI config updated. Also Meta-internal buck build updated. For performance, ran simultaneous before & after cache_bench. Out of three comparison runs, the middle improvement to ops/sec was +21%: Baseline: USE_CLANG=1 DEBUG_LEVEL=0 make -j24 cache_bench (fbcode compiler) ``` Complete in 20.201 s; Rough parallel ops/sec = 1584062 Thread ops/sec = 107176 Operation latency (ns): Count: 32000000 Average: 9257.9421 StdDev: 122412.04 Min: 134 Median: 3623.0493 Max: 56918500 Percentiles: P50: 3623.05 P75: 10288.02 P99: 30219.35 P99.9: 683522.04 P99.99: 7302791.63 ``` New: (add USE_FOLLY=1) ``` Complete in 16.674 s; Rough parallel ops/sec = 1919135 (+21%) Thread ops/sec = 135487 Operation latency (ns): Count: 32000000 Average: 7304.9294 StdDev: 108530.28 Min: 132 Median: 3777.6012 Max: 91030902 Percentiles: P50: 3777.60 P75: 10169.89 P99: 24504.51 P99.9: 59721.59 P99.99: 1861151.83 ``` Reviewed By: anand1976 Differential Revision: D37182983 Pulled By: pdillinger fbshipit-source-id: a17eb05f25b832b6a2c1356f5c657e831a5af8d1
2022-06-17 13:08:45 -07:00 · 2022-06-17 13:08:45 -07:00 · 1aac814578
parent f87adcfb3f
commit 1aac814578
17 changed files with 160 additions and 51 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -168,6 +168,16 @@ commands:
          command: |
            echo "export PKG_CONFIG_PATH=/usr/local/OFF/:~/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/" >> $BASH_ENV
            echo "export PROTOC_BIN=~/libprotobuf-mutator/build/external.protobuf/bin/protoc" >> $BASH_ENV
+  setup-folly:
+    steps:
+      - run:
+          name: Install folly dependencies
+          command: |
+            sudo apt-get install libgoogle-glog-dev
+      - run:
+          name: Checkout folly sources
+          command: |
+            make checkout_folly

  build-for-benchmarks:
    steps:
@ -442,7 +452,7 @@ jobs:
      - pre-steps
      - install-gflags
      - upgrade-cmake
-      - run: make checkout_folly
+      - setup-folly
      - run: (mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 .. && make V=1 -j20 && ctest -j20)
      - post-steps

@ -477,7 +487,7 @@ jobs:
    steps:
      - pre-steps
      - run: sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get update -y && sudo apt-get install gcc-7 g++-7 libgflags-dev
-      - run: make checkout_folly
+      - setup-folly
      - run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check
      - post-steps

@ -532,7 +542,7 @@ jobs:
      - pre-steps
      - install-clang-13
      - install-gflags
-      - run: make checkout_folly
+      - setup-folly
      - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
      - post-steps

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -587,6 +587,7 @@ include_directories(${PROJECT_SOURCE_DIR}/include)
 if(USE_FOLLY)
  include_directories(${PROJECT_SOURCE_DIR}/third-party/folly)
  add_definitions(-DUSE_FOLLY -DFOLLY_NO_CONFIG)
+  list(APPEND THIRDPARTY_LIBS glog)
 endif()
 find_package(Threads REQUIRED)

@ -975,9 +976,13 @@ endif()
 if(USE_FOLLY)
  list(APPEND SOURCES
    third-party/folly/folly/container/detail/F14Table.cpp
+    third-party/folly/folly/detail/Futex.cpp
    third-party/folly/folly/lang/SafeAssert.cpp
    third-party/folly/folly/lang/ToAscii.cpp
-    third-party/folly/folly/ScopeGuard.cpp)
+    third-party/folly/folly/ScopeGuard.cpp
+    third-party/folly/folly/synchronization/AtomicNotification.cpp
+    third-party/folly/folly/synchronization/DistributedMutex.cpp
+    third-party/folly/folly/synchronization/ParkingLot.cpp)
 endif()

 set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX})
--- a/HISTORY.md
+++ b/HISTORY.md
@ -42,6 +42,9 @@
 * Removed support for reading Bloom filters using obsolete block-based filter format. (Support for writing such filters was dropped in 7.0.) For good read performance on old DBs using these filters, a full compaction is required.
 * Per KV checksum in write batch is verified before a write batch is written to WAL to detect any corruption to the write batch (#10114).

+### Performance Improvements
+* When compiled with folly (Meta-internal integration; experimental in open source build), improve the locking performance (CPU efficiency) of LRUCache by using folly DistributedMutex in place of standard mutex.
+
 ## 7.3.0 (05/20/2022)
 ### Bug Fixes
 * Fixed a bug where manual flush would block forever even though flush options had wait=false.
--- a/8
+++ b/8
@ -461,6 +461,8 @@ ifeq ($(USE_FOLLY),1)
 	endif
 	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
 	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+# TODO: fix linking with fbcode compiler config
+	PLATFORM_LDFLAGS += -lglog
 endif

 ifdef TEST_CACHE_LINE_SIZE
@ -2354,10 +2356,14 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard 98b9b2c1124e99f50f9085ddee74ce32afffc665
+	cd third-party/folly && git reset --hard beacd86d63cd71c904632262e6c36f60874d78ba
 	@# A hack to remove boost dependency.
 	@# NOTE: this hack is not needed if using FBCODE compiler config
 	perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
+	@# NOTE: this hack is required for clang in some cases
+	perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
+	@# NOTE: this hack is required for gcc in some cases
+	perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h

 # ---------------------------------------------------------------------------
 #   Build size testing
--- a/2
+++ b/2
@ -334,6 +334,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "//folly/experimental/coro:collect",
        "//folly/experimental/coro:coroutine",
        "//folly/experimental/coro:task",
+        "//folly/synchronization:distributed_mutex",
    ], headers=None, link_whole=False, extra_test_libs=False)

 cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
@ -662,6 +663,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
        "//folly/experimental/coro:collect",
        "//folly/experimental/coro:coroutine",
        "//folly/experimental/coro:task",
+        "//folly/synchronization:distributed_mutex",
    ], headers=None, link_whole=True, extra_test_libs=False)

 cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@ -150,7 +150,8 @@ def generate_targets(repo_path, deps_map):
            "//folly/experimental/coro:blocking_wait",
            "//folly/experimental/coro:collect",
            "//folly/experimental/coro:coroutine",
-            "//folly/experimental/coro:task"])
+            "//folly/experimental/coro:task",
+            "//folly/synchronization:distributed_mutex"])
    # rocksdb_whole_archive_lib
    TARGETS.add_library(
        "rocksdb_whole_archive_lib",
@ -163,7 +164,8 @@ def generate_targets(repo_path, deps_map):
            "//folly/experimental/coro:blocking_wait",
            "//folly/experimental/coro:collect",
            "//folly/experimental/coro:coroutine",
-            "//folly/experimental/coro:task"],
+            "//folly/experimental/coro:task",
+            "//folly/synchronization:distributed_mutex"],
        headers=None,
        extra_external_deps="",
        link_whole=True)
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@ -26,6 +26,7 @@
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/cachable_entry.h"
 #include "util/coding.h"
+#include "util/distributed_mutex.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"
@ -587,7 +588,15 @@ class CacheBench {
  }

  void PrintEnv() const {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    printf(
+        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+    printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("DMutex impl name    : %s\n", DMutex::kName());
    printf("Number of threads   : %u\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
    printf("Cache size          : %s\n",
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@ -39,7 +39,7 @@ std::shared_ptr<Cache> NewClockCache(
 #include "port/port.h"
 #include "tbb/concurrent_hash_map.h"
 #include "util/autovector.h"
-#include "util/mutexlock.h"
+#include "util/distributed_mutex.h"

 namespace ROCKSDB_NAMESPACE {

@ -368,7 +368,7 @@ class ClockCacheShard final : public CacheShard {

  // Guards list_, head_, and recycle_. In addition, updating table_ also has
  // to hold the mutex, to avoid the cache being in inconsistent state.
-  mutable port::Mutex mutex_;
+  mutable DMutex mutex_;

  // The circular list of cache handles. Initially the list is empty. Once a
  // handle is needed by insertion, and no more handles are available in
@ -431,7 +431,7 @@ void ClockCacheShard::ApplyToSomeEntries(
                             DeleterFn deleter)>& callback,
    uint32_t average_entries_per_lock, uint32_t* state) {
  assert(average_entries_per_lock > 0);
-  MutexLock lock(&mutex_);
+  DMutexLock l(mutex_);

  // Figure out the range to iterate, update `state`
  size_t list_size = list_.size();
@ -532,7 +532,7 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
    pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    // Cleanup if it is the last reference.
    if (!InCache(flags)) {
-      MutexLock l(&mutex_);
+      DMutexLock l(mutex_);
      RecycleHandle(handle, context);
    }
  }
@ -598,7 +598,7 @@ bool ClockCacheShard::EvictFromCache(size_t charge, CleanupContext* context) {
 void ClockCacheShard::SetCapacity(size_t capacity) {
  CleanupContext context;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    capacity_.store(capacity, std::memory_order_relaxed);
    EvictFromCache(0, &context);
  }
@ -618,7 +618,7 @@ CacheHandle* ClockCacheShard::Insert(
  uint32_t meta_charge =
      CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_);
  size_t total_charge = charge + meta_charge;
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  bool success = EvictFromCache(total_charge, context);
  bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
  if (!success && (strict || !hold_reference)) {
@ -744,7 +744,7 @@ void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {

 bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash,
                                      CleanupContext* context) {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  HashTable::accessor accessor;
  bool erased = false;
  if (table_.find(accessor, ClockCacheKey(key, hash))) {
@ -758,7 +758,7 @@ bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash,
 void ClockCacheShard::EraseUnRefEntries() {
  CleanupContext context;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    table_.clear();
    for (auto& handle : list_) {
      UnsetInCache(&handle, &context);
--- a/cache/fast_lru_cache.cc
+++ b/cache/fast_lru_cache.cc
@ -16,7 +16,7 @@
 #include "monitoring/perf_context_imp.h"
 #include "monitoring/statistics.h"
 #include "port/lang.h"
-#include "util/mutexlock.h"
+#include "util/distributed_mutex.h"

 #define KEY_LENGTH \
  16  // TODO(guido) Make use of this symbol in other parts of the source code
@ -93,7 +93,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, size_t estimated_value_size,
 void LRUCacheShard::EraseUnRefEntries() {
  autovector<LRUHandle*> last_reference_list;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    while (lru_.next != &lru_) {
      LRUHandle* old = lru_.next;
      // LRU list contains only elements which can be evicted.
@ -120,7 +120,7 @@ void LRUCacheShard::ApplyToSomeEntries(
  // The state is essentially going to be the starting hash, which works
  // nicely even if we resize between calls because we use upper-most
  // hash bits for table indexes.
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  uint32_t length_bits = table_.GetLengthBits();
  uint32_t length = uint32_t{1} << length_bits;

@ -208,7 +208,7 @@ int LRUCacheShard::GetHashBits(
 void LRUCacheShard::SetCapacity(size_t capacity) {
  autovector<LRUHandle*> last_reference_list;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    capacity_ = capacity;
    EvictFromLRU(0, &last_reference_list);
  }
@ -220,7 +220,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
 }

 void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  strict_capacity_limit_ = strict_capacity_limit;
 }

@ -229,7 +229,7 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
  Status s = Status::OK();
  autovector<LRUHandle*> last_reference_list;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);

    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty.
@ -289,7 +289,7 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
 Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
  LRUHandle* e = nullptr;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    e = table_.Lookup(key, hash);
    if (e != nullptr) {
      assert(e->InCache());
@ -305,7 +305,7 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {

 bool LRUCacheShard::Ref(Cache::Handle* h) {
  LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  // To create another reference - entry must be already externally referenced.
  assert(e->HasRefs());
  e->Ref();
@ -319,7 +319,7 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
  bool last_reference = false;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    last_reference = e->Unref();
    if (last_reference && e->InCache()) {
      // The item is still in cache, and nobody else holds a reference to it.
@ -382,7 +382,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
  LRUHandle* e;
  bool last_reference = false;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    e = table_.Remove(key, hash);
    if (e != nullptr) {
      assert(e->InCache());
@ -405,12 +405,12 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
 }

 size_t LRUCacheShard::GetUsage() const {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  return usage_;
 }

 size_t LRUCacheShard::GetPinnedUsage() const {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  assert(usage_ >= lru_usage_);
  return usage_ - lru_usage_;
 }
--- a/cache/fast_lru_cache.h
+++ b/cache/fast_lru_cache.h
@ -17,6 +17,7 @@
 #include "port/port.h"
 #include "rocksdb/secondary_cache.h"
 #include "util/autovector.h"
+#include "util/distributed_mutex.h"

 namespace ROCKSDB_NAMESPACE {
 namespace fast_lru_cache {
@ -273,7 +274,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
-  mutable port::Mutex mutex_;
+  mutable DMutex mutex_;
 };

 class LRUCache
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@ -16,7 +16,7 @@
 #include "monitoring/perf_context_imp.h"
 #include "monitoring/statistics.h"
 #include "port/lang.h"
-#include "util/mutexlock.h"
+#include "util/distributed_mutex.h"

 namespace ROCKSDB_NAMESPACE {
 namespace lru_cache {
@ -135,7 +135,7 @@ LRUCacheShard::LRUCacheShard(
 void LRUCacheShard::EraseUnRefEntries() {
  autovector<LRUHandle*> last_reference_list;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    while (lru_.next != &lru_) {
      LRUHandle* old = lru_.next;
      // LRU list contains only elements which can be evicted.
@ -161,7 +161,7 @@ void LRUCacheShard::ApplyToSomeEntries(
  // The state is essentially going to be the starting hash, which works
  // nicely even if we resize between calls because we use upper-most
  // hash bits for table indexes.
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  uint32_t length_bits = table_.GetLengthBits();
  uint32_t length = uint32_t{1} << length_bits;

@ -193,13 +193,13 @@ void LRUCacheShard::ApplyToSomeEntries(
 }

 void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  *lru = &lru_;
  *lru_low_pri = lru_low_pri_;
 }

 size_t LRUCacheShard::TEST_GetLRUSize() {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  LRUHandle* lru_handle = lru_.next;
  size_t lru_size = 0;
  while (lru_handle != &lru_) {
@ -210,7 +210,7 @@ size_t LRUCacheShard::TEST_GetLRUSize() {
 }

 double LRUCacheShard::GetHighPriPoolRatio() {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  return high_pri_pool_ratio_;
 }

@ -285,7 +285,7 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
 void LRUCacheShard::SetCapacity(size_t capacity) {
  autovector<LRUHandle*> last_reference_list;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    capacity_ = capacity;
    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
    EvictFromLRU(0, &last_reference_list);
@ -304,7 +304,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
 }

 void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  strict_capacity_limit_ = strict_capacity_limit;
 }

@ -314,7 +314,7 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle,
  autovector<LRUHandle*> last_reference_list;

  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);

    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty.
@ -402,7 +402,7 @@ void LRUCacheShard::Promote(LRUHandle* e) {
  } else {
    // Since the secondary cache lookup failed, mark the item as not in cache
    // Don't charge the cache as its only metadata that'll shortly be released
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    // TODO
    e->CalcTotalCharge(0, metadata_charge_policy_);
    e->SetInCache(false);
@ -416,7 +416,7 @@ Cache::Handle* LRUCacheShard::Lookup(
    bool wait, Statistics* stats) {
  LRUHandle* e = nullptr;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    e = table_.Lookup(key, hash);
    if (e != nullptr) {
      assert(e->InCache());
@ -489,7 +489,7 @@ Cache::Handle* LRUCacheShard::Lookup(

 bool LRUCacheShard::Ref(Cache::Handle* h) {
  LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  // To create another reference - entry must be already externally referenced.
  assert(e->HasRefs());
  e->Ref();
@ -497,7 +497,7 @@ bool LRUCacheShard::Ref(Cache::Handle* h) {
 }

 void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  high_pri_pool_ratio_ = high_pri_pool_ratio;
  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
  MaintainPoolSize();
@ -510,7 +510,7 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
  bool last_reference = false;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    last_reference = e->Unref();
    if (last_reference && e->InCache()) {
      // The item is still in cache, and nobody else holds a reference to it.
@ -582,7 +582,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
  LRUHandle* e;
  bool last_reference = false;
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    e = table_.Remove(key, hash);
    if (e != nullptr) {
      assert(e->InCache());
@ -606,7 +606,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {

 bool LRUCacheShard::IsReady(Cache::Handle* handle) {
  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  bool ready = true;
  if (e->IsPending()) {
    assert(secondary_cache_);
@ -617,12 +617,12 @@ bool LRUCacheShard::IsReady(Cache::Handle* handle) {
 }

 size_t LRUCacheShard::GetUsage() const {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  return usage_;
 }

 size_t LRUCacheShard::GetPinnedUsage() const {
-  MutexLock l(&mutex_);
+  DMutexLock l(mutex_);
  assert(usage_ >= lru_usage_);
  return usage_ - lru_usage_;
 }
@ -631,7 +631,7 @@ std::string LRUCacheShard::GetPrintableOptions() const {
  const int kBufferSize = 200;
  char buffer[kBufferSize];
  {
-    MutexLock l(&mutex_);
+    DMutexLock l(mutex_);
    snprintf(buffer, kBufferSize, "    high_pri_pool_ratio: %.3lf\n",
             high_pri_pool_ratio_);
  }
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@ -17,6 +17,7 @@
 #include "port/port.h"
 #include "rocksdb/secondary_cache.h"
 #include "util/autovector.h"
+#include "util/distributed_mutex.h"

 namespace ROCKSDB_NAMESPACE {
 namespace lru_cache {
@ -453,7 +454,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
-  mutable port::Mutex mutex_;
+  mutable DMutex mutex_;

  std::shared_ptr<SecondaryCache> secondary_cache_;
 };
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@ -101,6 +101,7 @@
 #include "util/compression.h"
 #include "util/crc32c.h"
 #include "util/defer.h"
+#include "util/distributed_mutex.h"
 #include "util/hash_containers.h"
 #include "util/mutexlock.h"
 #include "util/stop_watch.h"
@ -145,6 +146,8 @@ void DumpSupportInfo(Logger* logger) {
  }
  ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
                   crc32c::IsFastCrc32Supported().c_str());
+
+  ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
 }
 }  // namespace

--- a/port/port_posix.h
+++ b/port/port_posix.h
@ -95,6 +95,8 @@ class CondVar;

 class Mutex {
 public:
+  static const char* kName() { return "pthread_mutex_t"; }
+
  explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex);
  // No copying
  Mutex(const Mutex&) = delete;
@ -111,6 +113,11 @@ class Mutex {
  // it does NOT verify that mutex is held by a calling thread
  void AssertHeld();

+  // Also implement std Lockable
+  inline void lock() { Lock(); }
+  inline void unlock() { Unlock(); }
+  inline bool try_lock() { return TryLock(); }
+
 private:
  friend class CondVar;
  pthread_mutex_t mu_;
--- a/port/win/port_win.h
+++ b/port/win/port_win.h
@ -79,12 +79,15 @@ class CondVar;

 class Mutex {
 public:
+  static const char* kName() { return "std::mutex"; }

-   /* implicit */ Mutex(bool adaptive = kDefaultToAdaptiveMutex)
+  explicit Mutex(bool IGNORED_adaptive = kDefaultToAdaptiveMutex)
 #ifndef NDEBUG
-     : locked_(false)
+      : locked_(false)
 #endif
-   { }
+  {
+    (void)IGNORED_adaptive;
+  }

  ~Mutex();

@ -120,6 +123,11 @@ class Mutex {
 #endif
  }

+  // Also implement std Lockable
+  inline void lock() { Lock(); }
+  inline void unlock() { Unlock(); }
+  inline bool try_lock() { return TryLock(); }
+
  // Mutex is move only with lock ownership transfer
  Mutex(const Mutex&) = delete;
  void operator=(const Mutex&) = delete;
--- a/src.mk
+++ b/src.mk
@ -374,9 +374,13 @@ TEST_LIB_SOURCES =                                              \

 FOLLY_SOURCES =                                                 \
  $(FOLLY_DIR)/folly/container/detail/F14Table.cpp              \
+  $(FOLLY_DIR)/folly/detail/Futex.cpp                           \
  $(FOLLY_DIR)/folly/lang/SafeAssert.cpp                        \
  $(FOLLY_DIR)/folly/lang/ToAscii.cpp                           \
  $(FOLLY_DIR)/folly/ScopeGuard.cpp                             \
+  $(FOLLY_DIR)/folly/synchronization/AtomicNotification.cpp     \
+  $(FOLLY_DIR)/folly/synchronization/DistributedMutex.cpp       \
+  $(FOLLY_DIR)/folly/synchronization/ParkingLot.cpp             \

 TOOLS_MAIN_SOURCES =                                                    \
  db_stress_tool/db_stress.cc                                           \
--- a/util/distributed_mutex.h
+++ b/util/distributed_mutex.h
@ -0,0 +1,48 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// This file declares a wrapper around the efficient folly DistributedMutex
+// that falls back on a standard mutex when not available. See
+// https://github.com/facebook/folly/blob/main/folly/synchronization/DistributedMutex.h
+// for benefits and limitations.
+
+// At the moment, only scoped locking is supported using DMutexLock
+// RAII wrapper, because lock/unlock APIs will vary.
+
+#ifdef USE_FOLLY
+
+#include <folly/synchronization/DistributedMutex.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DMutex : public folly::DistributedMutex {
+ public:
+  static const char* kName() { return "folly::DistributedMutex"; }
+
+  explicit DMutex(bool IGNORED_adaptive = false) { (void)IGNORED_adaptive; }
+
+  // currently no-op
+  void AssertHeld() {}
+};
+using DMutexLock = std::lock_guard<folly::DistributedMutex>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using DMutex = port::Mutex;
+using DMutexLock = std::lock_guard<DMutex>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif