LRUCache midpoint insertion

Summary:
Implement the midpoint insertion strategy, where new blocks are inserted at the middle of the LRU list (the head of the low-pri pool) and are only moved to the head of the list on their first cache hit.
Closes https://github.com/facebook/rocksdb/pull/3877

Differential Revision: D8100895

Pulled By: yiwu-arbug

fbshipit-source-id: f4bd83cb8be469e5d02072cfc8bd66011391f3da
Authored by Yi Wu on 2018-05-24 15:45:49 -07:00; committed by Facebook Github Bot.
parent 3db8504cde
commit bc7e8d472e
7 changed files with 107 additions and 36 deletions
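
For intuition, here is a minimal, self-contained sketch of the strategy in isolation (not RocksDB's internal implementation; the MidpointLRU name and the fixed halfway midpoint are illustrative — in RocksDB the midpoint is the low-pri/high-pri pool boundary set by high_pri_pool_ratio):

#include <cstddef>
#include <iterator>
#include <list>
#include <string>
#include <unordered_map>

// Toy midpoint-insertion LRU. Front of the list is the LRU (eviction) end,
// back is the MRU end. New entries enter at the midpoint; an entry is only
// promoted to the MRU end once it is actually looked up.
class MidpointLRU {
 public:
  explicit MidpointLRU(size_t capacity) : capacity_(capacity) {}

  void Insert(const std::string& key) {  // assumes distinct keys, for brevity
    if (list_.size() == capacity_) {
      map_.erase(list_.front());  // evict the coldest entry
      list_.pop_front();
    }
    // Insert at the midpoint instead of the MRU end, so an entry that is
    // never hit ages out after crossing only half of the list.
    auto pos = list_.begin();
    std::advance(pos, list_.size() / 2);
    map_[key] = list_.insert(pos, key);
  }

  bool Lookup(const std::string& key) {
    auto it = map_.find(key);
    if (it == map_.end()) return false;
    // A hit moves the entry to the MRU end of the full list.
    list_.splice(list_.end(), list_, it->second);
    return true;
  }

 private:
  size_t capacity_;
  std::list<std::string> list_;
  std::unordered_map<std::string, std::list<std::string>::iterator> map_;
};

The effect is that a large scan that touches each block exactly once can only churn the low-pri half of the list, leaving hot, previously hit entries intact.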

HISTORY.md

@@ -2,6 +2,7 @@
 ## Unreleased
 ### Public API Change
 * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
+* With LRUCache, when high_pri_pool_ratio > 0, the midpoint insertion strategy is enabled: low-pri items are inserted at the tail of the low-pri list (the midpoint) when they first enter the cache. This makes cache entries that never get hit age out faster, improving cache efficiency in the presence of large background scans.
 ## 5.14.0 (5/16/2018)
 ### Public API Change

cache/lru_cache.cc

@@ -199,7 +199,7 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) {
 void LRUCacheShard::LRU_Insert(LRUHandle* e) {
   assert(e->next == nullptr);
   assert(e->prev == nullptr);
-  if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) {
+  if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
     // Insert "e" to head of LRU list.
     e->next = &lru_;
     e->prev = lru_.prev;
@@ -246,18 +246,6 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
   }
 }
-
-void* LRUCacheShard::operator new(size_t size) {
-  return port::cacheline_aligned_alloc(size);
-}
-
-void* LRUCacheShard::operator new(size_t /*size*/, void* ptr) { return ptr; }
-
-void LRUCacheShard::operator delete(void *memblock) {
-  port::cacheline_aligned_free(memblock);
-}
-
-void LRUCacheShard::operator delete(void* /*memblock*/, void* /*ptr*/) {}
 
 void LRUCacheShard::SetCapacity(size_t capacity) {
   autovector<LRUHandle*> last_reference_list;
   {
@@ -287,6 +275,7 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
       LRU_Remove(e);
     }
     e->refs++;
+    e->SetHit();
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }

cache/lru_cache.h

@@ -77,6 +77,7 @@ struct LRUHandle {
   bool InCache() { return flags & 1; }
   bool IsHighPri() { return flags & 2; }
   bool InHighPriPool() { return flags & 4; }
+  bool HasHit() { return flags & 8; }
 
   void SetInCache(bool in_cache) {
     if (in_cache) {
@@ -102,6 +103,8 @@
     }
   }
 
+  void SetHit() { flags |= 8; }
+
   void Free() {
     assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
     if (deleter) {
@@ -206,18 +209,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard {
   // Retrieves high pri pool ratio
   double GetHighPriPoolRatio();
-
-  // Overloading to aligned it to cache line size
-  // They are used by tests.
-  void* operator new(size_t);
-  // placement new
-  void* operator new(size_t, void*);
-  void operator delete(void *);
-  // placement delete, does nothing.
-  void operator delete(void*, void*);
 
  private:
   void LRU_Remove(LRUHandle* e);
   void LRU_Insert(LRUHandle* e);

cache/lru_cache_test.cc

@@ -15,11 +15,22 @@ namespace rocksdb {
 class LRUCacheTest : public testing::Test {
  public:
   LRUCacheTest() {}
-  ~LRUCacheTest() {}
+  ~LRUCacheTest() { DeleteCache(); }
+
+  void DeleteCache() {
+    if (cache_ != nullptr) {
+      cache_->~LRUCacheShard();
+      port::cacheline_aligned_free(cache_);
+      cache_ = nullptr;
+    }
+  }
 
   void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0) {
-    cache_.reset(new LRUCacheShard(capacity, false /*strict_capcity_limit*/,
-                                   high_pri_pool_ratio));
+    DeleteCache();
+    cache_ = reinterpret_cast<LRUCacheShard*>(
+        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+    new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/,
+                               high_pri_pool_ratio);
   }
   void Insert(const std::string& key,
@@ -75,7 +86,7 @@ class LRUCacheTest : public testing::Test {
   }
 
  private:
-  std::unique_ptr<LRUCacheShard> cache_;
+  LRUCacheShard* cache_ = nullptr;
 };
 
 TEST_F(LRUCacheTest, BasicLRU) {
@@ -104,6 +115,29 @@ TEST_F(LRUCacheTest, BasicLRU) {
   ValidateLRUList({"e", "z", "d", "u", "v"});
 }
+
+TEST_F(LRUCacheTest, MidpointInsertion) {
+  // Allocate 2 cache entries to high-pri pool.
+  NewCache(5, 0.45);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  Insert("c", Cache::Priority::LOW);
+  Insert("x", Cache::Priority::HIGH);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"a", "b", "c", "x", "y"}, 2);
+
+  // Low-pri entries are inserted at the tail of the low-pri list (the
+  // midpoint). After a lookup they move to the tail of the full list.
+  Insert("d", Cache::Priority::LOW);
+  ValidateLRUList({"b", "c", "d", "x", "y"}, 2);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"b", "c", "x", "y", "d"}, 2);
+
+  // High-pri entries are inserted at the tail of the full list.
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"c", "x", "y", "d", "z"}, 2);
+}
 TEST_F(LRUCacheTest, EntriesWithPriority) {
   // Allocate 2 cache entries to high-pri pool.
   NewCache(5, 0.45);
@@ -130,15 +164,15 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
   Insert("a", Cache::Priority::LOW);
   ValidateLRUList({"v", "X", "a", "Y", "Z"}, 2);
 
-  // Low-pri entries will be inserted to head of low-pri pool after lookup.
+  // Low-pri entries will be inserted to head of high-pri pool after lookup.
   ASSERT_TRUE(Lookup("v"));
-  ValidateLRUList({"X", "a", "v", "Y", "Z"}, 2);
+  ValidateLRUList({"X", "a", "Y", "Z", "v"}, 2);
 
   // High-pri entries will be inserted to the head of the list after lookup.
   ASSERT_TRUE(Lookup("X"));
-  ValidateLRUList({"a", "v", "Y", "Z", "X"}, 2);
+  ValidateLRUList({"a", "Y", "Z", "v", "X"}, 2);
   ASSERT_TRUE(Lookup("Z"));
-  ValidateLRUList({"a", "v", "Y", "X", "Z"}, 2);
+  ValidateLRUList({"a", "Y", "v", "X", "Z"}, 2);
 
   Erase("Y");
   ValidateLRUList({"a", "v", "X", "Z"}, 2);
@@ -151,7 +185,7 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
   Insert("g", Cache::Priority::LOW);
   ValidateLRUList({"d", "e", "f", "g", "Z"}, 1);
   ASSERT_TRUE(Lookup("d"));
-  ValidateLRUList({"e", "f", "g", "d", "Z"}, 1);
+  ValidateLRUList({"e", "f", "g", "Z", "d"}, 2);
 }
 
 }  // namespace rocksdb
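
Aside: the NewCache()/DeleteCache() pair above hand-rolls the object lifetime that the removed operator new/delete overloads used to provide — allocate cache-line-aligned raw memory, construct with placement new, and later destroy explicitly before freeing. A self-contained sketch of the same pattern in standard C++17 (the Shard type and the sizes are illustrative, not RocksDB code):

#include <cstdlib>
#include <new>

struct alignas(64) Shard {  // stand-in for a cache-line-aligned class
  explicit Shard(size_t capacity) : capacity_(capacity) {}
  size_t capacity_;
};

int main() {
  // std::aligned_alloc requires the size to be a multiple of the
  // alignment; alignas(64) on Shard guarantees that here.
  void* mem = std::aligned_alloc(alignof(Shard), sizeof(Shard));
  Shard* shard = new (mem) Shard(/*capacity=*/5);  // placement new

  shard->~Shard();  // destroy explicitly, then free the raw memory
  std::free(mem);
  return 0;
}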

db/db_block_cache_test.cc

@@ -390,7 +390,10 @@ class MockCache : public LRUCache {
   static uint32_t high_pri_insert_count;
   static uint32_t low_pri_insert_count;
 
-  MockCache() : LRUCache(1 << 25, 0, false, 0.0) {}
+  MockCache()
+      : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+                 false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) {
+  }
 
   virtual Status Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value),

include/rocksdb/cache.h

@@ -47,6 +47,15 @@ struct LRUCacheOptions {
   bool strict_capacity_limit = false;
 
   // Percentage of cache reserved for high priority entries.
+  // If greater than zero, the LRU list will be split into a high-pri list
+  // and a low-pri list. High-pri entries will be inserted at the tail of
+  // the high-pri list, while low-pri entries will first be inserted at the
+  // tail of the low-pri list (the midpoint). This is referred to as the
+  // midpoint insertion strategy; it makes entries that never get hit in
+  // the cache age out faster.
+  //
+  // See also
+  // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority.
   double high_pri_pool_ratio = 0.0;
 
   LRUCacheOptions() {}
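
For reference, opting into the new behavior from application code only requires creating the cache with a nonzero high_pri_pool_ratio. A usage sketch against the public API (the capacity, shard count, and ratio values are illustrative):

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Reserve 50% of a 128MB block cache for high-pri entries; low-pri blocks
// now enter at the midpoint and are promoted only on their first hit.
std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(
    128 << 20 /*capacity*/, 6 /*num_shard_bits*/,
    false /*strict_capacity_limit*/, 0.5 /*high_pri_pool_ratio*/);

rocksdb::BlockBasedTableOptions table_options;
table_options.block_cache = cache;
table_options.cache_index_and_filter_blocks = true;
table_options.cache_index_and_filter_blocks_with_high_priority = true;

rocksdb::Options options;
options.table_factory.reset(
    rocksdb::NewBlockBasedTableFactory(table_options));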

tools/db_bench_tool.cc

@@ -100,13 +100,13 @@ DEFINE_string(
     "readreverse,"
     "compact,"
     "compactall,"
     "readrandom,"
     "multireadrandom,"
     "readseq,"
     "readtocache,"
     "readreverse,"
     "readwhilewriting,"
     "readwhilemerging,"
+    "readwhilescanning,"
     "readrandomwriterandom,"
     "updaterandom,"
     "xorupdaterandom,"
@@ -149,6 +149,8 @@ DEFINE_string(
     "reads\n"
     "\treadwhilemerging -- 1 merger, N threads doing random "
     "reads\n"
+    "\treadwhilescanning -- 1 thread doing full table scan, "
+    "N threads doing random reads\n"
     "\treadrandomwriterandom -- N threads doing random-read, "
     "random-write\n"
     "\tupdaterandom -- N threads doing read-modify-write for random "
@@ -2524,6 +2526,9 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     } else if (name == "readwhilemerging") {
       num_threads++;  // Add extra thread for writing
       method = &Benchmark::ReadWhileMerging;
+    } else if (name == "readwhilescanning") {
+      num_threads++;  // Add extra thread for scanning
+      method = &Benchmark::ReadWhileScanning;
     } else if (name == "readrandomwriterandom") {
       method = &Benchmark::ReadRandomWriteRandom;
     } else if (name == "readrandommergerandom") {
@@ -4507,6 +4512,45 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     thread->stats.AddBytes(bytes);
   }
 
+  void ReadWhileScanning(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGScan(thread);
+    }
+  }
+
+  void BGScan(ThreadState* thread) {
+    if (FLAGS_num_multi_db > 0) {
+      fprintf(stderr, "Not supporting multiple DBs.\n");
+      abort();
+    }
+    assert(db_.db != nullptr);
+    ReadOptions read_options;
+    Iterator* iter = db_.db->NewIterator(read_options);
+
+    fprintf(stderr, "num reads to do %lu\n", reads_);
+    Duration duration(FLAGS_duration, reads_);
+    uint64_t num_seek_to_first = 0;
+    uint64_t num_next = 0;
+    while (!duration.Done(1)) {
+      if (!iter->Valid()) {
+        iter->SeekToFirst();
+        num_seek_to_first++;
+      } else if (!iter->status().ok()) {
+        fprintf(stderr, "Iterator error: %s\n",
+                iter->status().ToString().c_str());
+        abort();
+      } else {
+        iter->Next();
+        num_next++;
+      }
+      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+    }
+    delete iter;
+  }
+
   // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
   // in DB atomically i.e. in a single batch. Also refer GetMany.
   Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
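
With the above in place, the new benchmark can be exercised with, for example, db_bench --benchmarks=readwhilescanning --duration=60 --threads=8 (flag values illustrative): one thread loops a full-table scan in BGScan() while the remaining threads issue random reads, which is exactly the background-scan workload the midpoint insertion strategy is meant to protect the cache against.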