diff --git a/HISTORY.md b/HISTORY.md
index 5c9228cd1a..7272fedb4d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### Public API Change
 * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
+* With LRUCache, when high_pri_pool_ratio > 0, the midpoint insertion strategy is enabled: low-pri items are inserted at the tail of the low-pri list (the midpoint) when they are first inserted into the cache, so cache entries that never get hit age out faster. This improves cache efficiency when a large background scan is present.
 
 ## 5.14.0 (5/16/2018)
 ### Public API Change
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index 11d18efdd5..a128296f91 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -199,7 +199,7 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) {
 void LRUCacheShard::LRU_Insert(LRUHandle* e) {
   assert(e->next == nullptr);
   assert(e->prev == nullptr);
-  if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) {
+  if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
     // Inset "e" to head of LRU list.
     e->next = &lru_;
     e->prev = lru_.prev;
@@ -246,18 +246,6 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
   }
 }
 
-void* LRUCacheShard::operator new(size_t size) {
-  return port::cacheline_aligned_alloc(size);
-}
-
-void* LRUCacheShard::operator new(size_t /*size*/, void* ptr) { return ptr; }
-
-void LRUCacheShard::operator delete(void *memblock) {
-  port::cacheline_aligned_free(memblock);
-}
-
-void LRUCacheShard::operator delete(void* /*memblock*/, void* /*ptr*/) {}
-
 void LRUCacheShard::SetCapacity(size_t capacity) {
   autovector<LRUHandle*> last_reference_list;
   {
@@ -287,6 +275,7 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
       LRU_Remove(e);
     }
     e->refs++;
+    e->SetHit();
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index ac8cb9ee55..f5219ad358 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -77,6 +77,7 @@ struct LRUHandle {
   bool InCache() { return flags & 1; }
   bool IsHighPri() { return flags & 2; }
   bool InHighPriPool() { return flags & 4; }
+  bool HasHit() { return flags & 8; }
 
   void SetInCache(bool in_cache) {
     if (in_cache) {
@@ -102,6 +103,8 @@ struct LRUHandle {
     }
   }
 
+  void SetHit() { flags |= 8; }
+
   void Free() {
     assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
     if (deleter) {
@@ -206,18 +209,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard {
   // Retrives high pri pool ratio
   double GetHighPriPoolRatio();
 
-  // Overloading to aligned it to cache line size
-  // They are used by tests.
-  void* operator new(size_t);
-
-  // placement new
-  void* operator new(size_t, void*);
-
-  void operator delete(void *);
-
-  // placement delete, does nothing.
-  void operator delete(void*, void*);
-
  private:
  void LRU_Remove(LRUHandle* e);
  void LRU_Insert(LRUHandle* e);
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 8d0f3ec1b3..a21009aa9e 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -15,11 +15,22 @@ namespace rocksdb {
 class LRUCacheTest : public testing::Test {
  public:
   LRUCacheTest() {}
-  ~LRUCacheTest() {}
+  ~LRUCacheTest() { DeleteCache(); }
+
+  void DeleteCache() {
+    if (cache_ != nullptr) {
+      cache_->~LRUCacheShard();
+      port::cacheline_aligned_free(cache_);
+      cache_ = nullptr;
+    }
+  }
 
   void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0) {
-    cache_.reset(new LRUCacheShard(capacity, false /*strict_capcity_limit*/,
-                                   high_pri_pool_ratio));
+    DeleteCache();
+    cache_ = reinterpret_cast<LRUCacheShard*>(
+        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+    new (cache_) LRUCacheShard(capacity, false /*strict_capacity_limit*/,
+                               high_pri_pool_ratio);
   }
 
   void Insert(const std::string& key,
@@ -75,7 +86,7 @@ class LRUCacheTest : public testing::Test {
   }
 
  private:
-  std::unique_ptr<LRUCacheShard> cache_;
+  LRUCacheShard* cache_ = nullptr;
 };
 
 TEST_F(LRUCacheTest, BasicLRU) {
@@ -104,6 +115,29 @@ TEST_F(LRUCacheTest, BasicLRU) {
   ValidateLRUList({"e", "z", "d", "u", "v"});
 }
 
+TEST_F(LRUCacheTest, MidpointInsertion) {
+  // Allocate 2 cache entries to high-pri pool.
+  NewCache(5, 0.45);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  Insert("c", Cache::Priority::LOW);
+  Insert("x", Cache::Priority::HIGH);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"a", "b", "c", "x", "y"}, 2);
+
+  // Low-pri entries are inserted at the tail of the low-pri list (the
+  // midpoint). After a lookup, an entry moves to the tail of the full list.
+  Insert("d", Cache::Priority::LOW);
+  ValidateLRUList({"b", "c", "d", "x", "y"}, 2);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"b", "c", "x", "y", "d"}, 2);
+
+  // High-pri entries are inserted at the tail of the full list.
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"c", "x", "y", "d", "z"}, 2);
+}
+
 TEST_F(LRUCacheTest, EntriesWithPriority) {
   // Allocate 2 cache entries to high-pri pool.
   NewCache(5, 0.45);
@@ -130,15 +164,15 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
   Insert("a", Cache::Priority::LOW);
   ValidateLRUList({"v", "X", "a", "Y", "Z"}, 2);
 
-  // Low-pri entries will be inserted to head of low-pri pool after lookup.
+  // Low-pri entries will be inserted to head of high-pri pool after lookup.
   ASSERT_TRUE(Lookup("v"));
-  ValidateLRUList({"X", "a", "v", "Y", "Z"}, 2);
+  ValidateLRUList({"X", "a", "Y", "Z", "v"}, 2);
 
   // High-pri entries will be inserted to the head of the list after lookup.
ASSERT_TRUE(Lookup("X")); - ValidateLRUList({"a", "v", "Y", "Z", "X"}, 2); + ValidateLRUList({"a", "Y", "Z", "v", "X"}, 2); ASSERT_TRUE(Lookup("Z")); - ValidateLRUList({"a", "v", "Y", "X", "Z"}, 2); + ValidateLRUList({"a", "Y", "v", "X", "Z"}, 2); Erase("Y"); ValidateLRUList({"a", "v", "X", "Z"}, 2); @@ -151,7 +185,7 @@ TEST_F(LRUCacheTest, EntriesWithPriority) { Insert("g", Cache::Priority::LOW); ValidateLRUList({"d", "e", "f", "g", "Z"}, 1); ASSERT_TRUE(Lookup("d")); - ValidateLRUList({"e", "f", "g", "d", "Z"}, 1); + ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } } // namespace rocksdb diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index cb38c19fbd..ac8c2825e3 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -390,7 +390,10 @@ class MockCache : public LRUCache { static uint32_t high_pri_insert_count; static uint32_t low_pri_insert_count; - MockCache() : LRUCache(1 << 25, 0, false, 0.0) {} + MockCache() + : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/, + false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { + } virtual Status Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 86dafe3959..da3b934d83 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -47,6 +47,15 @@ struct LRUCacheOptions { bool strict_capacity_limit = false; // Percentage of cache reserved for high priority entries. + // If greater than zero, the LRU list will be split into a high-pri + // list and a low-pri list. High-pri entries will be insert to the + // tail of high-pri list, while low-pri entries will be first inserted to + // the low-pri list (the midpoint). This is refered to as + // midpoint insertion strategy to make entries never get hit in cache + // age out faster. + // + // See also + // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. 
   double high_pri_pool_ratio = 0.0;
 
   LRUCacheOptions() {}
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 2aeadf3440..0b73d834cd 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -100,13 +100,13 @@ DEFINE_string(
     "readreverse,"
     "compact,"
     "compactall,"
-    "readrandom,"
     "multireadrandom,"
     "readseq,"
     "readtocache,"
     "readreverse,"
     "readwhilewriting,"
     "readwhilemerging,"
+    "readwhilescanning,"
     "readrandomwriterandom,"
     "updaterandom,"
     "xorupdaterandom,"
@@ -149,6 +149,8 @@ DEFINE_string(
     "reads\n"
     "\treadwhilemerging -- 1 merger, N threads doing random "
    "reads\n"
+    "\treadwhilescanning -- 1 thread doing full table scan, "
+    "N threads doing random reads\n"
     "\treadrandomwriterandom -- N threads doing random-read, "
     "random-write\n"
     "\tupdaterandom -- N threads doing read-modify-write for random "
@@ -2524,6 +2526,9 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     } else if (name == "readwhilemerging") {
       num_threads++;  // Add extra thread for writing
       method = &Benchmark::ReadWhileMerging;
+    } else if (name == "readwhilescanning") {
+      num_threads++;  // Add extra thread for scanning
+      method = &Benchmark::ReadWhileScanning;
     } else if (name == "readrandomwriterandom") {
       method = &Benchmark::ReadRandomWriteRandom;
     } else if (name == "readrandommergerandom") {
@@ -4507,6 +4512,45 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     thread->stats.AddBytes(bytes);
   }
 
+  void ReadWhileScanning(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGScan(thread);
+    }
+  }
+
+  void BGScan(ThreadState* thread) {
+    if (FLAGS_num_multi_db > 0) {
+      fprintf(stderr, "Not supporting multiple DBs.\n");
+      abort();
+    }
+    assert(db_.db != nullptr);
+    ReadOptions read_options;
+    Iterator* iter = db_.db->NewIterator(read_options);
+
+    fprintf(stderr, "num reads to do %lu\n", reads_);
+    Duration duration(FLAGS_duration, reads_);
+    uint64_t num_seek_to_first = 0;
+    uint64_t num_next = 0;
+    while (!duration.Done(1)) {
+      if (!iter->Valid()) {
+        iter->SeekToFirst();
+        num_seek_to_first++;
+      } else if (!iter->status().ok()) {
+        fprintf(stderr, "Iterator error: %s\n",
+                iter->status().ToString().c_str());
+        abort();
+      } else {
+        iter->Next();
+        num_next++;
+      }
+
+      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+    }
+    delete iter;
+  }
+
   // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
   // in DB atomically i.e in a single batch. Also refer GetMany.
   Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
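
Usage note (not part of the patch above): the following is a minimal sketch of how an application would opt into the midpoint insertion strategy this change enables. It uses only public API visible in this diff or already in the tree: the four-argument `NewLRUCache()` overload (mirroring the `LRUCache(capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio)` constructor call in MockCache above) and the `BlockBasedTableOptions` fields referenced by the new comment in include/rocksdb/cache.h. The helper name and the capacity/ratio values are illustrative, not prescribed by the patch.

```cpp
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Hypothetical helper; capacity and ratio values are arbitrary examples.
rocksdb::Options OptionsWithMidpointInsertion() {
  // A 128MB block cache with half of it reserved for the high-pri pool.
  // Because high_pri_pool_ratio > 0, low-pri blocks enter at the midpoint
  // and only move to the MRU end of the list after their first hit, so a
  // one-off scan cannot push repeatedly-read entries out of the high-pri
  // pool.
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(
      static_cast<size_t>(128) << 20 /*capacity*/, 6 /*num_shard_bits*/,
      false /*strict_capacity_limit*/, 0.5 /*high_pri_pool_ratio*/);

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = cache;
  // Optionally cache index/filter blocks with Cache::Priority::HIGH so they
  // land directly in the high-pri pool instead of competing with data blocks
  // at the midpoint.
  table_options.cache_index_and_filter_blocks = true;
  table_options.cache_index_and_filter_blocks_with_high_priority = true;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```

The scan-resistance this buys can be exercised with the benchmark added in this patch, `db_bench --benchmarks=readwhilescanning`, which dedicates one thread to a full-table scan (the BGScan loop above) while N threads issue random reads against the same block cache.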