mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 16:30:56 +00:00
87b82f28a1
Summary: ### **Summary:** To minimize the internal fragmentation caused by the variable size of the compressed blocks, the original block is split according to the jemalloc bin size in `Insert()` and then merged back in `Lookup()`. Based on the analysis of the results of the following tests, from the overall internal fragmentation perspective, this PR does mitigate the internal fragmentation issue. _Do more myshadow tests with the latest commit. I finished several myshadow AB Testing and the results are promising. For the config of 4GB primary cache and 3GB secondary cache, Jemalloc resident stats shows consistently ~0.15GB memory saving; the allocated and active stats show similar memory savings. The CPU usage is almost the same before and after this PR._ To evaluate the issue of memory fragmentations and the benefits of this PR, I conducted two sets of local tests as follows. **T1** Keys: 16 bytes each (+ 0 bytes user-defined timestamp) Values: 100 bytes each (50 bytes after compression) Entries: 90000000 RawSize: 9956.4 MB (estimated) FileSize: 5664.8 MB (estimated) | Test Name | Primary Cache Size (MB) | Compressed Secondary Cache Size (MB) | | - | - | - | | T1_3 | 4000 | 4000 | | T1_4 | 2000 | 3000 | Populate the DB: ./db_bench --benchmarks=fillrandom --num=90000000 -db=/mem_fragmentation/db_bench_1 Overwrite it to a stable state: ./db_bench --benchmarks=overwrite --num=90000000 -use_existing_db -db=/mem_fragmentation/db_bench_1 Run read tests with differnt cache setting: T1_3: MALLOC_CONF="prof:true,prof_stats:true" ../rocksdb/db_bench --benchmarks=seekrandom --threads=16 --num=90000000 -use_existing_db --benchmark_write_rate_limit=52000000 -use_direct_reads --cache_size=4000000000 -compressed_secondary_cache_size=4000000000 -use_compressed_secondary_cache -db=/mem_fragmentation/db_bench_1 --print_malloc_stats=true > ~/temp/mem_frag/20220710/jemalloc_stats_json_T1_3_20220710 -duration=1800 & T1_4: MALLOC_CONF="prof:true,prof_stats:true" ../rocksdb/db_bench --benchmarks=seekrandom --threads=16 --num=90000000 -use_existing_db --benchmark_write_rate_limit=52000000 -use_direct_reads --cache_size=2000000000 -compressed_secondary_cache_size=3000000000 -use_compressed_secondary_cache -db=/mem_fragmentation/db_bench_1 --print_malloc_stats=true > ~/temp/mem_frag/20220710/jemalloc_stats_json_T1_4_20220710 -duration=1800 & For T1_3 and T1_4, I also conducted the tests before and after this PR. The following table show the important jemalloc stats. | Test Name | T1_3 | T1_3 after mem defrag | T1_4 | T1_4 after mem defrag | | - | - | - | - | - | | allocated (MB) | 8728 | 8076 | 5518 | 5043 | | available (MB) | 8753 | 8092 | 5536 | 5051 | | external fragmentation rate | 0.003 | 0.002 | 0.003 | 0.0016 | | resident (MB) | 8956 | 8365 | 5655 | 5235 | **T2** Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 256 bytes each (128 bytes after compression) Entries: 40000000 RawSize: 10986.3 MB (estimated) FileSize: 6103.5 MB (estimated) | Test Name | Primary Cache Size (MB) | Compressed Secondary Cache Size (MB) | | - | - | - | | T2_3 | 4000 | 4000 | | T2_4 | 2000 | 3000 | Create DB (10GB): ./db_bench -benchmarks=fillrandom -use_direct_reads=true -num=40000000 -key_size=32 -value_size=256 -db=/mem_fragmentation/db_bench_2 Overwrite it to a stable state: ./db_bench --benchmarks=overwrite --num=40000000 -use_existing_db -key_size=32 -value_size=256 -db=/mem_fragmentation/db_bench_2 Run read tests with differnt cache setting: T2_3: MALLOC_CONF="prof:true,prof_stats:true" ./db_bench --benchmarks="mixgraph" -use_direct_io_for_flush_and_compaction=true -use_direct_reads=true -cache_size=4000000000 -compressed_secondary_cache_size=4000000000 -use_compressed_secondary_cache -keyrange_dist_a=14.18 -keyrange_dist_b=-2.917 -keyrange_dist_c=0.0164 -keyrange_dist_d=-0.08082 -keyrange_num=30 -value_k=0.2615 -value_sigma=25.45 -iter_k=2.517 -iter_sigma=14.236 -mix_get_ratio=0.85 -mix_put_ratio=0.14 -mix_seek_ratio=0.01 -sine_mix_rate_interval_milliseconds=5000 -sine_a=1000 -sine_b=0.000073 -sine_d=400000 -reads=80000000 -num=40000000 -key_size=32 -value_size=256 -use_existing_db=true -db=/mem_fragmentation/db_bench_2 --print_malloc_stats=true > ~/temp/mem_frag/jemalloc_stats_T2_3 -duration=1800 & T2_4: MALLOC_CONF="prof:true,prof_stats:true" ./db_bench --benchmarks="mixgraph" -use_direct_io_for_flush_and_compaction=true -use_direct_reads=true -cache_size=2000000000 -compressed_secondary_cache_size=3000000000 -use_compressed_secondary_cache -keyrange_dist_a=14.18 -keyrange_dist_b=-2.917 -keyrange_dist_c=0.0164 -keyrange_dist_d=-0.08082 -keyrange_num=30 -value_k=0.2615 -value_sigma=25.45 -iter_k=2.517 -iter_sigma=14.236 -mix_get_ratio=0.85 -mix_put_ratio=0.14 -mix_seek_ratio=0.01 -sine_mix_rate_interval_milliseconds=5000 -sine_a=1000 -sine_b=0.000073 -sine_d=400000 -reads=80000000 -num=40000000 -key_size=32 -value_size=256 -use_existing_db=true -db=/mem_fragmentation/db_bench_2 --print_malloc_stats=true > ~/temp/mem_frag/jemalloc_stats_T2_4 -duration=1800 & For T2_3 and T2_4, I also conducted the tests before and after this PR. The following table show the important jemalloc stats. | Test Name | T2_3 | T2_3 after mem defrag | T2_4 | T2_4 after mem defrag | | - | - | - | - | - | | allocated (MB) | 8425 | 8093 | 5426 | 5149 | | available (MB) | 8489 | 8138 | 5435 | 5158 | | external fragmentation rate | 0.008 | 0.0055 | 0.0017 | 0.0017 | | resident (MB) | 8676 | 8392 | 5541 | 5321 | Pull Request resolved: https://github.com/facebook/rocksdb/pull/10287 Test Plan: Unit tests. Reviewed By: anand1976 Differential Revision: D37743362 Pulled By: gitbw95 fbshipit-source-id: 0010c5af08addeacc5ebbc4ffe5be882fb1d38ad
119 lines
4.2 KiB
C++
119 lines
4.2 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <array>
|
|
#include <cstddef>
|
|
#include <memory>
|
|
|
|
#include "cache/lru_cache.h"
|
|
#include "memory/memory_allocator.h"
|
|
#include "rocksdb/secondary_cache.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "rocksdb/status.h"
|
|
#include "util/compression.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
|
|
public:
|
|
CompressedSecondaryCacheResultHandle(void* value, size_t size)
|
|
: value_(value), size_(size) {}
|
|
virtual ~CompressedSecondaryCacheResultHandle() override = default;
|
|
|
|
CompressedSecondaryCacheResultHandle(
|
|
const CompressedSecondaryCacheResultHandle&) = delete;
|
|
CompressedSecondaryCacheResultHandle& operator=(
|
|
const CompressedSecondaryCacheResultHandle&) = delete;
|
|
|
|
bool IsReady() override { return true; }
|
|
|
|
void Wait() override {}
|
|
|
|
void* Value() override { return value_; }
|
|
|
|
size_t Size() override { return size_; }
|
|
|
|
private:
|
|
void* value_;
|
|
size_t size_;
|
|
};
|
|
|
|
// The CompressedSecondaryCache is a concrete implementation of
|
|
// rocksdb::SecondaryCache.
|
|
//
|
|
// Users can also cast a pointer to it and call methods on
|
|
// it directly, especially custom methods that may be added
|
|
// in the future. For example -
|
|
// std::unique_ptr<rocksdb::SecondaryCache> cache =
|
|
// NewCompressedSecondaryCache(opts);
|
|
// static_cast<CompressedSecondaryCache*>(cache.get())->Erase(key);
|
|
|
|
class CompressedSecondaryCache : public SecondaryCache {
|
|
public:
|
|
CompressedSecondaryCache(
|
|
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
|
|
double high_pri_pool_ratio,
|
|
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
|
|
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
|
|
CacheMetadataChargePolicy metadata_charge_policy =
|
|
kDefaultCacheMetadataChargePolicy,
|
|
CompressionType compression_type = CompressionType::kLZ4Compression,
|
|
uint32_t compress_format_version = 2);
|
|
virtual ~CompressedSecondaryCache() override;
|
|
|
|
const char* Name() const override { return "CompressedSecondaryCache"; }
|
|
|
|
Status Insert(const Slice& key, void* value,
|
|
const Cache::CacheItemHelper* helper) override;
|
|
|
|
std::unique_ptr<SecondaryCacheResultHandle> Lookup(
|
|
const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
|
|
bool& is_in_sec_cache) override;
|
|
|
|
void Erase(const Slice& key) override;
|
|
|
|
void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {}
|
|
|
|
std::string GetPrintableOptions() const override;
|
|
|
|
private:
|
|
friend class CompressedSecondaryCacheTest;
|
|
static constexpr std::array<uint16_t, 33> malloc_bin_sizes_{
|
|
32, 64, 96, 128, 160, 192, 224, 256, 320, 384, 448,
|
|
512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072,
|
|
3584, 4096, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 32768};
|
|
|
|
struct CacheValueChunk {
|
|
// TODO try "CacheAllocationPtr next;".
|
|
CacheValueChunk* next;
|
|
size_t size;
|
|
// Beginning of the chunk data (MUST BE THE LAST FIELD IN THIS STRUCT!)
|
|
char data[1];
|
|
|
|
void Free() { delete[] reinterpret_cast<char*>(this); }
|
|
};
|
|
|
|
// Split value into chunks to better fit into jemalloc bins. The chunks
|
|
// are stored in CacheValueChunk and extra charge is needed for each chunk,
|
|
// so the cache charge is recalculated here.
|
|
CacheValueChunk* SplitValueIntoChunks(const Slice& value,
|
|
const CompressionType compression_type,
|
|
size_t& charge);
|
|
|
|
// After merging chunks, the extra charge for each chunk is removed, so
|
|
// the charge is recalculated.
|
|
CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
|
|
size_t& charge);
|
|
|
|
// An implementation of Cache::DeleterFn.
|
|
static void DeletionCallback(const Slice& /*key*/, void* obj);
|
|
std::shared_ptr<Cache> cache_;
|
|
CompressedSecondaryCacheOptions cache_options_;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|