Shard JemallocNodumpAllocator (#11400)
Summary:
RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`).
This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas (`JemallocAllocatorOptions::num_arenas`, which must be positive). The allocator shards allocation requests randomly across those manual arenas; the arena is chosen with a multiply-and-shift mapping (`FastRange32()`), so no division is needed on the allocation path.
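For orientation, below is a minimal sketch of how an application could opt into the sharded allocator. The helper name and cache sizing are illustrative only; `JemallocAllocatorOptions::num_arenas`, `NewJemallocNodumpAllocator()`, and `LRUCacheOptions::memory_allocator` are the existing public API used or touched by this change.

```cpp
// Sketch only: wiring num_arenas into a block cache. Everything except the
// RocksDB API calls (options struct, allocator factory, LRU cache) is made up
// for illustration.
#include <cassert>
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

std::shared_ptr<rocksdb::Cache> MakeShardedNodumpBlockCache() {
  rocksdb::JemallocAllocatorOptions jopts;
  jopts.num_arenas = 8;  // spread block allocations across 8 manual arenas

  std::shared_ptr<rocksdb::MemoryAllocator> allocator;
  rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);
  assert(s.ok());  // fails when jemalloc support is not compiled in

  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = 4ULL << 30;  // 4 GiB, matching the test plan below
  cache_opts.memory_allocator = allocator;
  return rocksdb::NewLRUCache(cache_opts);
}
```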
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400
Test Plan:
- mysqld setup
  - Branch: fb-mysql-8.0.28 (653eba2e56)
  - Build: `mysqlbuild.sh --clean --release`
  - Set env var `MALLOC_CONF='tcache_max:$tcache_max'`
  - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384`
  - Ran under /usr/bin/time
- Large database scenario
  - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop`
  - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32`
- Results:
| tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) |
|---|---|---|---|
| 4096 | **(baseline)** | 4541 | 37.1 |
| 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) |
| 4096 | 8 | 4687 (+3%) | 10.2 (-73%) |
| 16384 | **(baseline)** | 4514 | 8.4 |
| 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) |
| 16384 | 8 | 4580 (+1%) | 8.5 (+1%) |
Reviewed By: pdillinger
Differential Revision: D45220794
Pulled By: ajkr
fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
Parent: d3ed796855
Commit: 925d8252e5
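For readers skimming the diff below, the arena-selection scheme introduced by the new `GetArenaIndex()` can be illustrated standalone: a per-thread PRNG seeded from a global atomic counter, mapped onto the arena list with a Lemire-style multiply-shift instead of a modulo. This sketch substitutes `std::mt19937` for RocksDB's `Random` and inlines a `FastRange32` equivalent; it is not the code from this PR.

```cpp
#include <atomic>
#include <cstdint>
#include <random>
#include <vector>

// Lemire's fastrange: maps a uniform 32-bit value into [0, n) using one
// multiply and a shift -- no division or modulo on the allocation path.
inline uint32_t FastRange32Equivalent(uint32_t hash, uint32_t n) {
  return static_cast<uint32_t>((static_cast<uint64_t>(hash) * n) >> 32);
}

uint32_t PickArena(const std::vector<uint32_t>& arena_indexes) {
  if (arena_indexes.size() == 1) {
    return arena_indexes[0];  // fast path: sharding disabled
  }
  // One generator per thread, so choosing an arena never touches shared state
  // after the first call; the atomic counter runs once per thread for seeding.
  static std::atomic<uint32_t> next_seed{0};
  thread_local std::mt19937 rng(next_seed.fetch_add(1));
  return arena_indexes[FastRange32Equivalent(
      static_cast<uint32_t>(rng()),
      static_cast<uint32_t>(arena_indexes.size()))];
}
```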
@@ -2,6 +2,7 @@
 ## Unreleased
 ### New Features
 * Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
+* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
 
 ## 8.2.0 (04/24/2023)
 ### Public API Changes
@@ -55,6 +55,11 @@ struct JemallocAllocatorOptions {
   // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
   // When used with block cache, it is recommended to set it to block_size.
   size_t tcache_size_upper_bound = 16 * 1024;
+
+  // Number of arenas across which we spread allocation requests. Increasing
+  // this setting can mitigate arena mutex contention. The value must be
+  // positive.
+  size_t num_arenas = 1;
 };
 
 // Generate memory allocator which allocates through Jemalloc and utilize
@@ -70,7 +75,8 @@ struct JemallocAllocatorOptions {
 // core dump. Side benefit of using single arena would be reduction of jemalloc
 // metadata for some workloads.
 //
-// To mitigate mutex contention for using one single arena, jemalloc tcache
+// To mitigate mutex contention for using one single arena (see also
+// `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache
 // (thread-local cache) is enabled to cache unused allocations for future use.
 // The tcache normally incurs 0.5M extra memory usage per-thread. The usage
 // can be reduced by limiting allocation sizes to cache.
@@ -14,6 +14,8 @@
 #include "rocksdb/utilities/customizable_util.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
+#include "util/fastrange.h"
+#include "util/random.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -35,6 +37,9 @@ static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = {
     {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound),
      OptionType::kSizeT, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
+    {"num_arenas",
+     {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
 };
 bool JemallocNodumpAllocator::IsSupported(std::string* why) {
 #ifndef ROCKSDB_JEMALLOC
@@ -59,11 +64,13 @@ bool JemallocNodumpAllocator::IsSupported(std::string* why) {
 
 JemallocNodumpAllocator::JemallocNodumpAllocator(
     JemallocAllocatorOptions& options)
-    : options_(options),
+    : options_(options)
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
-      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache),
+      ,
+      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {
+#else  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+{
 #endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
-      arena_index_(0) {
   RegisterOptions(&options_, &jemalloc_type_info);
 }
 
@@ -75,9 +82,9 @@ JemallocNodumpAllocator::~JemallocNodumpAllocator() {
   for (void* tcache_index : tcache_list) {
     DestroyThreadSpecificCache(tcache_index);
   }
-  if (arena_index_ > 0) {
+  for (auto arena_index : arena_indexes_) {
     // Destroy arena. Silently ignore error.
-    Status s = DestroyArena(arena_index_);
+    Status s = DestroyArena(arena_index);
     assert(s.ok());
     s.PermitUncheckedError();
   }
@@ -90,7 +97,8 @@ size_t JemallocNodumpAllocator::UsableSize(void* p,
 
 void* JemallocNodumpAllocator::Allocate(size_t size) {
   int tcache_flag = GetThreadSpecificCache(size);
-  return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+  uint32_t arena_index = GetArenaIndex();
+  return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag);
 }
 
 void JemallocNodumpAllocator::Deallocate(void* p) {
@@ -105,45 +113,71 @@ void JemallocNodumpAllocator::Deallocate(void* p) {
   dallocx(p, tcache_flag);
 }
 
+uint32_t JemallocNodumpAllocator::GetArenaIndex() const {
+  if (arena_indexes_.size() == 1) {
+    return arena_indexes_[0];
+  }
+
+  static std::atomic<uint32_t> next_seed = 0;
+  // Core-local may work in place of `thread_local` as we should be able to
+  // tolerate occasional stale reads in thread migration cases. However we need
+  // to make Random thread-safe and prevent cacheline bouncing. Whether this is
+  // worthwhile is still an open question.
+  thread_local Random tl_random(next_seed.fetch_add(1));
+  return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())];
+}
+
 Status JemallocNodumpAllocator::InitializeArenas() {
-  // Create arena.
-  size_t arena_index_size = sizeof(arena_index_);
-  int ret =
-      mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0);
-  if (ret != 0) {
-    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
-                              std::to_string(ret));
-  }
-  assert(arena_index_ != 0);
+  assert(!init_);
+  init_ = true;
 
-  // Read existing hooks.
-  std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks";
-  extent_hooks_t* hooks;
-  size_t hooks_size = sizeof(hooks);
-  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
-  if (ret != 0) {
-    return Status::Incomplete("Failed to read existing hooks, error code: " +
-                              std::to_string(ret));
-  }
+  for (size_t i = 0; i < options_.num_arenas; i++) {
+    // Create arena.
+    unsigned arena_index;
+    size_t arena_index_size = sizeof(arena_index);
+    int ret =
+        mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete(
+          "Failed to create jemalloc arena, error code: " +
+          std::to_string(ret));
+    }
+    arena_indexes_.push_back(arena_index);
 
-  // Store existing alloc.
-  extent_alloc_t* original_alloc = hooks->alloc;
-  extent_alloc_t* expected = nullptr;
-  bool success =
-      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
-          expected, original_alloc);
-  if (!success && original_alloc != expected) {
-    return Status::Incomplete("Original alloc conflict.");
-  }
+    // Read existing hooks.
+    std::string key =
+        "arena." + std::to_string(arena_indexes_[i]) + ".extent_hooks";
+    extent_hooks_t* hooks;
+    size_t hooks_size = sizeof(hooks);
+    ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete("Failed to read existing hooks, error code: " +
+                                std::to_string(ret));
+    }
 
-  // Set the custom hook.
-  arena_hooks_.reset(new extent_hooks_t(*hooks));
-  arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc;
-  extent_hooks_t* hooks_ptr = arena_hooks_.get();
-  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
-  if (ret != 0) {
-    return Status::Incomplete("Failed to set custom hook, error code: " +
-                              std::to_string(ret));
+    // Store existing alloc.
+    extent_alloc_t* original_alloc = hooks->alloc;
+    extent_alloc_t* expected = nullptr;
+    bool success =
+        JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+            expected, original_alloc);
+    if (!success && original_alloc != expected) {
+      // This could happen if jemalloc creates new arenas with different initial
+      // values in their `alloc` function pointers. See `original_alloc_` API
+      // doc for more details.
+      return Status::Incomplete("Original alloc conflict.");
+    }
+
+    // Set the custom hook.
+    per_arena_hooks_.emplace_back();
+    per_arena_hooks_.back().reset(new extent_hooks_t(*hooks));
+    per_arena_hooks_.back()->alloc = &JemallocNodumpAllocator::Alloc;
+    extent_hooks_t* hooks_ptr = per_arena_hooks_.back().get();
+    ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+    if (ret != 0) {
+      return Status::Incomplete("Failed to set custom hook, error code: " +
+                                std::to_string(ret));
+    }
   }
   return Status::OK();
 }
@@ -161,6 +195,8 @@ Status JemallocNodumpAllocator::PrepareOptions(
              options_.tcache_size_upper_bound) {
     return Status::InvalidArgument(
         "tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
+  } else if (options_.num_arenas < 1) {
+    return Status::InvalidArgument("num_arenas must be a positive integer");
   } else if (IsMutable()) {
     Status s = MemoryAllocator::PrepareOptions(config_options);
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
@@ -221,7 +257,7 @@ void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
   return result;
 }
 
-Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) {
   assert(arena_index != 0);
   std::string key = "arena." + std::to_string(arena_index) + ".destroy";
   int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
@@ -24,6 +24,10 @@
 #endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
 
 namespace ROCKSDB_NAMESPACE {
+
+// Allocation requests are randomly sharded across
+// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per-
+// arena mutexes.
 class JemallocNodumpAllocator : public BaseMemoryAllocator {
  public:
   explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
@@ -38,7 +42,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
     return IsSupported(&unused);
   }
   static bool IsSupported(std::string* why);
-  bool IsMutable() const { return arena_index_ == 0; }
+  bool IsMutable() const { return !init_; }
 
   Status PrepareOptions(const ConfigOptions& config_options) override;
 
@@ -52,9 +56,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
   Status InitializeArenas();
 
-  friend Status NewJemallocNodumpAllocator(
-      JemallocAllocatorOptions& options,
-      std::shared_ptr<MemoryAllocator>* memory_allocator);
+  uint32_t GetArenaIndex() const;
 
   // Custom alloc hook to replace jemalloc default alloc.
   static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
@@ -62,7 +64,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
                      unsigned arena_ind);
 
   // Destroy arena on destruction of the allocator, or on failure.
-  static Status DestroyArena(unsigned arena_index);
+  static Status DestroyArena(uint32_t arena_index);
 
   // Destroy tcache on destruction of the allocator, or thread exit.
   static void DestroyThreadSpecificCache(void* ptr);
@@ -78,17 +80,20 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
   // NewJemallocNodumpAllocator is thread-safe.
   //
   // Hack: original_alloc_ needs to be static for Alloc() to access it.
-  // alloc needs to be static to pass to jemalloc as function pointer.
+  // alloc needs to be static to pass to jemalloc as function pointer. We can
+  // use a single process-wide value as long as we assume that any newly created
+  // arena has the same original value in its `alloc` function pointer.
   static std::atomic<extent_alloc_t*> original_alloc_;
 
   // Custom hooks has to outlive corresponding arena.
-  std::unique_ptr<extent_hooks_t> arena_hooks_;
+  std::vector<std::unique_ptr<extent_hooks_t>> per_arena_hooks_;
 
   // Hold thread-local tcache index.
   ThreadLocalPtr tcache_;
+
+  std::vector<uint32_t> arena_indexes_;
 #endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
 
-  // Arena index.
-  unsigned arena_index_;
+  bool init_ = false;
 };
 }  // namespace ROCKSDB_NAMESPACE