diff --git a/HISTORY.md b/HISTORY.md
index 7df7857249..7aef62cae1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### New Features
 * Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
+* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
 
 ## 8.2.0 (04/24/2023)
 ### Public API Changes
diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h
index 5cb799e427..d126abfe6d 100644
--- a/include/rocksdb/memory_allocator.h
+++ b/include/rocksdb/memory_allocator.h
@@ -55,6 +55,11 @@ struct JemallocAllocatorOptions {
   // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
   // When used with block cache, it is recommended to set it to block_size.
   size_t tcache_size_upper_bound = 16 * 1024;
+
+  // Number of arenas across which we spread allocation requests. Increasing
+  // this setting can mitigate arena mutex contention. The value must be
+  // positive.
+  size_t num_arenas = 1;
 };
 
 // Generate memory allocator which allocates through Jemalloc and utilize
@@ -70,7 +75,8 @@ struct JemallocAllocatorOptions {
 // core dump. Side benefit of using single arena would be reduction of jemalloc
 // metadata for some workloads.
 //
-// To mitigate mutex contention for using one single arena, jemalloc tcache
+// To mitigate mutex contention for using one single arena (see also
+// `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache
 // (thread-local cache) is enabled to cache unused allocations for future use.
 // The tcache normally incurs 0.5M extra memory usage per-thread. The usage
 // can be reduced by limiting allocation sizes to cache.
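Note for reviewers trying out the new option: the snippet below is an illustrative sketch (not part of the patch) of how `num_arenas` could be wired into a block cache through the existing public API. The capacity, shard bits, and `num_arenas = 8` values are arbitrary examples; the default remains 1.

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

rocksdb::JemallocAllocatorOptions jopts;
jopts.limit_tcache_size = true;
jopts.tcache_size_lower_bound = 1024;
jopts.tcache_size_upper_bound = 16 * 1024;  // recommended: match block_size
jopts.num_arenas = 8;                       // arbitrary example value

std::shared_ptr<rocksdb::MemoryAllocator> allocator;
rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);
if (s.ok()) {
  // Route block cache allocations through the sharded allocator.
  rocksdb::LRUCacheOptions cache_opts(
      /*capacity=*/1 << 30, /*num_shard_bits=*/6,
      /*strict_capacity_limit=*/false, /*high_pri_pool_ratio=*/0.5, allocator);
  std::shared_ptr<rocksdb::Cache> block_cache = rocksdb::NewLRUCache(cache_opts);
  // block_cache can then be assigned to BlockBasedTableOptions::block_cache.
}
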
diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc
index cdad14576d..d05248224d 100644
--- a/memory/jemalloc_nodump_allocator.cc
+++ b/memory/jemalloc_nodump_allocator.cc
@@ -14,6 +14,8 @@
 #include "rocksdb/utilities/customizable_util.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
+#include "util/fastrange.h"
+#include "util/random.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -35,6 +37,9 @@ static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = {
     {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound),
      OptionType::kSizeT, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
+    {"num_arenas",
+     {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
 };
 bool JemallocNodumpAllocator::IsSupported(std::string* why) {
 #ifndef ROCKSDB_JEMALLOC
@@ -59,11 +64,13 @@ bool JemallocNodumpAllocator::IsSupported(std::string* why) {
 
 JemallocNodumpAllocator::JemallocNodumpAllocator(
     JemallocAllocatorOptions& options)
-    : options_(options),
+    : options_(options)
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
-      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache),
+      ,
+      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {
+#else  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+{
 #endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
-      arena_index_(0) {
   RegisterOptions(&options_, &jemalloc_type_info);
 }
 
@@ -75,9 +82,9 @@ JemallocNodumpAllocator::~JemallocNodumpAllocator() {
   for (void* tcache_index : tcache_list) {
     DestroyThreadSpecificCache(tcache_index);
   }
-  if (arena_index_ > 0) {
+  for (auto arena_index : arena_indexes_) {
     // Destroy arena. Silently ignore error.
-    Status s = DestroyArena(arena_index_);
+    Status s = DestroyArena(arena_index);
     assert(s.ok());
     s.PermitUncheckedError();
   }
 }
@@ -90,7 +97,8 @@ size_t JemallocNodumpAllocator::UsableSize(void* p,
 
 void* JemallocNodumpAllocator::Allocate(size_t size) {
   int tcache_flag = GetThreadSpecificCache(size);
-  return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+  uint32_t arena_index = GetArenaIndex();
+  return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag);
 }
 
 void JemallocNodumpAllocator::Deallocate(void* p) {
@@ -105,45 +113,71 @@ void JemallocNodumpAllocator::Deallocate(void* p) {
   dallocx(p, tcache_flag);
 }
 
+uint32_t JemallocNodumpAllocator::GetArenaIndex() const {
+  if (arena_indexes_.size() == 1) {
+    return arena_indexes_[0];
+  }
+
+  static std::atomic<uint32_t> next_seed = 0;
+  // Core-local may work in place of `thread_local` as we should be able to
+  // tolerate occasional stale reads in thread migration cases. However we need
+  // to make Random thread-safe and prevent cacheline bouncing. Whether this is
+  // worthwhile is still an open question.
+  thread_local Random tl_random(next_seed.fetch_add(1));
+  return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())];
+}
+
 Status JemallocNodumpAllocator::InitializeArenas() {
-  // Create arena.
-  size_t arena_index_size = sizeof(arena_index_);
-  int ret =
-      mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0);
-  if (ret != 0) {
-    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
-                              std::to_string(ret));
-  }
-  assert(arena_index_ != 0);
+  assert(!init_);
+  init_ = true;
 
-  // Read existing hooks.
-  std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks";
-  extent_hooks_t* hooks;
-  size_t hooks_size = sizeof(hooks);
-  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
-  if (ret != 0) {
-    return Status::Incomplete("Failed to read existing hooks, error code: " +
-                              std::to_string(ret));
-  }
+  for (size_t i = 0; i < options_.num_arenas; i++) {
+    // Create arena.
+    unsigned arena_index;
+    size_t arena_index_size = sizeof(arena_index);
+    int ret =
+        mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete(
+          "Failed to create jemalloc arena, error code: " +
+          std::to_string(ret));
+    }
+    arena_indexes_.push_back(arena_index);
 
-  // Store existing alloc.
-  extent_alloc_t* original_alloc = hooks->alloc;
-  extent_alloc_t* expected = nullptr;
-  bool success =
-      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
-          expected, original_alloc);
-  if (!success && original_alloc != expected) {
-    return Status::Incomplete("Original alloc conflict.");
-  }
+    // Read existing hooks.
+    std::string key =
+        "arena." + std::to_string(arena_indexes_[i]) + ".extent_hooks";
+    extent_hooks_t* hooks;
+    size_t hooks_size = sizeof(hooks);
+    ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete("Failed to read existing hooks, error code: " +
+                                std::to_string(ret));
+    }
 
-  // Set the custom hook.
-  arena_hooks_.reset(new extent_hooks_t(*hooks));
-  arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc;
-  extent_hooks_t* hooks_ptr = arena_hooks_.get();
-  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
-  if (ret != 0) {
-    return Status::Incomplete("Failed to set custom hook, error code: " +
-                              std::to_string(ret));
+    // Store existing alloc.
+    extent_alloc_t* original_alloc = hooks->alloc;
+    extent_alloc_t* expected = nullptr;
+    bool success =
+        JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+            expected, original_alloc);
+    if (!success && original_alloc != expected) {
+      // This could happen if jemalloc creates new arenas with different initial
+      // values in their `alloc` function pointers. See `original_alloc_` API
+      // doc for more details.
+      return Status::Incomplete("Original alloc conflict.");
+    }
+
+    // Set the custom hook.
+    per_arena_hooks_.emplace_back();
+    per_arena_hooks_.back().reset(new extent_hooks_t(*hooks));
+    per_arena_hooks_.back()->alloc = &JemallocNodumpAllocator::Alloc;
+    extent_hooks_t* hooks_ptr = per_arena_hooks_.back().get();
+    ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+    if (ret != 0) {
+      return Status::Incomplete("Failed to set custom hook, error code: " +
+                                std::to_string(ret));
+    }
   }
   return Status::OK();
 }
@@ -161,6 +195,8 @@ Status JemallocNodumpAllocator::PrepareOptions(
              options_.tcache_size_upper_bound) {
     return Status::InvalidArgument(
         "tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
+  } else if (options_.num_arenas < 1) {
+    return Status::InvalidArgument("num_arenas must be a positive integer");
   } else if (IsMutable()) {
     Status s = MemoryAllocator::PrepareOptions(config_options);
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
@@ -221,7 +257,7 @@ void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
   return result;
 }
 
-Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) {
   assert(arena_index != 0);
   std::string key = "arena." + std::to_string(arena_index) + ".destroy";
   int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
diff --git a/memory/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h
index a1e1547d7b..2bdbaeb328 100644
--- a/memory/jemalloc_nodump_allocator.h
+++ b/memory/jemalloc_nodump_allocator.h
@@ -24,6 +24,10 @@
 #endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
 
 namespace ROCKSDB_NAMESPACE {
+
+// Allocation requests are randomly sharded across
+// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per-
+// arena mutexes.
 class JemallocNodumpAllocator : public BaseMemoryAllocator {
  public:
   explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
@@ -38,7 +42,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
     return IsSupported(&unused);
   }
   static bool IsSupported(std::string* why);
-  bool IsMutable() const { return arena_index_ == 0; }
+  bool IsMutable() const { return !init_; }
 
   Status PrepareOptions(const ConfigOptions& config_options) override;
 
@@ -52,9 +56,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
 #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
   Status InitializeArenas();
 
-  friend Status NewJemallocNodumpAllocator(
-      JemallocAllocatorOptions& options,
-      std::shared_ptr<MemoryAllocator>* memory_allocator);
+  uint32_t GetArenaIndex() const;
 
   // Custom alloc hook to replace jemalloc default alloc.
   static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
@@ -62,7 +64,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
                      unsigned arena_ind);
 
   // Destroy arena on destruction of the allocator, or on failure.
-  static Status DestroyArena(unsigned arena_index);
+  static Status DestroyArena(uint32_t arena_index);
 
   // Destroy tcache on destruction of the allocator, or thread exit.
   static void DestroyThreadSpecificCache(void* ptr);
@@ -78,17 +80,20 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
   // NewJemallocNodumpAllocator is thread-safe.
   //
   // Hack: original_alloc_ needs to be static for Alloc() to access it.
-  // alloc needs to be static to pass to jemalloc as function pointer.
+  // alloc needs to be static to pass to jemalloc as function pointer. We can
+  // use a single process-wide value as long as we assume that any newly created
+  // arena has the same original value in its `alloc` function pointer.
   static std::atomic<extent_alloc_t*> original_alloc_;
 
   // Custom hooks has to outlive corresponding arena.
-  std::unique_ptr<extent_hooks_t> arena_hooks_;
+  std::vector<std::unique_ptr<extent_hooks_t>> per_arena_hooks_;
 
   // Hold thread-local tcache index.
   ThreadLocalPtr tcache_;
+
+  std::vector<uint32_t> arena_indexes_;
 #endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
-  // Arena index.
-  unsigned arena_index_;
+  bool init_ = false;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
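For readers who want to experiment with the sharding scheme outside the RocksDB tree, the snippet below is a minimal standalone sketch of what `GetArenaIndex()` does, with `std::mt19937` standing in for RocksDB's `util/random.h` `Random` and a 32-bit multiply-shift standing in for `FastRange32`. The `ArenaPicker` name is invented for illustration and is not part of the patch.

#include <atomic>
#include <cstdint>
#include <random>
#include <vector>

// Standalone sketch of the per-thread arena sharding used by GetArenaIndex().
class ArenaPicker {
 public:
  explicit ArenaPicker(std::vector<uint32_t> arena_indexes)
      : arena_indexes_(std::move(arena_indexes)) {}

  uint32_t Pick() const {
    if (arena_indexes_.size() == 1) {
      return arena_indexes_[0];
    }
    // Each thread seeds its own generator exactly once, so threads spread
    // their requests across arenas without sharing any RNG state.
    static std::atomic<uint32_t> next_seed{0};
    thread_local std::mt19937 tl_random(next_seed.fetch_add(1));
    uint32_t r = static_cast<uint32_t>(tl_random());
    // Multiply-shift maps r uniformly onto [0, size) without a modulo; this
    // is the same idea as FastRange32.
    size_t idx =
        static_cast<size_t>((uint64_t{r} * arena_indexes_.size()) >> 32);
    return arena_indexes_[idx];
  }

 private:
  std::vector<uint32_t> arena_indexes_;
};

Whether a per-thread RNG beats a simple atomic round-robin counter depends on how hot the allocation path is; as the comment in GetArenaIndex() notes, a core-local variant is left as an open question.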