mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-25 14:31:35 +00:00
925d8252e5
Summary:
RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`).
This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400
Test Plan:
- mysqld setup
- Branch: fb-mysql-8.0.28 (653eba2e56)
- Build: `mysqlbuild.sh --clean --release`
- Set env var `MALLOC_CONF='tcache_max:$tcache_max'`
- Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384`
- Ran under /usr/bin/time
- Large database scenario
- Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop`
- Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32`
- Results:
| tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) |
|---|---|---|---|
| 4096 | **(baseline)** | 4541 | 37.1 |
| 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) |
| 4096 | 8 | 4687 (+3%) | 10.2 (-73%) |
| 16384 | **(baseline)** | 4514 | 8.4 |
| 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) |
| 16384 | 8 | 4580 (+1%) | 8.5 (+1%) |
Reviewed By: pdillinger
Differential Revision: D45220794
Pulled By: ajkr
fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
304 lines
10 KiB
C++
304 lines
10 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "memory/jemalloc_nodump_allocator.h"

#include <cerrno>
#include <string>
#include <thread>

#include "port/likely.h"
#include "port/port.h"
#include "rocksdb/convenience.h"
#include "rocksdb/utilities/customizable_util.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/options_type.h"
#include "util/fastrange.h"
#include "util/random.h"
#include "util/string_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
// Process-wide copy of jemalloc's own extent `alloc` hook. It starts out null,
// is published once via compare-exchange in InitializeArenas(), and is read
// by the Alloc() hook to forward each extent request to jemalloc.
std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
|
|
|
|
static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = {
|
|
{"limit_tcache_size",
|
|
{offsetof(struct JemallocAllocatorOptions, limit_tcache_size),
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
|
OptionTypeFlags::kNone}},
|
|
{"tcache_size_lower_bound",
|
|
{offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound),
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
OptionTypeFlags::kNone}},
|
|
{"tcache_size_upper_bound",
|
|
{offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound),
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
OptionTypeFlags::kNone}},
|
|
{"num_arenas",
|
|
{offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT,
|
|
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
|
|
};
|
|
// Reports whether the no-dump allocator can be used in this build/process.
//
// Returns true when it can. Otherwise returns false and writes a
// human-readable reason into `*why` (which must be non-null).
bool JemallocNodumpAllocator::IsSupported(std::string* why) {
#ifndef ROCKSDB_JEMALLOC
  *why = "Not compiled with ROCKSDB_JEMALLOC";
  return false;
#else
  // Fold the compile-time capability into a constant so both failure causes
  // (jemalloc missing at runtime, no-dump support missing at build time)
  // share a single exit path and message.
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
  constexpr bool kNodumpCompiledIn = true;
#else
  constexpr bool kNodumpCompiledIn = false;
#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
  static const std::string unsupported =
      "JemallocNodumpAllocator only available with jemalloc version >= 5 "
      "and MADV_DONTDUMP is available.";
  if (HasJemalloc() && kNodumpCompiledIn) {
    return true;
  }
  *why = unsupported;
  return false;
#endif  // ROCKSDB_JEMALLOC
}
|
|
|
|
// Copies `options` into the allocator and registers the option table so the
// copy can be configured by name. No arenas are created here; that happens
// in PrepareOptions().
JemallocNodumpAllocator::JemallocNodumpAllocator(
    JemallocAllocatorOptions& options)
    : options_(options)
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
      ,
      // Thread-local tcache slots are released through
      // DestroyThreadSpecificCache.
      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache)
#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
{
  RegisterOptions(&options_, &jemalloc_type_info);
}
|
|
|
|
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
|
|
// Tears down every per-thread tcache first, then every manual arena.
JemallocNodumpAllocator::~JemallocNodumpAllocator() {
  // The tcaches must be destroyed before the arenas are.
  autovector<void*> live_tcaches;
  tcache_.Scrape(&live_tcaches, nullptr);
  for (void* tcache : live_tcaches) {
    DestroyThreadSpecificCache(tcache);
  }

  for (auto index : arena_indexes_) {
    // A failure trips the assert in debug builds and is otherwise ignored.
    Status destroy_status = DestroyArena(index);
    assert(destroy_status.ok());
    destroy_status.PermitUncheckedError();
  }
}
|
|
|
|
size_t JemallocNodumpAllocator::UsableSize(void* p,
|
|
size_t /*allocation_size*/) const {
|
|
return malloc_usable_size(static_cast<void*>(p));
|
|
}
|
|
|
|
void* JemallocNodumpAllocator::Allocate(size_t size) {
|
|
int tcache_flag = GetThreadSpecificCache(size);
|
|
uint32_t arena_index = GetArenaIndex();
|
|
return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag);
|
|
}
|
|
|
|
void JemallocNodumpAllocator::Deallocate(void* p) {
|
|
// Obtain tcache.
|
|
size_t size = 0;
|
|
if (options_.limit_tcache_size) {
|
|
size = malloc_usable_size(p);
|
|
}
|
|
int tcache_flag = GetThreadSpecificCache(size);
|
|
// No need to pass arena index to dallocx(). Jemalloc will find arena index
|
|
// from its own metadata.
|
|
dallocx(p, tcache_flag);
|
|
}
|
|
|
|
uint32_t JemallocNodumpAllocator::GetArenaIndex() const {
|
|
if (arena_indexes_.size() == 1) {
|
|
return arena_indexes_[0];
|
|
}
|
|
|
|
static std::atomic<uint32_t> next_seed = 0;
|
|
// Core-local may work in place of `thread_local` as we should be able to
|
|
// tolerate occasional stale reads in thread migration cases. However we need
|
|
// to make Random thread-safe and prevent cacheline bouncing. Whether this is
|
|
// worthwhile is still an open question.
|
|
thread_local Random tl_random(next_seed.fetch_add(1));
|
|
return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())];
|
|
}
|
|
|
|
// Creates `options_.num_arenas` manual jemalloc arenas and installs the
// no-dump extent allocation hook on each of them.
//
// Must run at most once per allocator (asserted). Returns Status::Incomplete
// on the first jemalloc call that fails, or if an arena's built-in `alloc`
// hook differs from the one already recorded in `original_alloc_`. Arenas
// created before a failure stay in `arena_indexes_`.
Status JemallocNodumpAllocator::InitializeArenas() {
  assert(!init_);
  init_ = true;

  for (size_t i = 0; i < options_.num_arenas; i++) {
    // Step 1: ask jemalloc for a new arena and remember its index.
    unsigned arena_index;
    size_t arena_index_size = sizeof(arena_index);
    int ret =
        mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
    if (ret != 0) {
      return Status::Incomplete(
          "Failed to create jemalloc arena, error code: " +
          std::to_string(ret));
    }
    arena_indexes_.push_back(arena_index);

    // Step 2: fetch the hooks jemalloc installed on the new arena.
    const std::string hooks_key =
        "arena." + std::to_string(arena_index) + ".extent_hooks";
    extent_hooks_t* current_hooks;
    size_t current_hooks_size = sizeof(current_hooks);
    ret = mallctl(hooks_key.c_str(), &current_hooks, &current_hooks_size,
                  nullptr, 0);
    if (ret != 0) {
      return Status::Incomplete("Failed to read existing hooks, error code: " +
                                std::to_string(ret));
    }

    // Step 3: record the built-in alloc hook. The first arena publishes it;
    // every later arena must agree with the recorded value.
    extent_alloc_t* const arena_alloc = current_hooks->alloc;
    extent_alloc_t* recorded_alloc = nullptr;
    const bool published =
        JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
            recorded_alloc, arena_alloc);
    if (!published && recorded_alloc != arena_alloc) {
      // This could happen if jemalloc creates new arenas with different initial
      // values in their `alloc` function pointers. See `original_alloc_` API
      // doc for more details.
      return Status::Incomplete("Original alloc conflict.");
    }

    // Step 4: clone the hooks, swap in our alloc wrapper, and install the
    // clone. The clone is owned by `per_arena_hooks_`.
    per_arena_hooks_.emplace_back();
    auto& custom_hooks = per_arena_hooks_.back();
    custom_hooks.reset(new extent_hooks_t(*current_hooks));
    custom_hooks->alloc = &JemallocNodumpAllocator::Alloc;
    extent_hooks_t* custom_hooks_ptr = custom_hooks.get();
    ret = mallctl(hooks_key.c_str(), nullptr, nullptr, &custom_hooks_ptr,
                  sizeof(custom_hooks_ptr));
    if (ret != 0) {
      return Status::Incomplete("Failed to set custom hook, error code: " +
                                std::to_string(ret));
    }
  }
  return Status::OK();
}
|
|
|
|
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
|
|
|
|
// Validates the configured options and, on the first successful call, creates
// the jemalloc arenas.
//
// Returns NotSupported when IsSupported() says so, InvalidArgument for an
// empty tcache size window or a zero arena count, and OK without further work
// when the allocator is no longer mutable (already prepared).
Status JemallocNodumpAllocator::PrepareOptions(
    const ConfigOptions& config_options) {
  std::string unsupported_reason;
  if (!IsSupported(&unsupported_reason)) {
    return Status::NotSupported(unsupported_reason);
  }

  const bool empty_tcache_window =
      options_.limit_tcache_size &&
      options_.tcache_size_lower_bound >= options_.tcache_size_upper_bound;
  if (empty_tcache_window) {
    return Status::InvalidArgument(
        "tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
  }

  if (options_.num_arenas < 1) {
    return Status::InvalidArgument("num_arenas must be a positive integer");
  }

  if (!IsMutable()) {
    // Already prepared
    return Status::OK();
  }

  Status s = MemoryAllocator::PrepareOptions(config_options);
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
  if (s.ok()) {
    s = InitializeArenas();
  }
#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
  return s;
}
|
|
|
|
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
|
|
int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
|
|
// We always enable tcache. The only corner case is when there are a ton of
|
|
// threads accessing with low frequency, then it could consume a lot of
|
|
// memory (may reach # threads * ~1MB) without bringing too much benefit.
|
|
if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
|
|
size > options_.tcache_size_upper_bound)) {
|
|
return MALLOCX_TCACHE_NONE;
|
|
}
|
|
unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
|
|
if (UNLIKELY(tcache_index == nullptr)) {
|
|
// Instantiate tcache.
|
|
tcache_index = new unsigned(0);
|
|
size_t tcache_index_size = sizeof(unsigned);
|
|
int ret =
|
|
mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0);
|
|
if (ret != 0) {
|
|
// No good way to expose the error. Silently disable tcache.
|
|
delete tcache_index;
|
|
return MALLOCX_TCACHE_NONE;
|
|
}
|
|
tcache_.Reset(static_cast<void*>(tcache_index));
|
|
}
|
|
return MALLOCX_TCACHE(*tcache_index);
|
|
}
|
|
// Extent allocation hook installed on every arena by InitializeArenas().
//
// Forwards the request to jemalloc's own `alloc` hook (kept in
// `original_alloc_`) and, when that succeeds, marks the returned range with
// MADV_DONTDUMP. Returns whatever the original hook returned; a madvise()
// failure is logged to stderr and asserts in debug builds but does not fail
// the allocation.
void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
                                     size_t size, size_t alignment, bool* zero,
                                     bool* commit, unsigned arena_ind) {
  extent_alloc_t* original_alloc =
      original_alloc_.load(std::memory_order_relaxed);
  assert(original_alloc != nullptr);
  void* result = original_alloc(extent, new_addr, size, alignment, zero, commit,
                                arena_ind);
  if (result != nullptr) {
    if (madvise(result, size, MADV_DONTDUMP) != 0) {
      // madvise() only signals failure through its return value; the actual
      // reason is in errno. Capture it before fprintf() can overwrite it.
      const int err = errno;
      fprintf(stderr,
              "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error "
              "code: %d\n",
              err);
      assert(false);
    }
  }
  return result;
}
|
|
|
|
// Asks jemalloc to destroy the arena with the given index. Arena 0 is never
// a valid argument (asserted). Returns Status::Incomplete carrying jemalloc's
// error code on failure.
Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) {
  assert(arena_index != 0);
  const std::string destroy_key =
      "arena." + std::to_string(arena_index) + ".destroy";
  const int ret = mallctl(destroy_key.c_str(), nullptr, nullptr, nullptr, 0);
  if (ret == 0) {
    return Status::OK();
  }
  return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
                            std::to_string(ret));
}
|
|
|
|
void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
|
|
assert(ptr != nullptr);
|
|
unsigned* tcache_index = static_cast<unsigned*>(ptr);
|
|
size_t tcache_index_size = sizeof(unsigned);
|
|
int ret __attribute__((__unused__)) =
|
|
mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size);
|
|
// Silently ignore error.
|
|
assert(ret == 0);
|
|
delete tcache_index;
|
|
}
|
|
|
|
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
|
|
|
|
// Factory for the jemalloc no-dump allocator.
//
// On success stores a fully prepared allocator in `*memory_allocator` and
// returns OK. Returns InvalidArgument when `memory_allocator` is null,
// NotSupported when built without jemalloc, and otherwise whatever
// PrepareOptions() reports; `*memory_allocator` is untouched on failure.
Status NewJemallocNodumpAllocator(
    JemallocAllocatorOptions& options,
    std::shared_ptr<MemoryAllocator>* memory_allocator) {
  if (memory_allocator == nullptr) {
    return Status::InvalidArgument("memory_allocator must be non-null.");
  }
#ifndef ROCKSDB_JEMALLOC
  (void)options;
  return Status::NotSupported("Not compiled with JEMALLOC");
#else
  std::unique_ptr<MemoryAllocator> candidate(
      new JemallocNodumpAllocator(options));
  Status s = candidate->PrepareOptions(ConfigOptions());
  if (!s.ok()) {
    // `candidate` is destroyed here, releasing anything it set up.
    return s;
  }
  memory_allocator->reset(candidate.release());
  return s;
#endif
}
|
|
} // namespace ROCKSDB_NAMESPACE
|