// rocksdb/memory/jemalloc_nodump_allocator.h

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <atomic>
#include <vector>
#include "port/jemalloc_helper.h"
#include "port/port.h"
#include "rocksdb/memory_allocator.h"
#include "util/thread_local.h"
#include "utilities/memory_allocators.h"
// Feature gate: ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR is defined only when all of
// the following hold:
//   - ROCKSDB_JEMALLOC and ROCKSDB_PLATFORM_POSIX are both defined,
//   - JEMALLOC_VERSION_MAJOR is at least 5, and
//   - MADV_DONTDUMP is defined.
// The rest of this header tests the macro to decide whether the
// jemalloc-specific members of JemallocNodumpAllocator are declared.
#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
// NOTE(review): presumably included here so that MADV_DONTDUMP is visible to
// the check below -- confirm.
#include <sys/mman.h>
#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
namespace ROCKSDB_NAMESPACE {
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
// Allocation requests are randomly sharded across
// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per-
// arena mutexes.
class JemallocNodumpAllocator : public BaseMemoryAllocator {
public:
explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
~JemallocNodumpAllocator();
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
static const char* kClassName() { return "JemallocNodumpAllocator"; }
const char* Name() const override { return kClassName(); }
static bool IsSupported() {
std::string unused;
return IsSupported(&unused);
}
static bool IsSupported(std::string* why);
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
bool IsMutable() const { return !init_; }
Status PrepareOptions(const ConfigOptions& config_options) override;
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
void* Allocate(size_t size) override;
void Deallocate(void* p) override;
size_t UsableSize(void* p, size_t allocation_size) const override;
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
private:
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
Status InitializeArenas();
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
uint32_t GetArenaIndex() const;
// Custom alloc hook to replace jemalloc default alloc.
static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
size_t alignment, bool* zero, bool* commit,
unsigned arena_ind);
// Destroy arena on destruction of the allocator, or on failure.
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
static Status DestroyArena(uint32_t arena_index);
// Destroy tcache on destruction of the allocator, or thread exit.
static void DestroyThreadSpecificCache(void* ptr);
// Get or create tcache. Return flag suitable to use with `mallocx`:
// either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
int GetThreadSpecificCache(size_t size);
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
JemallocAllocatorOptions options_;
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
// A function pointer to jemalloc default alloc. Use atomic to make sure
// NewJemallocNodumpAllocator is thread-safe.
//
// Hack: original_alloc_ needs to be static for Alloc() to access it.
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
// alloc needs to be static to pass to jemalloc as function pointer. We can
// use a single process-wide value as long as we assume that any newly created
// arena has the same original value in its `alloc` function pointer.
static std::atomic<extent_alloc_t*> original_alloc_;
// Custom hooks has to outlive corresponding arena.
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
std::vector<std::unique_ptr<extent_hooks_t>> per_arena_hooks_;
// Hold thread-local tcache index.
ThreadLocalPtr tcache_;
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
std::vector<uint32_t> arena_indexes_;
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
Shard JemallocNodumpAllocator (#11400) Summary: RocksDB's jemalloc no-dump allocator (`NewJemallocNodumpAllocator()`) was using a single manual arena. This arena's lock contention could be very high when thread caching is disabled for RocksDB blocks (e.g., when using `MALLOC_CONF='tcache_max:4096'` and `rocksdb_block_size=16384`). This PR changes the jemalloc no-dump allocator to use a configurable number of manual arenas. That number is required to be a power of two so we can avoid division. The allocator shards allocation requests randomly across those manual arenas. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11400 Test Plan: - mysqld setup - Branch: fb-mysql-8.0.28 (https://github.com/facebook/mysql-5.6/commit/653eba2e56cfba4eac0c851ac9a70b2da9607527) - Build: `mysqlbuild.sh --clean --release` - Set env var `MALLOC_CONF='tcache_max:$tcache_max'` - Added CLI args `--rocksdb_cache_dump=false --rocksdb_block_cache_size=4294967296 --rocksdb_block_size=16384` - Ran under /usr/bin/time - Large database scenario - Setup command: `mysqlslap -h 127.0.0.1 -P 13020 --auto-generate-sql=1 --auto-generate-sql-load-type=write --auto-generate-sql-guid-primary=1 --number-char-cols=8 --auto-generate-sql-execute-number=262144 --concurrency=32 --no-drop` - Benchmark command: `mysqlslap -h 127.0.0.1 -P 13020 --query='select count(*) from mysqlslap.t1;' --number-of-queries=320 --concurrency=32` - Results: | tcache_max | num_arenas | Peak RSS MB (% change) | Query latency seconds (% change) | |---|---|---|---| | 4096 | **(baseline)** | 4541 | 37.1 | | 4096 | 1 | 4535 (-0.1%) | 36.7 (-1%) | | 4096 | 8 | 4687 (+3%) | 10.2 (-73%) | | 16384 | **(baseline)** | 4514 | 8.4 | | 16384 | 1 | 4526 (+0.3%) | 8.5 (+1%) | | 16384 | 8 | 4580 (+1%) | 8.5 (+1%) | Reviewed By: pdillinger Differential Revision: D45220794 Pulled By: ajkr fbshipit-source-id: 9a50c9872bdef5d299e52b115a65ee8a5557d58d
2023-05-01 17:14:43 +00:00
bool init_ = false;
};
} // namespace ROCKSDB_NAMESPACE