diff --git a/.circleci/config.yml b/.circleci/config.yml
index cdb800aa22..6cf1407cfc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -584,7 +584,7 @@ jobs:
           name: "Test RocksDB"
           shell: powershell.exe
           command: |
-            build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
+            build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
   build-linux-java:
     executor: linux-docker
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73d5b52a92..61de269f88 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -796,6 +796,7 @@ set(SOURCES
         options/options.cc
         options/options_helper.cc
         options/options_parser.cc
+        port/mmap.cc
         port/stack_trace.cc
         table/adaptive/adaptive_table_factory.cc
         table/block_based/binary_search_index_reader.cc
diff --git a/TARGETS b/TARGETS
index 18f5692881..8c2805146b 100644
--- a/TARGETS
+++ b/TARGETS
@@ -163,6 +163,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "options/options.cc",
         "options/options_helper.cc",
         "options/options_parser.cc",
+        "port/mmap.cc",
         "port/port_posix.cc",
         "port/stack_trace.cc",
         "port/win/env_default.cc",
@@ -502,6 +503,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
         "options/options.cc",
         "options/options_helper.cc",
         "options/options_parser.cc",
+        "port/mmap.cc",
         "port/port_posix.cc",
         "port/stack_trace.cc",
         "port/win/env_default.cc",
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 73d982024a..1648d8b101 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -49,6 +49,9 @@
 #include "util/string_util.h"
 #include "utilities/merge_operators.h"
 
+// In case defined by Windows headers
+#undef small
+
 namespace ROCKSDB_NAMESPACE {
 
 class MockEnv;
diff --git a/db/memtable.cc b/db/memtable.cc
index a4bcaa322e..3fc1c8fea7 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -76,7 +76,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
     : comparator_(cmp),
       moptions_(ioptions, mutable_cf_options),
       refs_(0),
-      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+      kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)),
       mem_tracker_(write_buffer_manager),
       arena_(moptions_.arena_block_size,
              (write_buffer_manager != nullptr &&
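The memtable change above only re-scopes an existing helper: OptimizeBlockSize moves from a free function into Arena as a static member, so call sites now qualify it. As a standalone sketch (not part of the patch; the name OptimizeBlockSizeSketch and the main() driver are invented here), this is the clamp-then-round-up behavior the helper implements, with constants mirroring arena.h:

#include <algorithm>
#include <cstddef>
#include <cstdio>

constexpr size_t kMinBlockSize = 4096;
constexpr size_t kMaxBlockSize = 2u << 30;
constexpr size_t kAlignUnit = alignof(std::max_align_t);

size_t OptimizeBlockSizeSketch(size_t block_size) {
  // Clamp into [kMinBlockSize, kMaxBlockSize] ...
  block_size = std::max(kMinBlockSize, std::min(kMaxBlockSize, block_size));
  // ... then round up to a multiple of the alignment unit.
  if (block_size % kAlignUnit != 0) {
    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
  }
  return block_size;
}

int main() {
  // With a 16-byte align unit: 5000 -> 5008, 100 -> 4096.
  std::printf("%zu %zu\n", OptimizeBlockSizeSketch(5000),
              OptimizeBlockSizeSketch(100));
}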
diff --git a/memory/arena.cc b/memory/arena.cc
index 10b8969b4d..d42868edf0 100644
--- a/memory/arena.cc
+++ b/memory/arena.cc
@@ -8,9 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "memory/arena.h"
-#ifndef OS_WIN
-#include <sys/mman.h>
-#endif
+
 #include <algorithm>
 
 #include "logging/logging.h"
@@ -22,16 +20,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-// MSVC complains that it is already defined since it is static in the header.
-#ifndef _MSC_VER
-const size_t Arena::kInlineSize;
-#endif
-
-const size_t Arena::kMinBlockSize = 4096;
-const size_t Arena::kMaxBlockSize = 2u << 30;
-static const int kAlignUnit = alignof(max_align_t);
-
-size_t OptimizeBlockSize(size_t block_size) {
+size_t Arena::OptimizeBlockSize(size_t block_size) {
   // Make sure block_size is in optimal range
   block_size = std::max(Arena::kMinBlockSize, block_size);
   block_size = std::min(Arena::kMaxBlockSize, block_size);
@@ -53,14 +42,12 @@ Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size)
   blocks_memory_ += alloc_bytes_remaining_;
   aligned_alloc_ptr_ = inline_block_;
   unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
-#ifdef MAP_HUGETLB
-  hugetlb_size_ = huge_page_size;
-  if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
-    hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+  if (MemMapping::kHugePageSupported) {
+    hugetlb_size_ = huge_page_size;
+    if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+      hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+    }
   }
-#else
-  (void)huge_page_size;
-#endif
   if (tracker_ != nullptr) {
     tracker_->Allocate(kInlineSize);
   }
@@ -71,21 +58,6 @@ Arena::~Arena() {
     assert(tracker_->is_freed());
     tracker_->FreeMem();
   }
-  for (const auto& block : blocks_) {
-    delete[] block;
-  }
-
-#ifdef MAP_HUGETLB
-  for (const auto& mmap_info : huge_blocks_) {
-    if (mmap_info.addr_ == nullptr) {
-      continue;
-    }
-    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
-    if (ret != 0) {
-      // TODO(sdong): Better handling
-    }
-  }
-#endif
 }
 
 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -99,12 +71,10 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   // We waste the remaining space in the current block.
   size_t size = 0;
   char* block_head = nullptr;
-#ifdef MAP_HUGETLB
-  if (hugetlb_size_) {
+  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0) {
     size = hugetlb_size_;
     block_head = AllocateFromHugePage(size);
   }
-#endif
   if (!block_head) {
     size = kBlockSize;
     block_head = AllocateNewBlock(size);
@@ -123,45 +93,22 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
 }
 
 char* Arena::AllocateFromHugePage(size_t bytes) {
-#ifdef MAP_HUGETLB
-  if (hugetlb_size_ == 0) {
-    return nullptr;
+  MemMapping mm = MemMapping::AllocateHuge(bytes);
+  auto addr = static_cast<char*>(mm.Get());
+  if (addr) {
+    huge_blocks_.push_back(std::move(mm));
+    blocks_memory_ += bytes;
+    if (tracker_ != nullptr) {
+      tracker_->Allocate(bytes);
+    }
   }
-  // Reserve space in `huge_blocks_` before calling `mmap`.
-  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
-  // own memory and do fewer reallocations.
-  //
-  // - If `emplace_back` throws, no memory leaks because we haven't called
-  //   `mmap` yet.
-  // - If `mmap` throws, no memory leaks because the vector will be cleaned up
-  //   via RAII.
-  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
-
-  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
-                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
-
-  if (addr == MAP_FAILED) {
-    return nullptr;
-  }
-  huge_blocks_.back() = MmapInfo(addr, bytes);
-  blocks_memory_ += bytes;
-  if (tracker_ != nullptr) {
-    tracker_->Allocate(bytes);
-  }
-  return reinterpret_cast<char*>(addr);
-#else
-  (void)bytes;
-  return nullptr;
-#endif
+  return addr;
 }
 
 char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
                              Logger* logger) {
-  assert((kAlignUnit & (kAlignUnit - 1)) ==
-         0);  // Pointer size should be a power of 2
-
-#ifdef MAP_HUGETLB
-  if (huge_page_size > 0 && bytes > 0) {
+  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0 &&
+      huge_page_size > 0 && bytes > 0) {
     // Allocate from a huge page TLB table.
     size_t reserved_size =
         ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
@@ -177,10 +124,6 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
       return addr;
     }
   }
-#else
-  (void)huge_page_size;
-  (void)logger;
-#endif
 
   size_t current_mod =
       reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
@@ -200,17 +143,10 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
 }
 
 char* Arena::AllocateNewBlock(size_t block_bytes) {
-  // Reserve space in `blocks_` before allocating memory via new.
-  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
-  // own memory and do fewer reallocations.
-  //
-  // - If `emplace_back` throws, no memory leaks because we haven't called `new`
-  //   yet.
-  // - If `new` throws, no memory leaks because the vector will be cleaned up
-  //   via RAII.
-  blocks_.emplace_back(nullptr);
+  auto uniq = std::make_unique<char[]>(block_bytes);
+  char* block = uniq.get();
+  blocks_.push_back(std::move(uniq));
 
-  char* block = new char[block_bytes];
   size_t allocated_size;
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
   allocated_size = malloc_usable_size(block);
@@ -227,7 +163,6 @@ char* Arena::AllocateNewBlock(size_t block_bytes) {
   if (tracker_ != nullptr) {
     tracker_->Allocate(allocated_size);
   }
-  blocks_.back() = block;
   return block;
 }
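With blocks_ now holding std::unique_ptr<char[]>, the careful emplace-then-assign dance and the explicit delete[] loop in ~Arena both disappear; destroying the container frees every block. A minimal sketch (not from the patch; BlockList is an invented name) of the same ownership pattern:

#include <cstddef>
#include <deque>
#include <memory>
#include <utility>

class BlockList {
 public:
  char* AllocateBlock(size_t bytes) {
    auto uniq = std::make_unique<char[]>(bytes);
    char* raw = uniq.get();
    blocks_.push_back(std::move(uniq));  // container keeps ownership
    // The heap block itself never moves, so the raw pointer stays valid
    // for as long as this BlockList lives.
    return raw;
  }

 private:
  // Every block is freed automatically in ~BlockList(); no manual loop.
  std::deque<std::unique_ptr<char[]>> blocks_;
};

int main() {
  BlockList list;
  char* p = list.AllocateBlock(4096);
  p[0] = 'x';
}  // blocks released here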
diff --git a/memory/arena.h b/memory/arena.h
index 1de04c4770..39399aa71b 100644
--- a/memory/arena.h
+++ b/memory/arena.h
@@ -12,16 +12,13 @@
 // size, it uses malloc to directly get the requested size.
 
 #pragma once
-#ifndef OS_WIN
-#include <sys/mman.h>
-#endif
-#include <assert.h>
-#include <stdint.h>
-#include <cerrno>
+
 #include <cstddef>
-#include <vector>
+#include <deque>
+
 #include "memory/allocator.h"
-#include "util/mutexlock.h"
+#include "port/mmap.h"
+#include "rocksdb/env.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -31,9 +28,13 @@ class Arena : public Allocator {
   Arena(const Arena&) = delete;
   void operator=(const Arena&) = delete;
 
-  static const size_t kInlineSize = 2048;
-  static const size_t kMinBlockSize;
-  static const size_t kMaxBlockSize;
+  static constexpr size_t kInlineSize = 2048;
+  static constexpr size_t kMinBlockSize = 4096;
+  static constexpr size_t kMaxBlockSize = 2u << 30;
+
+  static constexpr unsigned kAlignUnit = alignof(std::max_align_t);
+  static_assert((kAlignUnit & (kAlignUnit - 1)) == 0,
+                "Pointer size should be power of 2");
 
   // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
   // supported hugepage size of the system), block allocation will try huge
@@ -63,7 +64,7 @@ class Arena : public Allocator {
   // by the arena (exclude the space allocated but not yet used for future
   // allocations).
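The header now fixes both the constants and the alignment at compile time: kAlignUnit is alignof(std::max_align_t), checked by a static_assert, and the inline block is declared alignas(std::max_align_t). A small sketch (not from the patch; AlignUp is an invented helper) of the pointer-rounding arithmetic AllocateAligned relies on, assuming a power-of-two alignment as that static_assert enforces:

#include <cassert>
#include <cstdint>

// Round `ptr` up to the next multiple of `align` (align must be 2^k).
inline char* AlignUp(char* ptr, uintptr_t align) {
  assert((align & (align - 1)) == 0);
  uintptr_t current_mod = reinterpret_cast<uintptr_t>(ptr) & (align - 1);
  uintptr_t slop = (current_mod == 0 ? 0 : align - current_mod);
  return ptr + slop;
}

int main() {
  alignas(64) char buf[128];
  char* p = AlignUp(buf + 3, 16);
  assert(reinterpret_cast<uintptr_t>(p) % 16 == 0);
  assert(p - (buf + 3) < 16);  // at most align-1 bytes of slop
  return 0;
}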
   size_t ApproximateMemoryUsage() const {
-    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+    return blocks_memory_ + blocks_.size() * sizeof(char*) -
            alloc_bytes_remaining_;
   }
 
@@ -81,21 +82,19 @@ class Arena : public Allocator {
     return blocks_.empty() && huge_blocks_.empty();
   }
 
+  // Check and adjust the block_size so that the return value is
+  // 1. in the range of [kMinBlockSize, kMaxBlockSize] and
+  // 2. a multiple of the align unit.
+  static size_t OptimizeBlockSize(size_t block_size);
+
 private:
-  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+  alignas(std::max_align_t) char inline_block_[kInlineSize];
   // Number of bytes allocated in one block
   const size_t kBlockSize;
-  // Array of new[] allocated memory blocks
-  using Blocks = std::vector<char*>;
-  Blocks blocks_;
-
-  struct MmapInfo {
-    void* addr_;
-    size_t length_;
-
-    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
-  };
-  std::vector<MmapInfo> huge_blocks_;
+  // Allocated memory blocks
+  std::deque<std::unique_ptr<char[]>> blocks_;
+  // Huge page allocations
+  std::deque<MemMapping> huge_blocks_;
   size_t irregular_block_num = 0;
 
   // Stats for current active block.
@@ -108,15 +107,15 @@ class Arena : public Allocator {
   // How many bytes left in currently active block?
   size_t alloc_bytes_remaining_ = 0;
 
-#ifdef MAP_HUGETLB
   size_t hugetlb_size_ = 0;
-#endif  // MAP_HUGETLB
+
   char* AllocateFromHugePage(size_t bytes);
   char* AllocateFallback(size_t bytes, bool aligned);
   char* AllocateNewBlock(size_t block_bytes);
 
   // Bytes of memory in blocks allocated so far
   size_t blocks_memory_ = 0;
+  // Non-owned
   AllocTracker* tracker_;
 };
 
@@ -133,9 +132,4 @@ inline char* Arena::Allocate(size_t bytes) {
   return AllocateFallback(bytes, false /* unaligned */);
 }
 
-// check and adjust the block_size so that the return value is
-// 1. in the range of [kMinBlockSize, kMaxBlockSize].
-// 2. the multiple of align unit.
-extern size_t OptimizeBlockSize(size_t block_size);
-
 }  // namespace ROCKSDB_NAMESPACE
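The test added next infers lazy mapping by watching minor page faults via getrusage(). A standalone sketch (not from the patch; MinorFaults is an invented name, POSIX-only, and the 4 KiB page size is an assumption) of that measurement idea:

#include <sys/resource.h>

#include <cstdio>
#include <cstdlib>

// Minor page faults accumulated by this process so far.
long MinorFaults() {
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage) != 0) {
    return -1;
  }
  return usage.ru_minflt;
}

int main() {
  long before = MinorFaults();
  // Touch 1 MiB of fresh zeroed memory, one byte per assumed 4 KiB page.
  char* p = static_cast<char*>(std::calloc(1 << 20, 1));
  for (int i = 0; i < (1 << 20); i += 4096) {
    p[i] = 1;
  }
  long after = MinorFaults();
  std::printf("minor faults while touching: %ld\n", after - before);
  std::free(p);
}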
#include "memory/arena.h" + +#ifndef OS_WIN +#include +#endif +#include "port/port.h" #include "test_util/testharness.h" #include "util/random.h" @@ -196,6 +201,61 @@ TEST_F(ArenaTest, Simple) { SimpleTest(0); SimpleTest(kHugePageSize); } + +// Number of minor page faults since last call +size_t PopMinorPageFaultCount() { +#ifdef RUSAGE_SELF + static long prev = 0; + struct rusage usage; + EXPECT_EQ(getrusage(RUSAGE_SELF, &usage), 0); + size_t rv = usage.ru_minflt - prev; + prev = usage.ru_minflt; + return rv; +#else + // Conservative + return SIZE_MAX; +#endif // RUSAGE_SELF +} + +TEST(MmapTest, AllocateLazyZeroed) { + // Doesn't have to be page aligned + constexpr size_t len = 1234567; + MemMapping m = MemMapping::AllocateLazyZeroed(len); + auto arr = static_cast(m.Get()); + + // Should generally work + ASSERT_NE(arr, nullptr); + + // Start counting page faults + PopMinorPageFaultCount(); + + // Access half of the allocation + size_t i = 0; + for (; i < len / 2; ++i) { + ASSERT_EQ(arr[i], 0); + arr[i] = static_cast(i & 255); + } + + // Appropriate page faults (maybe more) + size_t faults = PopMinorPageFaultCount(); + ASSERT_GE(faults, len / 2 / port::kPageSize); + + // Access rest of the allocation + for (; i < len; ++i) { + ASSERT_EQ(arr[i], 0); + arr[i] = static_cast(i & 255); + } + + // Appropriate page faults (maybe more) + faults = PopMinorPageFaultCount(); + ASSERT_GE(faults, len / 2 / port::kPageSize); + + // Verify data + for (i = 0; i < len; ++i) { + ASSERT_EQ(arr[i], static_cast(i & 255)); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/port/mmap.cc b/port/mmap.cc new file mode 100644 index 0000000000..36e8f32617 --- /dev/null +++ b/port/mmap.cc @@ -0,0 +1,98 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "port/mmap.h" + +#include +#include +#include +#include +#include + +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +MemMapping::~MemMapping() { +#ifdef OS_WIN + if (addr_ != nullptr) { + (void)::UnmapViewOfFile(addr_); + } + if (page_file_handle_ != NULL) { + (void)::CloseHandle(page_file_handle_); + } +#else // OS_WIN -> !OS_WIN + if (addr_ != nullptr) { + auto status = munmap(addr_, length_); + assert(status == 0); + if (status != 0) { + // TODO: handle error? 
diff --git a/port/mmap.h b/port/mmap.h
new file mode 100644
index 0000000000..720b670886
--- /dev/null
+++ b/port/mmap.h
@@ -0,0 +1,70 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifdef OS_WIN
+#include <windows.h>
+// ^^^ Must come first
+#include <memoryapi.h>
+#else
+#include <sys/mman.h>
+#endif  // OS_WIN
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An RAII wrapper for mmaped memory
+class MemMapping {
+ public:
+  static constexpr bool kHugePageSupported =
+#if defined(MAP_HUGETLB) || defined(FILE_MAP_LARGE_PAGES)
+      true;
+#else
+      false;
+#endif
+
+  // Allocate memory requesting to be backed by huge pages
+  static MemMapping AllocateHuge(size_t length);
+
+  // Allocate memory that is only lazily mapped to resident memory and
+  // guaranteed to be zero-initialized. Note that some platforms like
+  // Linux allow memory over-commit, where only the used portion of memory
+  // matters, while other platforms require enough swap space (page file) to
+  // back the full mapping.
+  static MemMapping AllocateLazyZeroed(size_t length);
+
+  // No copies
+  MemMapping(const MemMapping&) = delete;
+  MemMapping& operator=(const MemMapping&) = delete;
+  // Move
+  MemMapping(MemMapping&&) noexcept;
+  MemMapping& operator=(MemMapping&&) noexcept;
+
+  // Releases the mapping
+  ~MemMapping();
+
+  inline void* Get() const { return addr_; }
+  inline size_t Length() const { return length_; }
+
+ private:
+  MemMapping() {}
+
+  // The mapped memory, or nullptr on failure / not supported
+  void* addr_ = nullptr;
+  // The known usable number of bytes starting at that address
+  size_t length_ = 0;
+
+#ifdef OS_WIN
+  HANDLE page_file_handle_ = NULL;
+#endif  // OS_WIN
+
+  static MemMapping AllocateAnonymous(size_t length, bool huge);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
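Putting the new API together, a hedged usage sketch (a hypothetical caller, not code from the patch): try a huge-page mapping first and fall back to a normal lazily-zeroed mapping, the same pattern Arena::AllocateFromHugePage and AllocateFallback follow:

#include <cstddef>
#include <cstdio>

#include "port/mmap.h"

int main() {
  using ROCKSDB_NAMESPACE::MemMapping;
  constexpr size_t kLen = 2 * 1024 * 1024;  // one common huge page size
  MemMapping mm = MemMapping::AllocateHuge(kLen);
  if (mm.Get() == nullptr) {
    // Huge pages unsupported or not reserved on this system: fall back.
    mm = MemMapping::AllocateLazyZeroed(kLen);
  }
  if (mm.Get() != nullptr) {
    static_cast<char*>(mm.Get())[0] = 1;  // pages materialize on first touch
    std::printf("mapped %zu bytes\n", mm.Length());
  }
  return 0;
}  // ~MemMapping releases the mapping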
diff --git a/src.mk b/src.mk
index b94bc43ca6..fce6ba5fe2 100644
--- a/src.mk
+++ b/src.mk
@@ -154,6 +154,7 @@ LIB_SOURCES = \
   options/options.cc \
   options/options_helper.cc \
   options/options_parser.cc \
+  port/mmap.cc \
   port/port_posix.cc \
   port/win/env_default.cc \
   port/win/env_win.cc \