2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-29 00:54:09 +00:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2020-04-22 00:35:28 +00:00
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
|
2017-10-13 21:41:07 +00:00
|
|
|
#include <stdint.h>
|
|
|
|
|
2020-04-22 00:35:28 +00:00
|
|
|
#include <cinttypes>
|
2013-10-29 00:54:09 +00:00
|
|
|
#include <memory>
|
2014-03-01 02:19:07 +00:00
|
|
|
#include <string>
|
|
|
|
|
2021-11-16 19:14:02 +00:00
|
|
|
#include "cache/cache_entry_roles.h"
|
2022-04-06 17:33:00 +00:00
|
|
|
#include "cache/cache_reservation_manager.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "logging/logging.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "options/options_helper.h"
|
2015-01-15 00:24:24 +00:00
|
|
|
#include "port/port.h"
|
2014-08-25 21:22:05 +00:00
|
|
|
#include "rocksdb/cache.h"
|
2017-07-28 23:23:50 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "rocksdb/filter_policy.h"
|
2017-07-28 23:23:50 +00:00
|
|
|
#include "rocksdb/flush_block_policy.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "rocksdb/rocksdb_namespace.h"
|
2021-11-16 19:14:02 +00:00
|
|
|
#include "rocksdb/table.h"
|
2020-09-14 23:59:00 +00:00
|
|
|
#include "rocksdb/utilities/options_type.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
2015-01-15 00:24:24 +00:00
|
|
|
#include "table/format.h"
|
2018-07-20 21:31:27 +00:00
|
|
|
#include "util/mutexlock.h"
|
2017-07-28 23:23:50 +00:00
|
|
|
#include "util/string_util.h"
|
2013-10-29 00:54:09 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2013-10-29 00:54:09 +00:00
|
|
|
|
2018-07-20 21:31:27 +00:00
|
|
|
void TailPrefetchStats::RecordEffectiveSize(size_t len) {
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
if (num_records_ < kNumTracked) {
|
|
|
|
num_records_++;
|
|
|
|
}
|
|
|
|
records_[next_++] = len;
|
|
|
|
if (next_ == kNumTracked) {
|
|
|
|
next_ = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
|
|
|
|
std::vector<size_t> sorted;
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
|
|
|
|
if (num_records_ == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
sorted.assign(records_, records_ + num_records_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Of the historic size, we find the maximum one that satisifis the condtiion
|
|
|
|
// that if prefetching all, less than 1/8 will be wasted.
|
|
|
|
std::sort(sorted.begin(), sorted.end());
|
|
|
|
|
|
|
|
// Assuming we have 5 data points, and after sorting it looks like this:
|
|
|
|
//
|
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// +---+ | | | |
|
|
|
|
// | | | | | |
|
|
|
|
// +---+ | | | | | |
|
|
|
|
// | | | | | | | |
|
|
|
|
// +---+ | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
|
|
|
// and we use every of the value as a candidate, and estimate how much we
|
|
|
|
// wasted, compared to read. For example, when we use the 3rd record
|
2023-07-19 20:04:48 +00:00
|
|
|
// as candidate. This area is what we read:
|
2018-07-20 21:31:27 +00:00
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// *** *** *** ***+ *** *** *** *** **
|
|
|
|
// * | | | | | |
|
|
|
|
// +---+ | | | | | *
|
|
|
|
// * | | | | | | | |
|
|
|
|
// +---+ | | | | | | | *
|
|
|
|
// * | | | | X | | | | |
|
|
|
|
// | | | | | | | | | *
|
|
|
|
// * | | | | | | | | |
|
|
|
|
// | | | | | | | | | *
|
|
|
|
// * | | | | | | | | |
|
|
|
|
// *** *** ***-*** ***--*** ***--*** +****
|
|
|
|
// which is (size of the record) X (number of records).
|
|
|
|
//
|
|
|
|
// While wasted is this area:
|
|
|
|
// +---+
|
|
|
|
// +---+ | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// | | | |
|
|
|
|
// *** *** *** ****---+ | | | |
|
|
|
|
// * * | | | | |
|
|
|
|
// * *-*** *** | | | | |
|
|
|
|
// * * | | | | | | |
|
|
|
|
// *--** *** | | | | | | |
|
|
|
|
// | | | | | X | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
|
|
|
// Which can be calculated iteratively.
|
2023-07-19 20:04:48 +00:00
|
|
|
// The difference between wasted using 4th and 3rd record, will
|
2018-07-20 21:31:27 +00:00
|
|
|
// be following area:
|
|
|
|
// +---+
|
|
|
|
// +--+ +-+ ++ +-+ +-+ +---+ | |
|
|
|
|
// + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
|
|
|
|
// +-+ +-+ +-+ ++ +---+ +--+ | | |
|
|
|
|
// | | | | | | |
|
|
|
|
// +---+ ++ | | | | | |
|
|
|
|
// | | | | | | X | | |
|
|
|
|
// +---+ ++ | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// | | | | | | | | | |
|
|
|
|
// +---+ +---+ +---+ +---+ +---+
|
|
|
|
//
|
2023-07-19 20:04:48 +00:00
|
|
|
// which will be the size difference between 4th and 3rd record,
|
|
|
|
// times 3, which is number of records before the 4th.
|
2018-07-20 21:31:27 +00:00
|
|
|
// Here we assume that all data within the prefetch range will be useful. In
|
|
|
|
// reality, it may not be the case when a partial block is inside the range,
|
|
|
|
// or there are data in the middle that is not read. We ignore those cases
|
|
|
|
// for simplicity.
|
|
|
|
assert(!sorted.empty());
|
|
|
|
size_t prev_size = sorted[0];
|
|
|
|
size_t max_qualified_size = sorted[0];
|
|
|
|
size_t wasted = 0;
|
|
|
|
for (size_t i = 1; i < sorted.size(); i++) {
|
|
|
|
size_t read = sorted[i] * sorted.size();
|
|
|
|
wasted += (sorted[i] - prev_size) * i;
|
|
|
|
if (wasted <= read / 8) {
|
|
|
|
max_qualified_size = sorted[i];
|
|
|
|
}
|
|
|
|
prev_size = sorted[i];
|
|
|
|
}
|
|
|
|
const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB
|
|
|
|
return std::min(kMaxPrefetchSize, max_qualified_size);
|
|
|
|
}
|
|
|
|
|
2020-10-11 21:52:49 +00:00
|
|
|
|
|
|
|
// Option-name string "metadata_cache_options".
// NOTE(review): the place where this name is registered/consumed is not
// visible in this part of the file -- presumably it keys the
// MetadataCacheOptions entry of the block-based table option map; confirm.
const std::string kOptNameMetadataCacheOpts = "metadata_cache_options";
|
|
|
|
|
|
|
|
// Maps the textual name of each PinningTier enumerator to its enum value.
// Passed to OptionTypeInfo::Enum<PinningTier>() by the entries of
// metadata_cache_options_type_info so those options can be expressed as
// strings.
static std::unordered_map<std::string, PinningTier>
|
|
|
|
pinning_tier_type_string_map = {
|
|
|
|
{"kFallback", PinningTier::kFallback},
|
|
|
|
{"kNone", PinningTier::kNone},
|
|
|
|
{"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
|
|
|
|
{"kAll", PinningTier::kAll}};
|
|
|
|
|
2020-05-05 22:02:04 +00:00
|
|
|
// Maps the textual name of each BlockBasedTableOptions::IndexType enumerator
// to its enum value. Used by the "index_type" entry of
// block_based_table_type_info via OptionTypeInfo::Enum<>().
static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
|
|
|
|
block_base_table_index_type_string_map = {
|
|
|
|
{"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
|
|
|
|
{"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
|
|
|
|
{"kTwoLevelIndexSearch",
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
|
|
|
|
{"kBinarySearchWithFirstKey",
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
|
|
|
|
|
|
|
|
// Maps the textual name of each BlockBasedTableOptions::DataBlockIndexType
// enumerator to its enum value. Used by the "data_block_index_type" entry of
// block_based_table_type_info via OptionTypeInfo::Enum<>().
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType>
|
|
|
|
block_base_table_data_block_index_type_string_map = {
|
|
|
|
{"kDataBlockBinarySearch",
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
|
|
|
|
{"kDataBlockBinaryAndHash",
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
|
|
|
|
|
|
|
|
// Maps the textual name of each BlockBasedTableOptions::IndexShorteningMode
// enumerator to its enum value. Used by the "index_shortening" entry of
// block_based_table_type_info via OptionTypeInfo::Enum<>().
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode>
|
|
|
|
block_base_table_index_shortening_mode_string_map = {
|
|
|
|
{"kNoShortening",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
|
|
|
|
{"kShortenSeparators",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
|
|
|
|
{"kShortenSeparatorsAndSuccessor",
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::
|
|
|
|
kShortenSeparatorsAndSuccessor}};
|
2020-10-11 21:52:49 +00:00
|
|
|
|
|
|
|
// Option descriptors for the fields of struct MetadataCacheOptions, keyed by
// option name. Each of the three fields (top_level_index_pinning,
// partition_pinning, unpartitioned_pinning) is described as a PinningTier
// enum located at its offsetof() within the struct, with
// pinning_tier_type_string_map supplying the name <-> value mapping.
// NOTE(review): the consumer of this table is not visible in this part of the
// file -- presumably it is attached to the "metadata_cache_options" entry of
// the block-based table options; confirm.
static std::unordered_map<std::string, OptionTypeInfo>
|
|
|
|
metadata_cache_options_type_info = {
|
|
|
|
{"top_level_index_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, top_level_index_pinning),
|
|
|
|
&pinning_tier_type_string_map)},
|
|
|
|
{"partition_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, partition_pinning),
|
|
|
|
&pinning_tier_type_string_map)},
|
|
|
|
{"unpartitioned_pinning",
|
|
|
|
OptionTypeInfo::Enum<PinningTier>(
|
|
|
|
offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
|
|
|
|
&pinning_tier_type_string_map)}};
|
|
|
|
|
2021-06-18 04:55:42 +00:00
|
|
|
// Maps the textual name of each BlockBasedTableOptions::PrepopulateBlockCache
// enumerator listed here (kDisable, kFlushOnly) to its enum value.
// NOTE(review): the option entry that references this map is not visible in
// this part of the file -- presumably a "prepopulate_block_cache" entry of
// block_based_table_type_info; confirm.
static std::unordered_map<std::string,
|
|
|
|
BlockBasedTableOptions::PrepopulateBlockCache>
|
|
|
|
block_base_table_prepopulate_block_cache_string_map = {
|
|
|
|
{"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
|
|
|
|
{"kFlushOnly",
|
|
|
|
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
|
|
|
|
|
2020-05-05 22:02:04 +00:00
|
|
|
|
2020-04-03 17:48:46 +00:00
|
|
|
static std::unordered_map<std::string, OptionTypeInfo>
|
|
|
|
block_based_table_type_info = {
|
|
|
|
/* currently not supported
|
|
|
|
std::shared_ptr<Cache> block_cache = nullptr;
|
2022-05-17 22:01:51 +00:00
|
|
|
CacheUsageOptions cache_usage_options;
|
2020-04-03 17:48:46 +00:00
|
|
|
*/
|
|
|
|
{"flush_block_policy_factory",
|
2021-07-12 16:03:41 +00:00
|
|
|
OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
|
|
|
|
offsetof(struct BlockBasedTableOptions,
|
|
|
|
flush_block_policy_factory),
|
|
|
|
OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"cache_index_and_filter_blocks",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
cache_index_and_filter_blocks),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"cache_index_and_filter_blocks_with_high_priority",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
cache_index_and_filter_blocks_with_high_priority),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"pin_l0_filter_and_index_blocks_in_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
pin_l0_filter_and_index_blocks_in_cache),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-05-05 22:02:04 +00:00
|
|
|
{"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, index_type),
|
|
|
|
&block_base_table_index_type_string_map)},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"hash_index_allow_collision",
|
2022-03-01 21:58:02 +00:00
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"data_block_index_type",
|
2020-05-05 22:02:04 +00:00
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, data_block_index_type),
|
|
|
|
&block_base_table_data_block_index_type_string_map)},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"index_shortening",
|
2020-05-05 22:02:04 +00:00
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, index_shortening),
|
|
|
|
&block_base_table_index_shortening_mode_string_map)},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"data_block_hash_table_util_ratio",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
data_block_hash_table_util_ratio),
|
|
|
|
OptionType::kDouble, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"checksum",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, checksum),
|
|
|
|
OptionType::kChecksumType, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"no_block_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, no_block_cache),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"block_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kMutable}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"block_size_deviation",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_size_deviation),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"block_restart_interval",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_restart_interval),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kMutable}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"index_block_restart_interval",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
|
|
|
|
OptionType::kInt, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"index_per_partition",
|
|
|
|
{0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"metadata_block_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, metadata_block_size),
|
|
|
|
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"partition_filters",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, partition_filters),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
Minimize memory internal fragmentation for Bloom filters (#6427)
Summary:
New experimental option BBTO::optimize_filters_for_memory builds
filters that maximize their use of "usable size" from malloc_usable_size,
which is also used to compute block cache charges.
Rather than always "rounding up," we track state in the
BloomFilterPolicy object to mix essentially "rounding down" and
"rounding up" so that the average FP rate of all generated filters is
the same as without the option. (YMMV as heavily accessed filters might
be unluckily lower accuracy.)
Thus, the option near-minimizes what the block cache considers as
"memory used" for a given target Bloom filter false positive rate and
Bloom filter implementation. There are no forward or backward
compatibility issues with this change, though it only works on the
format_version=5 Bloom filter.
With Jemalloc, we see about 10% reduction in memory footprint (and block
cache charge) for Bloom filters, but 1-2% increase in storage footprint,
due to encoding efficiency losses (FP rate is non-linear with bits/key).
Why not weighted random round up/down rather than state tracking? By
only requiring malloc_usable_size, we don't actually know what the next
larger and next smaller usable sizes for the allocator are. We pick a
requested size, accept and use whatever usable size it has, and use the
difference to inform our next choice. This allows us to narrow in on the
right balance without tracking/predicting usable sizes.
Why not weight history of generated filter false positive rates by
number of keys? This could lead to excess skew in small filters after
generating a large filter.
Results from filter_bench with jemalloc (irrelevant details omitted):
(normal keys/filter, but high variance)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=30000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.6278
Number of filters: 5516
Total size (MB): 200.046
Reported total allocated memory (MB): 220.597
Reported internal fragmentation: 10.2732%
Bits/key stored: 10.0097
Average FP rate %: 0.965228
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=30000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 30.5104
Number of filters: 5464
Total size (MB): 200.015
Reported total allocated memory (MB): 200.322
Reported internal fragmentation: 0.153709%
Bits/key stored: 10.1011
Average FP rate %: 0.966313
(very few keys / filter, optimization not as effective due to ~59 byte
internal fragmentation in blocked Bloom filter representation)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.5649
Number of filters: 162950
Total size (MB): 200.001
Reported total allocated memory (MB): 224.624
Reported internal fragmentation: 12.3117%
Bits/key stored: 10.2951
Average FP rate %: 0.821534
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 31.8057
Number of filters: 159849
Total size (MB): 200
Reported total allocated memory (MB): 208.846
Reported internal fragmentation: 4.42297%
Bits/key stored: 10.4948
Average FP rate %: 0.811006
(high keys/filter)
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000000 -vary_key_count_ratio=0.9
Build avg ns/key: 29.7017
Number of filters: 164
Total size (MB): 200.352
Reported total allocated memory (MB): 221.5
Reported internal fragmentation: 10.5552%
Bits/key stored: 10.0003
Average FP rate %: 0.969358
$ ./filter_bench -quick -impl=2 -average_keys_per_filter=1000000 -vary_key_count_ratio=0.9 -optimize_filters_for_memory
Build avg ns/key: 30.7131
Number of filters: 160
Total size (MB): 200.928
Reported total allocated memory (MB): 200.938
Reported internal fragmentation: 0.00448054%
Bits/key stored: 10.1852
Average FP rate %: 0.963387
And from db_bench (block cache) with jemalloc:
$ ./db_bench -db=/dev/shm/dbbench.no_optimize -benchmarks=fillrandom -format_version=5 -value_size=90 -bloom_bits=10 -num=2000000 -threads=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false
$ ./db_bench -db=/dev/shm/dbbench -benchmarks=fillrandom -format_version=5 -value_size=90 -bloom_bits=10 -num=2000000 -threads=8 -optimize_filters_for_memory -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false
$ (for FILE in /dev/shm/dbbench.no_optimize/*.sst; do ./sst_dump --file=$FILE --show_properties | grep 'filter block' ; done) | awk '{ t += $4; } END { print t; }'
17063835
$ (for FILE in /dev/shm/dbbench/*.sst; do ./sst_dump --file=$FILE --show_properties | grep 'filter block' ; done) | awk '{ t += $4; } END { print t; }'
17430747
$ #^ 2.1% additional filter storage
$ ./db_bench -db=/dev/shm/dbbench.no_optimize -use_existing_db -benchmarks=readrandom,stats -statistics -bloom_bits=10 -num=2000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false -duration=10 -cache_index_and_filter_blocks -cache_size=1000000000
rocksdb.block.cache.index.add COUNT : 33
rocksdb.block.cache.index.bytes.insert COUNT : 8440400
rocksdb.block.cache.filter.add COUNT : 33
rocksdb.block.cache.filter.bytes.insert COUNT : 21087528
rocksdb.bloom.filter.useful COUNT : 4963889
rocksdb.bloom.filter.full.positive COUNT : 1214081
rocksdb.bloom.filter.full.true.positive COUNT : 1161999
$ #^ 1.04 % observed FP rate
$ ./db_bench -db=/dev/shm/dbbench -use_existing_db -benchmarks=readrandom,stats -statistics -bloom_bits=10 -num=2000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=false -optimize_filters_for_memory -duration=10 -cache_index_and_filter_blocks -cache_size=1000000000
rocksdb.block.cache.index.add COUNT : 33
rocksdb.block.cache.index.bytes.insert COUNT : 8448592
rocksdb.block.cache.filter.add COUNT : 33
rocksdb.block.cache.filter.bytes.insert COUNT : 18220328
rocksdb.bloom.filter.useful COUNT : 5360933
rocksdb.bloom.filter.full.positive COUNT : 1321315
rocksdb.bloom.filter.full.true.positive COUNT : 1262999
$ #^ 1.08 % observed FP rate, 13.6% less memory usage for filters
(Due to specific key density, this example tends to generate filters that are "worse than average" for internal fragmentation. "Better than average" cases can show little or no improvement.)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6427
Test Plan: unit test added, 'make check' with gcc, clang and valgrind
Reviewed By: siying
Differential Revision: D22124374
Pulled By: pdillinger
fbshipit-source-id: f3e3aa152f9043ddf4fae25799e76341d0d8714e
2020-06-22 20:30:57 +00:00
|
|
|
{"optimize_filters_for_memory",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"filter_policy",
|
2022-02-18 20:23:48 +00:00
|
|
|
OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, filter_policy),
|
|
|
|
OptionVerificationType::kByNameAllowFromNull,
|
|
|
|
OptionTypeFlags::kNone)},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"whole_key_filtering",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, whole_key_filtering),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
Detect (new) Bloom/Ribbon Filter construction corruption (#9342)
Summary:
Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393
**Context:**
(Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following:
a) set of keys to add to filter
b) set of hashes to add to filter (64-bit hash applied to each key)
c) set of Bloom indices to set in filter, with duplicates
d) set of Bloom indices to set in filter, deduplicated
e) final filter and its checksum
This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level.
- b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`)
- c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO.
Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default.
**Summary:**
- Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()`
- Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption
- See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design
- Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries`
- Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()`
- When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342
Test Plan:
- Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption`
- Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touches ReserveMemory's impl
- For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing breaks.
- Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()`
- FastLocalBloom
- Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s**
- After change:
- `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)**
- Standard128Ribbon
- Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s**
- After change:
- `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)**
- Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true`
- Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status.
Reviewed By: pdillinger
Differential Revision: D33746928
Pulled By: hx235
fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
2022-02-02 01:41:20 +00:00
|
|
|
{"detect_filter_construct_corruption",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
detect_filter_construct_corruption),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2022-03-04 18:35:08 +00:00
|
|
|
OptionTypeFlags::kMutable}},
|
2022-05-19 05:25:54 +00:00
|
|
|
{"reserve_table_builder_memory",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
|
|
|
{"reserve_table_reader_memory",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"skip_table_builder_flush",
|
|
|
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"format_version",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, format_version),
|
|
|
|
OptionType::kUInt32T, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"verify_compression",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, verify_compression),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"read_amp_bytes_per_bit",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
|
2020-11-10 19:13:10 +00:00
|
|
|
OptionType::kUInt32T, OptionVerificationType::kNormal,
|
2020-11-13 19:51:24 +00:00
|
|
|
OptionTypeFlags::kNone,
|
|
|
|
[](const ConfigOptions& /*opts*/, const std::string& /*name*/,
|
2021-05-13 21:28:50 +00:00
|
|
|
const std::string& value, void* addr) {
|
2020-11-13 19:51:24 +00:00
|
|
|
// A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
|
|
|
|
// and 6.14. The bug will write out 8 bytes to OPTIONS file from the
|
|
|
|
// starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
|
|
|
|
// which is actually a uint32. Consequently, the value of
|
|
|
|
// read_amp_bytes_per_bit written in the OPTIONS file is wrong.
|
|
|
|
// From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
|
|
|
|
// from OPTIONS file as a uint32. To be able to load OPTIONS file
|
|
|
|
// generated by affected releases before the fix, we need to
|
|
|
|
// manually parse read_amp_bytes_per_bit with this special hack.
|
|
|
|
uint64_t read_amp_bytes_per_bit = ParseUint64(value);
|
2021-05-13 21:28:50 +00:00
|
|
|
*(static_cast<uint32_t*>(addr)) =
|
2020-11-17 08:43:20 +00:00
|
|
|
static_cast<uint32_t>(read_amp_bytes_per_bit);
|
2020-11-13 19:51:24 +00:00
|
|
|
return Status::OK();
|
|
|
|
}}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"enable_index_compression",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, enable_index_compression),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"block_align",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_align),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-04-03 17:48:46 +00:00
|
|
|
{"pin_top_level_index_and_filter",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
pin_top_level_index_and_filter),
|
|
|
|
OptionType::kBoolean, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
OptionTypeFlags::kNone}},
|
2020-10-11 21:52:49 +00:00
|
|
|
{kOptNameMetadataCacheOpts,
|
|
|
|
OptionTypeInfo::Struct(
|
|
|
|
kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
|
|
|
|
offsetof(struct BlockBasedTableOptions, metadata_cache_options),
|
|
|
|
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
|
2020-04-29 01:02:11 +00:00
|
|
|
{"block_cache",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, block_cache),
|
|
|
|
OptionType::kUnknown, OptionVerificationType::kNormal,
|
2020-09-14 23:59:00 +00:00
|
|
|
(OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
|
2022-06-22 22:45:21 +00:00
|
|
|
// Parses the input value as a Cache
|
2020-04-29 01:02:11 +00:00
|
|
|
[](const ConfigOptions& opts, const std::string&,
|
2021-05-13 21:28:50 +00:00
|
|
|
const std::string& value, void* addr) {
|
|
|
|
auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
|
2020-04-29 01:02:11 +00:00
|
|
|
return Cache::CreateFromString(opts, value, cache);
|
|
|
|
}}},
|
|
|
|
{"block_cache_compressed",
|
2023-01-25 01:09:19 +00:00
|
|
|
{0, OptionType::kUnknown, OptionVerificationType::kDeprecated,
|
|
|
|
OptionTypeFlags::kNone}},
|
2021-02-24 00:52:35 +00:00
|
|
|
{"max_auto_readahead_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
2021-06-18 04:55:42 +00:00
|
|
|
{"prepopulate_block_cache",
|
|
|
|
OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
|
|
|
|
offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
|
2021-08-06 02:43:44 +00:00
|
|
|
&block_base_table_prepopulate_block_cache_string_map,
|
|
|
|
OptionTypeFlags::kMutable)},
|
2022-04-16 00:28:09 +00:00
|
|
|
{"initial_auto_readahead_size",
|
|
|
|
{offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size),
|
|
|
|
OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
2022-09-01 18:56:00 +00:00
|
|
|
{"num_file_reads_for_auto_readahead",
|
|
|
|
{offsetof(struct BlockBasedTableOptions,
|
|
|
|
num_file_reads_for_auto_readahead),
|
|
|
|
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kMutable}},
|
2021-06-18 04:55:42 +00:00
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
};
|
2020-04-03 17:48:46 +00:00
|
|
|
|
2020-01-14 19:19:47 +00:00
|
|
|
// TODO(myabandeh): We should return an error instead of silently changing the
|
|
|
|
// options
|
2014-03-01 00:39:27 +00:00
|
|
|
BlockBasedTableFactory::BlockBasedTableFactory(
|
2015-11-18 00:41:54 +00:00
|
|
|
const BlockBasedTableOptions& _table_options)
|
|
|
|
: table_options_(_table_options) {
|
2020-09-14 23:59:00 +00:00
|
|
|
InitializeOptions();
|
2021-04-26 10:12:35 +00:00
|
|
|
RegisterOptions(&table_options_, &block_based_table_type_info);
|
2022-04-06 17:33:00 +00:00
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
const auto table_reader_charged =
|
|
|
|
table_options_.cache_usage_options.options_overrides
|
|
|
|
.at(CacheEntryRole::kBlockBasedTableReader)
|
|
|
|
.charged;
|
|
|
|
if (table_options_.block_cache &&
|
|
|
|
table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
2022-04-06 17:33:00 +00:00
|
|
|
table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
|
|
|
|
std::make_shared<CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kBlockBasedTableReader>>(
|
|
|
|
table_options_.block_cache)));
|
|
|
|
}
|
2020-09-14 23:59:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTableFactory::InitializeOptions() {
|
2014-03-01 00:39:27 +00:00
|
|
|
if (table_options_.flush_block_policy_factory == nullptr) {
|
|
|
|
table_options_.flush_block_policy_factory.reset(
|
|
|
|
new FlushBlockBySizePolicyFactory());
|
|
|
|
}
|
2014-08-25 21:22:05 +00:00
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
table_options_.block_cache.reset();
|
|
|
|
} else if (table_options_.block_cache == nullptr) {
|
2019-06-27 17:16:21 +00:00
|
|
|
LRUCacheOptions co;
|
2023-04-04 22:33:24 +00:00
|
|
|
// 32MB, the recommended minimum size for 64 shards, to reduce contention
|
|
|
|
co.capacity = 32 << 20;
|
2019-06-27 17:16:21 +00:00
|
|
|
table_options_.block_cache = NewLRUCache(co);
|
2014-08-25 21:22:05 +00:00
|
|
|
}
|
|
|
|
if (table_options_.block_size_deviation < 0 ||
|
|
|
|
table_options_.block_size_deviation > 100) {
|
|
|
|
table_options_.block_size_deviation = 0;
|
|
|
|
}
|
2016-01-04 18:51:00 +00:00
|
|
|
if (table_options_.block_restart_interval < 1) {
|
|
|
|
table_options_.block_restart_interval = 1;
|
|
|
|
}
|
2016-02-05 18:22:37 +00:00
|
|
|
if (table_options_.index_block_restart_interval < 1) {
|
|
|
|
table_options_.index_block_restart_interval = 1;
|
|
|
|
}
|
2020-01-14 19:19:47 +00:00
|
|
|
if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
|
|
|
|
table_options_.index_block_restart_interval != 1) {
|
2022-10-25 18:50:38 +00:00
|
|
|
// Currently kHashSearch is incompatible with
|
|
|
|
// index_block_restart_interval > 1
|
2020-01-14 19:19:47 +00:00
|
|
|
table_options_.index_block_restart_interval = 1;
|
|
|
|
}
|
2017-06-24 01:18:21 +00:00
|
|
|
if (table_options_.partition_filters &&
|
|
|
|
table_options_.index_type !=
|
|
|
|
BlockBasedTableOptions::kTwoLevelIndexSearch) {
|
|
|
|
// We do not support partitioned filters without partitioning indexes
|
|
|
|
table_options_.partition_filters = false;
|
|
|
|
}
|
2022-05-17 22:01:51 +00:00
|
|
|
auto& options_overrides =
|
|
|
|
table_options_.cache_usage_options.options_overrides;
|
|
|
|
const auto options = table_options_.cache_usage_options.options;
|
|
|
|
for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
|
|
|
|
CacheEntryRole role = static_cast<CacheEntryRole>(i);
|
|
|
|
auto options_overrides_iter = options_overrides.find(role);
|
|
|
|
if (options_overrides_iter == options_overrides.end()) {
|
|
|
|
options_overrides.insert({role, options});
|
|
|
|
} else if (options_overrides_iter->second.charged ==
|
|
|
|
CacheEntryRoleOptions::Decision::kFallback) {
|
|
|
|
options_overrides_iter->second.charged = options.charged;
|
|
|
|
}
|
|
|
|
}
|
2014-03-01 00:39:27 +00:00
|
|
|
}
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
// Re-runs InitializeOptions() on table_options_ and then delegates to
// TableFactory::PrepareOptions(), returning its status.
// NOTE(review): presumably the re-run covers options changed after
// construction - confirm against callers.
Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
  InitializeOptions();
  Status prepare_status = TableFactory::PrepareOptions(opts);
  return prepare_status;
}
|
|
|
|
|
2021-11-16 19:14:02 +00:00
|
|
|
namespace {
|
|
|
|
// Different cache kinds use the same keys for physically different values, so
|
|
|
|
// they must not share an underlying key space with each other.
|
|
|
|
Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
|
|
|
|
int cache_count = (bbto.block_cache != nullptr) +
|
|
|
|
(bbto.persistent_cache != nullptr);
|
|
|
|
if (cache_count <= 1) {
|
|
|
|
// Nothing to share / overlap
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// More complex test of shared key space, in case the instances are wrappers
|
|
|
|
// for some shared underlying cache.
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
static Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc};
|
2022-10-19 05:06:57 +00:00
|
|
|
CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime();
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to the Cache class, and it becomes more of a low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
// Wrapper around a single char; an instance of it serves as the marker
// object inserted into the block cache under the sentinel key below.
struct SentinelValue {
|
|
|
|
explicit SentinelValue(char _c) : c(_c) {}
|
|
|
|
char c;
|
|
|
|
};
|
|
|
|
static SentinelValue kRegularBlockCacheMarker{'b'};
|
|
|
|
static char kPersistentCacheMarker{'p'};
|
2021-11-16 19:14:02 +00:00
|
|
|
if (bbto.block_cache) {
|
|
|
|
bbto.block_cache
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to the Cache class, and it becomes more of a low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, &kHelper, 1)
|
2021-11-16 19:14:02 +00:00
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
if (bbto.persistent_cache) {
|
|
|
|
// Note: persistent cache copies the data, not keeping the pointer
|
|
|
|
bbto.persistent_cache
|
2022-10-19 05:06:57 +00:00
|
|
|
->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1)
|
2021-11-16 19:14:02 +00:00
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
// If we get something different from what we inserted, that indicates
|
|
|
|
// dangerously overlapping key spaces.
|
|
|
|
if (bbto.block_cache) {
|
2022-10-19 05:06:57 +00:00
|
|
|
auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice());
|
2021-11-16 19:14:02 +00:00
|
|
|
if (handle) {
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to the Cache class, and it becomes more of a low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
auto v = static_cast<SentinelValue*>(bbto.block_cache->Value(handle));
|
|
|
|
char c = v->c;
|
2021-11-16 19:14:02 +00:00
|
|
|
bbto.block_cache->Release(handle);
|
2023-01-25 01:09:19 +00:00
|
|
|
if (c == kPersistentCacheMarker) {
|
2021-11-16 19:14:02 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block_cache and persistent_cache share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (v != &kRegularBlockCacheMarker) {
|
|
|
|
return Status::Corruption("Unexpected mutation to block_cache");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-01-25 01:09:19 +00:00
|
|
|
|
2021-11-16 19:14:02 +00:00
|
|
|
if (bbto.persistent_cache) {
|
|
|
|
std::unique_ptr<char[]> data;
|
|
|
|
size_t size = 0;
|
2022-10-19 05:06:57 +00:00
|
|
|
bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size)
|
2021-11-16 19:14:02 +00:00
|
|
|
.PermitUncheckedError();
|
|
|
|
if (data && size > 0) {
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to the Cache class, and it becomes more of a low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
if (data[0] == kRegularBlockCacheMarker.c) {
|
2021-11-16 19:14:02 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persistent_cache and block_cache share the same key space, "
|
|
|
|
"which is not supported");
|
|
|
|
} else if (data[0] != kPersistentCacheMarker) {
|
|
|
|
return Status::Corruption("Unexpected mutation to persistent_cache");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
2014-01-28 05:58:46 +00:00
|
|
|
Status BlockBasedTableFactory::NewTableReader(
|
2020-06-29 21:51:57 +00:00
|
|
|
const ReadOptions& ro, const TableReaderOptions& table_reader_options,
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
|
|
std::unique_ptr<TableReader>* table_reader,
|
2016-07-20 18:23:31 +00:00
|
|
|
bool prefetch_index_and_filter_in_cache) const {
|
2015-09-11 18:36:33 +00:00
|
|
|
return BlockBasedTable::Open(
|
2020-06-29 21:51:57 +00:00
|
|
|
ro, table_reader_options.ioptions, table_reader_options.env_options,
|
2015-09-11 18:36:33 +00:00
|
|
|
table_options_, table_reader_options.internal_comparator, std::move(file),
|
2023-04-25 19:08:23 +00:00
|
|
|
file_size, table_reader_options.block_protection_bytes_per_key,
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of an SST file (i.e., the blocks after the data blocks through the end of the file) during each SST file open, in the hope of prefetching everything needed for later reads at once, e.g., footer, meta index, filter/index, etc. The existing approach to estimating the tail size to prefetch is the `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which have caused small reads in unlucky cases (e.g., a small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a smaller size than it should) and are hard to debug. Therefore we decided to record the exact tail size and use it directly to prefetch the tail of the SST instead of relying on heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make `tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g., file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use separate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_,
|
2022-04-06 17:33:00 +00:00
|
|
|
table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
|
|
|
|
table_reader_options.skip_filters, table_reader_options.level,
|
|
|
|
table_reader_options.immortal, table_reader_options.largest_seqno,
|
2020-05-13 01:21:32 +00:00
|
|
|
table_reader_options.force_direct_prefetch, &tail_prefetch_stats_,
|
2020-06-09 23:49:07 +00:00
|
|
|
table_reader_options.block_cache_tracer,
|
2021-06-10 18:01:44 +00:00
|
|
|
table_reader_options.max_file_size_for_l0_meta_pin,
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
table_reader_options.cur_db_session_id, table_reader_options.cur_file_num,
|
Add support to strip / pad timestamp when creating / reading a block based table (#11495)
Summary:
Add support to strip timestamp in block based table builder and pad timestamp in block based table reader.
On the write path, use the per column family option `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` to indicate whether user-defined timestamps should be stripped for all block based tables created for the column family.
On the read path, added a per table `TableReadOption.user_defined_timestamps_persisted` to flag whether the user keys in the table contain user-defined timestamps.
This patch is mostly passing the related flags down to the block building/parsing level with the exception of handling the `first_internal_key` in `IndexValue`, which is included in the `IndexBuilder` level. The value part of range deletion entries should have a similar handling, I haven't decided where to best fit this piece of logic, I will do it in a follow up.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11495
Test Plan:
Existing test `BlockBasedTableReaderTest` is parameterized to run with:
1) different UDT test modes: kNone, kNormal, kStripUserDefinedTimestamp
2) all four index types, when index type is `kTwoLevelIndexSearch`, also enables partitioned filters
3) parallel vs non-parallel compression
4) enable/disable compression dictionary.
Also added tests for API `BlockBasedTableReader::NewIterator`.
`PartitionedFilterBlockTest` is parameterized to run with different UDT test modes:kNone, kNormal, kStripUserDefinedTimestamp.
```
make all check
./block_based_table_reader_test
./partitioned_filter_block_test
```
Reviewed By: ltamasi
Differential Revision: D46344577
Pulled By: jowlyzhang
fbshipit-source-id: 93ac8542b19319d1298712b8bed908c8831ba675
2023-06-01 18:10:03 +00:00
|
|
|
table_reader_options.unique_id,
|
|
|
|
table_reader_options.user_defined_timestamps_persisted);
|
2013-10-29 00:54:09 +00:00
|
|
|
}
|
|
|
|
|
2014-01-28 05:58:46 +00:00
|
|
|
// Creates a BlockBasedTableBuilder that writes to `file`, configured with
// this factory's table options plus the per-table `table_builder_options`.
// The builder is allocated with `new` and handed back as a raw pointer
// (presumably the caller takes ownership -- confirm against callers).
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
    const TableBuilderOptions& table_builder_options,
    WritableFileWriter* file) const {
  TableBuilder* const builder = new BlockBasedTableBuilder(
      table_options_, table_builder_options, file);
  return builder;
}
|
2013-11-20 06:00:48 +00:00
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
Status BlockBasedTableFactory::ValidateOptions(
|
2019-05-14 00:43:47 +00:00
|
|
|
const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
|
2014-10-18 04:18:36 +00:00
|
|
|
if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
|
|
|
|
cf_opts.prefix_extractor == nullptr) {
|
2018-04-13 00:55:14 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Hash index is specified for block-based "
|
2014-10-18 04:18:36 +00:00
|
|
|
"table, but prefix_extractor is not given");
|
|
|
|
}
|
2014-10-22 18:52:35 +00:00
|
|
|
if (table_options_.cache_index_and_filter_blocks &&
|
|
|
|
table_options_.no_block_cache) {
|
2018-04-13 00:55:14 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable cache_index_and_filter_blocks, "
|
2014-10-22 18:52:35 +00:00
|
|
|
", but block cache is disabled");
|
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
|
|
|
|
table_options_.no_block_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable pin_l0_filter_and_index_blocks_in_cache, "
|
|
|
|
", but block cache is disabled");
|
|
|
|
}
|
2021-12-10 16:12:09 +00:00
|
|
|
if (!IsSupportedFormatVersion(table_options_.format_version)) {
|
2015-01-13 22:33:04 +00:00
|
|
|
return Status::InvalidArgument(
|
2015-01-15 00:24:24 +00:00
|
|
|
"Unsupported BlockBasedTable format_version. Please check "
|
|
|
|
"include/rocksdb/table.h for more info");
|
2015-01-13 22:33:04 +00:00
|
|
|
}
|
2018-03-27 03:14:24 +00:00
|
|
|
if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
|
2018-04-13 00:55:14 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable block_align, but compression "
|
2018-03-27 03:14:24 +00:00
|
|
|
"enabled");
|
|
|
|
}
|
|
|
|
if (table_options_.block_align &&
|
|
|
|
(table_options_.block_size & (table_options_.block_size - 1))) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Block alignment requested but block size is not a power of 2");
|
|
|
|
}
|
2022-05-05 20:08:21 +00:00
|
|
|
if (table_options_.block_size > std::numeric_limits<uint32_t>::max()) {
|
2019-06-20 18:41:59 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"block size exceeds maximum number (4GiB) allowed");
|
|
|
|
}
|
2018-08-15 21:27:47 +00:00
|
|
|
if (table_options_.data_block_index_type ==
|
2018-08-17 01:29:13 +00:00
|
|
|
BlockBasedTableOptions::kDataBlockBinaryAndHash &&
|
2018-08-15 21:27:47 +00:00
|
|
|
table_options_.data_block_hash_table_util_ratio <= 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"data_block_hash_table_util_ratio should be greater than 0 when "
|
|
|
|
"data_block_index_type is set to kDataBlockBinaryAndHash");
|
|
|
|
}
|
2019-05-14 00:43:47 +00:00
|
|
|
if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
|
|
|
|
// TODO(myabandeh): support it
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"max_successive_merges larger than 0 is currently inconsistent with "
|
|
|
|
"unordered_write");
|
|
|
|
}
|
2022-05-17 22:01:51 +00:00
|
|
|
const auto& options_overrides =
|
|
|
|
table_options_.cache_usage_options.options_overrides;
|
|
|
|
for (auto options_overrides_iter = options_overrides.cbegin();
|
|
|
|
options_overrides_iter != options_overrides.cend();
|
|
|
|
++options_overrides_iter) {
|
|
|
|
const CacheEntryRole role = options_overrides_iter->first;
|
|
|
|
const CacheEntryRoleOptions options = options_overrides_iter->second;
|
|
|
|
static const std::set<CacheEntryRole> kMemoryChargingSupported = {
|
|
|
|
CacheEntryRole::kCompressionDictionaryBuildingBuffer,
|
|
|
|
CacheEntryRole::kFilterConstruction,
|
2022-07-19 06:26:57 +00:00
|
|
|
CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata,
|
|
|
|
CacheEntryRole::kBlobCache};
|
2022-05-17 22:01:51 +00:00
|
|
|
if (options.charged != CacheEntryRoleOptions::Decision::kFallback &&
|
|
|
|
kMemoryChargingSupported.count(role) == 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Enable/Disable CacheEntryRoleOptions::charged"
|
2022-07-19 06:26:57 +00:00
|
|
|
" for CacheEntryRole " +
|
2022-05-17 22:01:51 +00:00
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" is not supported");
|
|
|
|
}
|
|
|
|
if (table_options_.no_block_cache &&
|
|
|
|
options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
2022-07-19 06:26:57 +00:00
|
|
|
" for CacheEntryRole " +
|
2022-05-17 22:01:51 +00:00
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but block cache is disabled");
|
|
|
|
}
|
2022-07-19 06:26:57 +00:00
|
|
|
if (role == CacheEntryRole::kBlobCache &&
|
|
|
|
options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
if (cf_opts.blob_cache == nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache is not configured");
|
|
|
|
}
|
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but block cache is disabled");
|
|
|
|
}
|
|
|
|
if (table_options_.block_cache == cf_opts.blob_cache) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache is the same as block cache");
|
|
|
|
}
|
|
|
|
if (cf_opts.blob_cache->GetCapacity() >
|
|
|
|
table_options_.block_cache->GetCapacity()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Enable CacheEntryRoleOptions::charged"
|
|
|
|
" for CacheEntryRole " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
|
|
|
|
" but blob cache capacity is larger than block cache capacity");
|
|
|
|
}
|
|
|
|
}
|
2022-05-17 22:01:51 +00:00
|
|
|
}
|
2021-11-16 19:14:02 +00:00
|
|
|
{
|
|
|
|
Status s = CheckCacheOptionCompatibility(table_options_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
std::string garbage;
|
|
|
|
if (!SerializeEnum<ChecksumType>(checksum_type_string_map,
|
|
|
|
table_options_.checksum, &garbage)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Unrecognized ChecksumType for checksum: " +
|
2022-05-06 20:03:58 +00:00
|
|
|
std::to_string(static_cast<uint32_t>(table_options_.checksum)));
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
}
|
2020-09-14 23:59:00 +00:00
|
|
|
return TableFactory::ValidateOptions(db_opts, cf_opts);
|
2014-10-18 04:18:36 +00:00
|
|
|
}
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
std::string BlockBasedTableFactory::GetPrintableOptions() const {
|
2014-08-25 21:24:09 +00:00
|
|
|
std::string ret;
|
|
|
|
ret.reserve(20000);
|
|
|
|
const int kBufferSize = 200;
|
|
|
|
char buffer[kBufferSize];
|
|
|
|
|
|
|
|
snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n",
|
|
|
|
table_options_.flush_block_policy_factory->Name(),
|
2016-03-16 21:57:57 +00:00
|
|
|
static_cast<void*>(table_options_.flush_block_policy_factory.get()));
|
2014-08-25 21:24:09 +00:00
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n",
|
|
|
|
table_options_.cache_index_and_filter_blocks);
|
|
|
|
ret.append(buffer);
|
2016-12-22 22:44:01 +00:00
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" cache_index_and_filter_blocks_with_high_priority: %d\n",
|
|
|
|
table_options_.cache_index_and_filter_blocks_with_high_priority);
|
|
|
|
ret.append(buffer);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" pin_l0_filter_and_index_blocks_in_cache: %d\n",
|
|
|
|
table_options_.pin_l0_filter_and_index_blocks_in_cache);
|
|
|
|
ret.append(buffer);
|
2018-06-22 22:14:05 +00:00
|
|
|
snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
|
|
|
|
table_options_.pin_top_level_index_and_filter);
|
|
|
|
ret.append(buffer);
|
2014-08-25 21:24:09 +00:00
|
|
|
snprintf(buffer, kBufferSize, " index_type: %d\n",
|
|
|
|
table_options_.index_type);
|
|
|
|
ret.append(buffer);
|
2019-03-08 19:15:51 +00:00
|
|
|
snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
|
|
|
|
table_options_.data_block_index_type);
|
|
|
|
ret.append(buffer);
|
2019-04-22 15:17:45 +00:00
|
|
|
snprintf(buffer, kBufferSize, " index_shortening: %d\n",
|
|
|
|
static_cast<int>(table_options_.index_shortening));
|
|
|
|
ret.append(buffer);
|
2019-03-08 19:15:51 +00:00
|
|
|
snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
|
|
|
|
table_options_.data_block_hash_table_util_ratio);
|
|
|
|
ret.append(buffer);
|
2018-04-13 00:55:14 +00:00
|
|
|
snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum);
|
2014-08-25 21:24:09 +00:00
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " no_block_cache: %d\n",
|
|
|
|
table_options_.no_block_cache);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache: %p\n",
|
2016-03-16 21:57:57 +00:00
|
|
|
static_cast<void*>(table_options_.block_cache.get()));
|
2014-08-25 21:24:09 +00:00
|
|
|
ret.append(buffer);
|
|
|
|
if (table_options_.block_cache) {
|
2016-12-22 22:44:01 +00:00
|
|
|
const char* block_cache_name = table_options_.block_cache->Name();
|
|
|
|
if (block_cache_name != nullptr) {
|
|
|
|
snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
|
|
|
|
block_cache_name);
|
|
|
|
ret.append(buffer);
|
|
|
|
}
|
|
|
|
ret.append(" block_cache_options:\n");
|
|
|
|
ret.append(table_options_.block_cache->GetPrintableOptions());
|
|
|
|
}
|
2016-12-19 22:00:04 +00:00
|
|
|
snprintf(buffer, kBufferSize, " persistent_cache: %p\n",
|
|
|
|
static_cast<void*>(table_options_.persistent_cache.get()));
|
|
|
|
ret.append(buffer);
|
|
|
|
if (table_options_.persistent_cache) {
|
|
|
|
snprintf(buffer, kBufferSize, " persistent_cache_options:\n");
|
|
|
|
ret.append(buffer);
|
|
|
|
ret.append(table_options_.persistent_cache->GetPrintableOptions());
|
|
|
|
}
|
2022-01-14 19:57:12 +00:00
|
|
|
snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n",
|
2014-08-25 21:24:09 +00:00
|
|
|
table_options_.block_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_size_deviation: %d\n",
|
|
|
|
table_options_.block_size_deviation);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " block_restart_interval: %d\n",
|
|
|
|
table_options_.block_restart_interval);
|
|
|
|
ret.append(buffer);
|
2016-02-05 18:22:37 +00:00
|
|
|
snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n",
|
|
|
|
table_options_.index_block_restart_interval);
|
|
|
|
ret.append(buffer);
|
2017-10-13 21:41:07 +00:00
|
|
|
snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n",
|
|
|
|
table_options_.metadata_block_size);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " partition_filters: %d\n",
|
|
|
|
table_options_.partition_filters);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n",
|
|
|
|
table_options_.use_delta_encoding);
|
|
|
|
ret.append(buffer);
|
2014-08-25 21:24:09 +00:00
|
|
|
snprintf(buffer, kBufferSize, " filter_policy: %s\n",
|
2018-04-13 00:55:14 +00:00
|
|
|
table_options_.filter_policy == nullptr
|
|
|
|
? "nullptr"
|
|
|
|
: table_options_.filter_policy->Name());
|
2014-08-25 21:24:09 +00:00
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
|
|
|
|
table_options_.whole_key_filtering);
|
2015-10-31 01:33:01 +00:00
|
|
|
ret.append(buffer);
|
2017-10-13 21:41:07 +00:00
|
|
|
snprintf(buffer, kBufferSize, " verify_compression: %d\n",
|
|
|
|
table_options_.verify_compression);
|
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n",
|
|
|
|
table_options_.read_amp_bytes_per_bit);
|
|
|
|
ret.append(buffer);
|
2015-01-13 22:33:04 +00:00
|
|
|
snprintf(buffer, kBufferSize, " format_version: %d\n",
|
|
|
|
table_options_.format_version);
|
2014-08-25 21:24:09 +00:00
|
|
|
ret.append(buffer);
|
2018-01-10 23:06:29 +00:00
|
|
|
snprintf(buffer, kBufferSize, " enable_index_compression: %d\n",
|
|
|
|
table_options_.enable_index_compression);
|
|
|
|
ret.append(buffer);
|
2018-03-27 03:14:24 +00:00
|
|
|
snprintf(buffer, kBufferSize, " block_align: %d\n",
|
|
|
|
table_options_.block_align);
|
|
|
|
ret.append(buffer);
|
2021-02-24 00:52:35 +00:00
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
|
|
|
|
table_options_.max_auto_readahead_size);
|
2021-06-18 04:55:42 +00:00
|
|
|
ret.append(buffer);
|
|
|
|
snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n",
|
|
|
|
static_cast<int>(table_options_.prepopulate_block_cache));
|
|
|
|
ret.append(buffer);
|
2022-04-16 00:28:09 +00:00
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
|
|
|
|
table_options_.initial_auto_readahead_size);
|
|
|
|
ret.append(buffer);
|
2022-09-01 18:56:00 +00:00
|
|
|
snprintf(buffer, kBufferSize,
|
|
|
|
" num_file_reads_for_auto_readahead: %" PRIu64 "\n",
|
|
|
|
table_options_.num_file_reads_for_auto_readahead);
|
|
|
|
ret.append(buffer);
|
2014-08-25 21:24:09 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
const void* BlockBasedTableFactory::GetOptionsPtr(
|
|
|
|
const std::string& name) const {
|
|
|
|
if (name == kBlockCacheOpts()) {
|
|
|
|
if (table_options_.no_block_cache) {
|
|
|
|
return nullptr;
|
|
|
|
} else {
|
|
|
|
return table_options_.block_cache.get();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return TableFactory::GetOptionsPtr(name);
|
|
|
|
}
|
2014-11-21 03:24:39 +00:00
|
|
|
}
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
// Take a default BlockBasedTableOptions "table_options" in addition to a
|
|
|
|
// map "opts_map" of option name to option value to construct the new
|
|
|
|
// BlockBasedTableOptions "new_table_options".
|
|
|
|
//
|
|
|
|
// Below are the instructions of how to config some non-primitive-typed
|
|
|
|
// options in BlockBasedTableOptions:
|
|
|
|
//
|
|
|
|
// * filter_policy:
|
|
|
|
// We currently only support the following FilterPolicy in the convenience
|
|
|
|
// functions:
|
|
|
|
// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
|
|
|
|
// to specify BloomFilter. The above string is equivalent to calling
|
|
|
|
// NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
|
|
|
|
// [Example]:
|
|
|
|
// - Pass {"filter_policy", "bloomfilter:4:true"} in
|
|
|
|
// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
|
|
|
|
// per key and use_block_based_builder enabled.
|
|
|
|
//
|
|
|
|
// * block_cache / block_cache_compressed:
|
|
|
|
// We currently only support LRU cache in the GetOptions API. The LRU
|
|
|
|
// cache can be set by directly specifying its size.
|
|
|
|
// [Example]:
|
|
|
|
// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
|
|
|
|
// equivalent to setting block_cache using NewLRUCache(1024 * 1024).
|
|
|
|
//
|
|
|
|
// @param table_options the default options of the output "new_table_options".
|
|
|
|
// @param opts_map an option name to value map for specifying how
|
|
|
|
// "new_table_options" should be set.
|
|
|
|
// @param new_table_options the resulting options based on "table_options"
|
|
|
|
// with the change specified in "opts_map".
|
|
|
|
// @param input_strings_escaped when set to true, each escaped character
|
|
|
|
// prefixed by '\' in the values of the opts_map will be further converted
|
|
|
|
// back to the raw string before assigning to the associated options.
|
|
|
|
// @param ignore_unknown_options when set to true, unknown options are ignored
|
|
|
|
// instead of resulting in an unknown-option error.
|
|
|
|
// @return Status::OK() on success. Otherwise, a non-ok status indicating
|
|
|
|
// error will be returned, and "new_table_options" will be set to
|
|
|
|
// "table_options".
|
|
|
|
// Parses a single option by delegating to TableFactory::ParseOption(), then
// relaxes the result for by-name options.
//
// If the base-class parse fails while config_options.input_strings_escaped is
// set and opt_info.IsByName() is true, the failure is replaced with
// Status::OK(). In every other case the base-class status is returned as-is.
Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
                                           const OptionTypeInfo& opt_info,
                                           const std::string& opt_name,
                                           const std::string& opt_value,
                                           void* opt_ptr) {
  Status parse_result = TableFactory::ParseOption(
      config_options, opt_info, opt_name, opt_value, opt_ptr);

  // !input_strings_escaped indicates the old API, where everything is
  // parsable, so only the escaped-input path tolerates a failed parse, and
  // only for by-name options. The operands short-circuit left to right.
  const bool ignore_failure = config_options.input_strings_escaped &&
                              !parse_result.ok() && opt_info.IsByName();
  if (ignore_failure) {
    parse_result = Status::OK();
  }
  return parse_result;
}
|
|
|
|
|
2020-04-22 00:35:28 +00:00
|
|
|
// Builds "new_table_options" from "table_options" plus the settings encoded
// in the option string "opts_str".
//
// The string is first split into a name->value map with StringToMap(); a
// failure there is returned unchanged. The map is then applied through
// GetBlockBasedTableOptionsFromMap(). Any failure from that step which is not
// already InvalidArgument is re-reported as InvalidArgument carrying the
// original status's state string.
Status GetBlockBasedTableOptionsFromString(
    const ConfigOptions& config_options,
    const BlockBasedTableOptions& table_options, const std::string& opts_str,
    BlockBasedTableOptions* new_table_options) {
  std::unordered_map<std::string, std::string> parsed_opts;
  Status result = StringToMap(opts_str, &parsed_opts);
  if (!result.ok()) {
    return result;
  }

  result = GetBlockBasedTableOptionsFromMap(config_options, table_options,
                                            parsed_opts, new_table_options);

  // Translate any other error (e.g. NotFound, NotSupported) to
  // InvalidArgument; success and InvalidArgument pass through untouched.
  const bool needs_translation = !result.ok() && !result.IsInvalidArgument();
  if (needs_translation) {
    return Status::InvalidArgument(result.getState());
  }
  return result;
}
|
|
|
|
|
2020-04-22 00:35:28 +00:00
|
|
|
// Builds "new_table_options" from "table_options" plus the name->value
// overrides in "opts_map".
//
// A temporary BlockBasedTableFactory seeded with "table_options" is configured
// from the map. On success its resulting options are copied into
// "*new_table_options"; on failure "*new_table_options" is set to a copy of
// "table_options". The status of the configure step is returned either way.
// "new_table_options" must be non-null (asserted).
Status GetBlockBasedTableOptionsFromMap(
    const ConfigOptions& config_options,
    const BlockBasedTableOptions& table_options,
    const std::unordered_map<std::string, std::string>& opts_map,
    BlockBasedTableOptions* new_table_options) {
  assert(new_table_options);

  BlockBasedTableFactory factory(table_options);
  Status configure_status = factory.ConfigureFromMap(config_options, opts_map);

  // Fall back to the caller-supplied defaults unless configuration succeeded.
  const BlockBasedTableOptions* chosen = &table_options;
  if (configure_status.ok()) {
    chosen = factory.GetOptions<BlockBasedTableOptions>();
  }
  *new_table_options = *chosen;

  return configure_status;
}
|
|
|
|
|
2014-01-28 05:58:46 +00:00
|
|
|
// Allocates a BlockBasedTableFactory configured with "_table_options" and
// returns it through the TableFactory base pointer.
//
// The object is created with a plain `new`; nothing here releases it.
// NOTE(review): presumably the caller takes ownership -- confirm at call
// sites.
TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& _table_options) {
  TableFactory* factory = new BlockBasedTableFactory(_table_options);
  return factory;
}
|
|
|
|
|
2014-03-01 02:19:07 +00:00
|
|
|
// Definitions of the string constants declared on
// BlockBasedTablePropertyNames.
const std::string BlockBasedTablePropertyNames::kIndexType(
    "rocksdb.block.based.table.index.type");
const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering(
    "rocksdb.block.based.table.whole.key.filtering");
const std::string BlockBasedTablePropertyNames::kPrefixFiltering(
    "rocksdb.block.based.table.prefix.filtering");

// Names associated with the hash index prefixes block and its metadata block.
const std::string kHashIndexPrefixesBlock("rocksdb.hashindex.prefixes");
const std::string kHashIndexPrefixesMetadataBlock(
    "rocksdb.hashindex.metadata");

// String encodings for true/false.
// NOTE(review): presumably the values stored for the boolean properties
// above -- confirm against the readers/writers of those properties.
const std::string kPropTrue("1");
const std::string kPropFalse("0");
|
2014-03-01 02:19:07 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|