Mirror of https://github.com/facebook/rocksdb.git, synced 2024-11-30 13:41:46 +00:00
62fc15f009
Summary: add option `block_protection_bytes_per_key` and implementation for block per key-value checksum. The main changes are
1. checksum construction and verification in block.cc/h
2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit tests/crash test updates

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow up (maybe as a separate PR): make sure corruption statuses returned from BlockIters are correctly handled.

Performance: Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory. For memory, each block includes an additional 24 bytes of checksum-related state besides the checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32 bytes key and 200 bytes value) which compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates):

```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:
Block cache size:   2GB       1.2GB * 0.9   1.2GB * 0.8   1.2GB * 0.5   8MB
Main                240805    223604        198176        161653        139040
PR prot_bytes=0     238691    226693        200127        161082        141153
PR prot_bytes=1     214983    193199        178532        137013        108211
prot_bytes=1 vs     -10%      -15%          -10.8%        -15%          -23%
prot_bytes=0
```

The benchmark has a lot of variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287
Reviewed By: ajkr
Differential Revision: D43970708
Pulled By: cbi42
fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
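For readers who want to try the new knob, here is a minimal sketch of enabling it when opening a DB. It assumes the option is exposed on `ColumnFamilyOptions` (and therefore `Options`) under the name added by this PR, `block_protection_bytes_per_key`; the DB path and the rest of the setup are illustrative only.

```
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Ask for 1 byte of protection per key-value pair (the setting exercised by
  // the crash test and benchmark above); 0 keeps the feature disabled.
  options.block_protection_bytes_per_key = 1;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/block_protection_demo", &db);
  assert(s.ok());

  // Write a key and flush it so the subsequent Get goes through an SST block;
  // per the summary above, the per key-value checksums live alongside the
  // in-memory block and are verified on lookup.
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  delete db;
  return 0;
}
```

If a stored checksum does not match on read, the lookup is expected to surface a corruption status, which is what the follow-up item about BlockIter corruption handling refers to.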
131 lines
4.9 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "util/coro_utils.h"

#if defined(WITHOUT_COROUTINES) || \
    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))

namespace ROCKSDB_NAMESPACE {

#if defined(WITHOUT_COROUTINES)
#endif

// Batched version of TableCache::MultiGet.
DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
 const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
 uint8_t block_protection_bytes_per_key,
 const std::shared_ptr<const SliceTransform>& prefix_extractor,
 HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
 int level, TypedHandle* handle) {
  auto& fd = file_meta.fd;
  Status s;
  TableReader* t = fd.table_reader;
  MultiGetRange table_range(*mget_range, mget_range->begin(),
                            mget_range->end());
  if (handle != nullptr && t == nullptr) {
    t = cache_.Value(handle);
  }
  autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
  IterKey row_cache_key;
  size_t row_cache_key_prefix_size = 0;
  KeyContext& first_key = *table_range.begin();
  bool lookup_row_cache =
      ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();

  // Check row cache if enabled. Since row cache does not currently store
  // sequence numbers, we cannot use it if we need to fetch the sequence.
  if (lookup_row_cache) {
    GetContext* first_context = first_key.get_context;
    CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
                            row_cache_key);
    row_cache_key_prefix_size = row_cache_key.Size();

    for (auto miter = table_range.begin(); miter != table_range.end();
         ++miter) {
      const Slice& user_key = miter->ukey_with_ts;

      GetContext* get_context = miter->get_context;

      if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
                          get_context)) {
        table_range.SkipKey(miter);
      } else {
        row_cache_entries.emplace_back();
        get_context->SetReplayLog(&(row_cache_entries.back()));
      }
    }
  }

  // Check that table_range is not empty. It's possible all keys may have been
  // found in the row cache and thus the range may now be empty.
  if (s.ok() && !table_range.empty()) {
    if (t == nullptr) {
      assert(handle == nullptr);
      s = FindTable(options, file_options_, internal_comparator, file_meta,
                    &handle, block_protection_bytes_per_key, prefix_extractor,
                    options.read_tier == kBlockCacheTier /* no_io */,
                    true /* record_read_stats */, file_read_hist, skip_filters,
                    level, true /* prefetch_index_and_filter_in_cache */,
                    0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
      TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
      if (s.ok()) {
        t = cache_.Value(handle);
        assert(t);
      }
    }
    if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) {
      UpdateRangeTombstoneSeqnums(options, t, table_range);
    }
    if (s.ok()) {
      CO_AWAIT(t->MultiGet)
      (options, &table_range, prefix_extractor.get(), skip_filters);
    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
      for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
        Status* status = iter->s;
        if (status->IsIncomplete()) {
          // Couldn't find Table in cache but treat as kFound if no_io set
          iter->get_context->MarkKeyMayExist();
          s = Status::OK();
        }
      }
    }
  }

  if (lookup_row_cache) {
    size_t row_idx = 0;
    RowCacheInterface row_cache{ioptions_.row_cache.get()};

    for (auto miter = table_range.begin(); miter != table_range.end();
         ++miter) {
      std::string& row_cache_entry = row_cache_entries[row_idx++];
      const Slice& user_key = miter->ukey_with_ts;
      GetContext* get_context = miter->get_context;

      get_context->SetReplayLog(nullptr);
      // Compute row cache key.
      row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
                               user_key.size());
      // Put the replay log in row cache only if something was found.
      if (s.ok() && !row_cache_entry.empty()) {
        size_t charge = row_cache_entry.capacity() + sizeof(std::string);
        auto row_ptr = new std::string(std::move(row_cache_entry));
        // If row cache is full, it's OK.
        row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge)
            .PermitUncheckedError();
      }
    }
  }

  if (handle != nullptr) {
    cache_.Release(handle);
  }
  CO_RETURN s;
}
}  // namespace ROCKSDB_NAMESPACE
#endif