mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 07:30:54 +00:00
62fc15f009
Summary: add option `block_protection_bytes_per_key` and implementation for block per key-value checksum. The main changes are 1. checksum construction and verification in block.cc/h 2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h) 3. unit tests/crash test updates Tests: * Added unit tests * Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576` Follow up (maybe as a separate PR): make sure corruption statuses returned from BlockIters are correctly handled. Performance: Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory. For memory, each block includes an additional 24 bytes for checksum-related states besides the checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32 bytes key and 200 bytes value) which compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates): ``` SETUP make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench ./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none BENCHMARK ./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE The readrandom ops/sec looks like the following: Block cache size: 2GB 1.2GB * 0.9 1.2GB * 0.8 1.2GB * 0.5 8MB Main 240805 223604 198176 161653 139040 PR prot_bytes=0 238691 226693 200127 161082 141153 PR prot_bytes=1 214983 193199 178532 137013 108211 prot_bytes=1 vs -10% -15% -10.8% -15% -23% prot_bytes=0 ``` The benchmark has a lot of 
variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287 Reviewed By: ajkr Differential Revision: D43970708 Pulled By: cbi42 fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
102 lines
3.6 KiB
C++
102 lines
3.6 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
#include "rocksdb/sst_file_reader.h"
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
#include "db/db_iter.h"
|
|
#include "db/dbformat.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "options/cf_options.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/file_system.h"
|
|
#include "table/get_context.h"
|
|
#include "table/table_builder.h"
|
|
#include "table/table_reader.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Internal state of an SstFileReader, reached through SstFileReader::rep_.
//
// NOTE: the constructor's initializer list reads from the `options` MEMBER
// (not the constructor argument) to build the other option structs, so
// `options` must stay declared before `soptions`, `ioptions` and `moptions`;
// members are initialized in declaration order.
struct SstFileReader::Rep {
  // Private copy of the Options passed to the constructor.
  Options options;
  // Built from `options`; used by Open() to build FileOptions and
  // TableReaderOptions.
  EnvOptions soptions;
  // Built from `options`.
  ImmutableOptions ioptions;
  // Built from the column-family portion of `options`.
  MutableCFOptions moptions;

  // Reader for the opened SST file. Empty until Open() fills it in through
  // TableFactory::NewTableReader().
  std::unique_ptr<TableReader> table_reader;

  // `explicit` so an Options object is never implicitly converted to a Rep.
  explicit Rep(const Options& opts)
      : options(opts),
        soptions(options),
        ioptions(options),
        moptions(ColumnFamilyOptions(options)) {}
};
|
|
|
|
// Builds the reader's internal state from `options`. No file is opened here;
// that happens in Open().
SstFileReader::SstFileReader(const Options& options)
    : rep_(new Rep(options)) {}
|
|
|
|
// Defined out of line in this file, below the definition of Rep, so that
// rep_ is destroyed where Rep is a complete type.
SstFileReader::~SstFileReader() = default;
|
|
|
|
// Opens the SST file at `file_path` and installs a TableReader for it in
// rep_->table_reader.
//
// Steps, each of which aborts with the failing status:
//   1. query the file size from the Env's file system,
//   2. open the file for random access,
//   3. ask the configured table factory for a TableReader.
//
// Returns the status of the first failing step, or the status returned by
// NewTableReader() otherwise.
Status SstFileReader::Open(const std::string& file_path) {
  auto* rep = rep_.get();
  FileOptions file_opts(rep->soptions);
  const auto& fs = rep->options.env->GetFileSystem();

  uint64_t size_bytes = 0;
  Status status =
      fs->GetFileSize(file_path, file_opts.io_options, &size_bytes, nullptr);
  if (!status.ok()) {
    return status;
  }

  std::unique_ptr<FSRandomAccessFile> raw_file;
  status = fs->NewRandomAccessFile(file_path, file_opts, &raw_file, nullptr);
  if (!status.ok()) {
    return status;
  }

  std::unique_ptr<RandomAccessFileReader> reader(
      new RandomAccessFileReader(std::move(raw_file), file_path));

  TableReaderOptions table_opts(rep->ioptions, rep->moptions.prefix_extractor,
                                rep->soptions,
                                rep->ioptions.internal_comparator,
                                rep->moptions.block_protection_bytes_per_key);
  // Allow open file with global sequence number for backward compatibility.
  table_opts.largest_seqno = kMaxSequenceNumber;

  return rep->options.table_factory->NewTableReader(
      table_opts, std::move(reader), size_bytes, &rep->table_reader);
}
|
|
|
|
// Creates an iterator over the opened SST file.
//
// An ArenaWrappedDBIter is allocated and initialized, then the table
// reader's own iterator (allocated from the wrapper's arena) is attached
// underneath it. The wrapper is returned as a raw pointer allocated with
// `new`.
//
// The visible sequence number comes from `roptions.snapshot` when one is
// set, and is kMaxSequenceNumber otherwise.
//
// Dereferences rep_->table_reader unconditionally, so Open() must have
// populated it first.
Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) {
  assert(roptions.io_activity == Env::IOActivity::kUnknown);
  auto* rep = rep_.get();

  // Default to "see everything"; narrow to the snapshot if the caller gave one.
  auto visible_seq = kMaxSequenceNumber;
  if (roptions.snapshot != nullptr) {
    visible_seq = roptions.snapshot->GetSequenceNumber();
  }

  ArenaWrappedDBIter* db_iter = new ArenaWrappedDBIter();
  db_iter->Init(rep->options.env, roptions, rep->ioptions, rep->moptions,
                nullptr /* version */, visible_seq,
                rep->moptions.max_sequential_skip_in_iterations,
                0 /* version_number */, nullptr /* read_callback */,
                nullptr /* db_impl */, nullptr /* cfd */,
                true /* expose_blob_index */, false /* allow_refresh */);

  auto table_iter = rep->table_reader->NewIterator(
      db_iter->GetReadOptions(), rep->moptions.prefix_extractor.get(),
      db_iter->GetArena(), false /* skip_filters */,
      TableReaderCaller::kSSTFileReader);
  db_iter->SetIterUnderDBIter(table_iter);

  return db_iter;
}
|
|
|
|
std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
|
|
const {
|
|
return rep_->table_reader->GetTableProperties();
|
|
}
|
|
|
|
// Forwards checksum verification to the underlying table reader, tagging the
// call as coming from the SST file reader, and returns its status.
// Dereferences rep_->table_reader unconditionally, so Open() must have
// populated it first.
Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
  assert(read_options.io_activity == Env::IOActivity::kUnknown);
  const auto& reader = rep_->table_reader;
  return reader->VerifyChecksum(read_options,
                                TableReaderCaller::kSSTFileReader);
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|