rocksdb/db/convenience.cc
Changyu Bi 62fc15f009 Block per key-value checksum (#11287)
Summary:
add option `block_protection_bytes_per_key` and implementation for block per key-value checksum. The main changes are
1. checksum construction and verification in block.cc/h
2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit tests/crash test updates

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow up (maybe as a separate PR): make sure corruption status returned from BlockIters are correctly handled.

Performance:
Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory.
For memory, each block includes additional 24 bytes for checksum-related states beside checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32 bytes key and 200 bytes value) which compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache size (to mimic various cache hit rates):

```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:
Block cache size:  2GB        1.2GB * 0.9    1.2GB * 0.8     1.2GB * 0.5   8MB
Main              240805     223604         198176           161653       139040
PR prot_bytes=0   238691     226693         200127           161082       141153
PR prot_bytes=1   214983     193199         178532           137013       108211
prot_bytes=1 vs    -10%        -15%          -10.8%          -15%        -23%
prot_bytes=0
```

The benchmark has a lot of variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287

Reviewed By: ajkr

Differential Revision: D43970708

Pulled By: cbi42

fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
2023-04-25 12:08:23 -07:00

82 lines
3.2 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "rocksdb/convenience.h"
#include "db/db_impl/db_impl.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
void CancelAllBackgroundWork(DB* db, bool wait) {
(static_cast_with_check<DBImpl>(db->GetRootDB()))
->CancelAllBackgroundWork(wait);
}
Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool include_end) {
RangePtr range(begin, end);
return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
}
Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
const RangePtr* ranges, size_t n, bool include_end) {
return (static_cast_with_check<DBImpl>(db->GetRootDB()))
->DeleteFilesInRanges(column_family, ranges, n, include_end);
}
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const std::string& file_path) {
// TODO: plumb Env::IOActivity
const ReadOptions read_options;
return VerifySstFileChecksum(options, env_options, read_options, file_path);
}
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const ReadOptions& read_options,
const std::string& file_path,
const SequenceNumber& largest_seqno) {
std::unique_ptr<FSRandomAccessFile> file;
uint64_t file_size;
InternalKeyComparator internal_comparator(options.comparator);
ImmutableOptions ioptions(options);
Status s = ioptions.fs->NewRandomAccessFile(
file_path, FileOptions(env_options), &file, nullptr);
if (s.ok()) {
s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
} else {
return s;
}
std::unique_ptr<TableReader> table_reader;
std::unique_ptr<RandomAccessFileReader> file_reader(
new RandomAccessFileReader(
std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
ioptions.stats /* stats */,
Histograms::SST_READ_MICROS /* hist_type */,
nullptr /* file_read_hist */, ioptions.rate_limiter.get()));
const bool kImmortal = true;
auto reader_options = TableReaderOptions(
ioptions, options.prefix_extractor, env_options, internal_comparator,
options.block_protection_bytes_per_key, false /* skip_filters */,
!kImmortal, false /* force_direct_prefetch */, -1 /* level */);
reader_options.largest_seqno = largest_seqno;
s = ioptions.table_factory->NewTableReader(
reader_options, std::move(file_reader), file_size, &table_reader,
false /* prefetch_index_and_filter_in_cache */);
if (!s.ok()) {
return s;
}
s = table_reader->VerifyChecksum(read_options,
TableReaderCaller::kUserVerifyChecksum);
return s;
}
} // namespace ROCKSDB_NAMESPACE