mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 11:43:49 +00:00
62fc15f009
Summary: add option `block_protection_bytes_per_key` and implementation for block per key-value checksum. The main changes are:
1. checksum construction and verification in block.cc/h
2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit tests/crash test updates

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow up (maybe as a separate PR): make sure corruption status returned from BlockIters is correctly handled.

Performance: Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory. For memory, each block includes an additional 24 bytes for checksum-related states besides the checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32 bytes key and 200 bytes value) which compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates):
```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:
Block cache size:  2GB      1.2GB * 0.9   1.2GB * 0.8   1.2GB * 0.5   8MB
Main               240805   223604        198176        161653        139040
PR prot_bytes=0    238691   226693        200127        161082        141153
PR prot_bytes=1    214983   193199        178532        137013        108211
prot_bytes=1 vs    -10%     -15%          -10.8%        -15%          -23%
prot_bytes=0
```
The benchmark has a lot of variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11287
Reviewed By: ajkr
Differential Revision: D43970708
Pulled By: cbi42
fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
94 lines
3.1 KiB
C++
94 lines
3.1 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
#pragma once
|
|
|
|
#include <memory>
|
|
|
|
#include "db/version_edit.h"
|
|
#include "rocksdb/file_system.h"
|
|
#include "rocksdb/metadata.h"
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Forward declarations of types that this header only names in pointer,
// reference, or smart-pointer form.
struct FileMetaData;
struct ImmutableCFOptions;

class CacheReservationManager;
class ColumnFamilyData;
class InternalStats;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class VersionStorageInfo;
|
|
|
|
// A helper class so we can efficiently apply a whole sequence
|
|
// of edits to a particular state without creating intermediate
|
|
// Versions that contain full copies of the intermediate state.
|
|
class VersionBuilder {
|
|
public:
|
|
VersionBuilder(const FileOptions& file_options,
|
|
const ImmutableCFOptions* ioptions, TableCache* table_cache,
|
|
VersionStorageInfo* base_vstorage, VersionSet* version_set,
|
|
std::shared_ptr<CacheReservationManager>
|
|
file_metadata_cache_res_mgr = nullptr);
|
|
~VersionBuilder();
|
|
|
|
bool CheckConsistencyForNumLevels();
|
|
Status Apply(const VersionEdit* edit);
|
|
Status SaveTo(VersionStorageInfo* vstorage) const;
|
|
Status LoadTableHandlers(
|
|
InternalStats* internal_stats, int max_threads,
|
|
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
|
|
uint8_t block_protection_bytes_per_key);
|
|
uint64_t GetMinOldestBlobFileNumber() const;
|
|
|
|
private:
|
|
class Rep;
|
|
std::unique_ptr<Rep> rep_;
|
|
};
|
|
|
|
// A wrapper of version builder which references the current version in
|
|
// constructor and unref it in the destructor.
|
|
// Both of the constructor and destructor need to be called inside DB Mutex.
|
|
class BaseReferencedVersionBuilder {
|
|
public:
|
|
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
|
|
BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
|
|
~BaseReferencedVersionBuilder();
|
|
VersionBuilder* version_builder() const { return version_builder_.get(); }
|
|
|
|
private:
|
|
std::unique_ptr<VersionBuilder> version_builder_;
|
|
Version* version_;
|
|
};
|
|
|
|
class NewestFirstBySeqNo {
|
|
public:
|
|
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
|
|
assert(lhs);
|
|
assert(rhs);
|
|
|
|
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
|
|
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
|
|
}
|
|
|
|
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
|
|
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
|
|
}
|
|
|
|
// Break ties by file number
|
|
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
|
|
}
|
|
};
|
|
} // namespace ROCKSDB_NAMESPACE
|