Add an option to verify memtable key order during reads (#12889)

Summary:
add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead.

The motivation for this feature is for services where data correctness is critical and that want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889

Test Plan:
* new unit test added for paranoid_memory_checks=true.
* existing unit test for paranoid_memory_checks=false.
* enable in stress test.

Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR:
* Memtable-only randomread ops/sec:
```
(for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000  --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }';

Main: 608146
PR with paranoid_memory_checks=false: 607727 (- %0.07)
PR with paranoid_memory_checks=true: 521889 (-%14.2)
```

* Memtable-only sequential scan ops/sec:
```
(for I in $(seq 1 50); do ./db_bench --benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000  --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }';

Main: 9180077
PR with paranoid_memory_checks=false: 9536241 (+%3.8)
PR with paranoid_memory_checks=true: 7653934 (-%16.6)
```

* Memtable-only reverse scan ops/sec:
```
(for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000  --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }';

 Main: 1285719
 PR with paranoid_memory_checks=false: 1431626 (+%11.3)
 PR with paranoid_memory_checks=true: 811031 (-%36.9)
```

The `readrandom` benchmark shows no regression with `paranoid_memory_checks=false`. The scanning benchmarks show an improvement with the feature disabled that I can't explain.

Reviewed By: pdillinger

Differential Revision: D60414267

Pulled By: cbi42

fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
This commit is contained in:
Changyu Bi 2024-08-19 13:53:25 -07:00 committed by Facebook GitHub Bot
parent 273b3eadf0
commit defd97bc9d
17 changed files with 364 additions and 56 deletions

View File

@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
}
}
TEST_F(DBMemTableTest, IntegrityChecks) {
  // Insert key000000, key000001 and key000002 into a skiplist whose node
  // height is pinned to 1 (the smallest height) via a sync point, then
  // corrupt the middle key in place ("key000001" -> "aey000001") so it sorts
  // before its predecessor. With `paranoid_memory_checks` enabled, point
  // lookups and scans that walk past the corrupted node must surface a
  // Corruption status; key bytes appear in the error message only when
  // `allow_data_in_errors` is set. With the option disabled, reads/scans may
  // return wrong results instead.
  for (bool data_in_errors : {false, true}) {
    Options options = CurrentOptions();
    options.allow_data_in_errors = data_in_errors;
    options.paranoid_memory_checks = true;
    DestroyAndReopen(options);
    // Force every skiplist node to height 1 so bottom-level iteration is the
    // only search path and must visit the corrupted key.
    SyncPoint::GetInstance()->SetCallBack(
        "InlineSkipList::RandomHeight::height",
        [](void* h) { *static_cast<int*>(h) = 1; });
    SyncPoint::GetInstance()->EnableProcessing();
    ASSERT_OK(Put(Key(0), "val0"));
    ASSERT_OK(Put(Key(2), "val2"));
    // key1_buf will point to the buffer holding the encoded key000001.
    char* key1_buf = nullptr;
    SyncPoint::GetInstance()->SetCallBack(
        "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
          key1_buf = const_cast<char*>(static_cast<Slice*>(encoded)->data());
        });
    ASSERT_OK(Put(Key(1), "val1"));
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
    ASSERT_TRUE(key1_buf);
    // Offset 0 holds the key size; the key bytes begin at offset 1.
    // "key000001 -> aey000001"
    key1_buf[1] = 'a';
    ReadOptions read_opts;
    std::string value;
    Status get_status = db_->Get(read_opts, Key(1), &value);
    ASSERT_TRUE(get_status.IsCorruption());
    std::string key0_hex = Slice(Key(0)).ToString(true);
    ASSERT_EQ(get_status.ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    // Without `paranoid_memory_checks`, NotFound will be returned.
    // This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
    // If we remove the assertion, this passes.
    // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &value).IsNotFound());
    std::vector<std::string> values;
    std::vector<Status> statuses = db_->MultiGet(
        read_opts, {db_->DefaultColumnFamily()}, {Key(1)}, &values, nullptr);
    ASSERT_TRUE(statuses[0].IsCorruption());
    ASSERT_EQ(statuses[0].ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    std::unique_ptr<Iterator> it{db_->NewIterator(read_opts)};
    ASSERT_OK(it->status());
    it->Seek(Key(1));
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(it->status().ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    it->Seek(Key(0));
    ASSERT_TRUE(it->Valid());
    ASSERT_OK(it->status());
    // Iterating through the skiplist at height 1 should catch the
    // out-of-order key pair.
    it->Next();
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(it->status().ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    ASSERT_FALSE(it->Valid());
    it->SeekForPrev(Key(2));
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(it->status().ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    // Internally DB Iter will iterate backwards (call Prev()) after
    // SeekToLast() to find the correct internal key with the last user key.
    // Prev() will do integrity checks and catch corruption.
    it->SeekToLast();
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(it->status().ToString().find(key0_hex) != std::string::npos,
              data_in_errors);
    ASSERT_FALSE(it->Valid());
  }
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
statistics(ioptions.stats),
merge_operator(ioptions.merge_operator.get()),
info_log(ioptions.logger),
allow_data_in_errors(ioptions.allow_data_in_errors),
protection_bytes_per_key(
mutable_cf_options.memtable_protection_bytes_per_key) {}
mutable_cf_options.memtable_protection_bytes_per_key),
allow_data_in_errors(ioptions.allow_data_in_errors),
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableOptions& ioptions,
@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_),
valid_(false),
seqno_to_time_mapping_(seqno_to_time_mapping),
arena_mode_(arena != nullptr),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
status_(Status::OK()),
logger_(mem.moptions_.info_log),
ts_sz_(mem.ts_sz_) {
ts_sz_(mem.ts_sz_),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
valid_(false),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
} else {
delete iter_;
}
status_.PermitUncheckedError();
}
#ifndef NDEBUG
@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif
bool Valid() const override { return valid_ && status_.ok(); }
bool Valid() const override {
// If inner iter_ is not valid, then this iter should also not be valid.
assert(iter_->Valid() || !(valid_ && status_.ok()));
return valid_ && status_.ok();
}
void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
// iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekForPrev(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) {
@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
if (!Valid() && status().ok()) {
@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
}
}
void SeekToFirst() override {
status_ = Status::OK();
iter_->SeekToFirst();
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekToLast() override {
status_ = Status::OK();
iter_->SeekToLast();
valid_ = iter_->Valid();
VerifyEntryChecksum();
@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
void Next() override {
PERF_COUNTER_ADD(next_on_memtable_count, 1);
assert(Valid());
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
if (paranoid_memory_checks_) {
status_ = iter_->NextAndValidate(allow_data_in_error);
} else {
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
void Prev() override {
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
assert(Valid());
iter_->Prev();
if (paranoid_memory_checks_) {
status_ = iter_->PrevAndValidate(allow_data_in_error);
} else {
iter_->Prev();
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
const SliceTransform* const prefix_extractor_;
const MemTable::KeyComparator comparator_;
MemTableRep::Iterator* iter_;
bool valid_;
// The seqno to time mapping is owned by the SuperVersion.
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
bool arena_mode_;
bool value_pinned_;
uint32_t protection_bytes_per_key_;
Status status_;
Logger* logger_;
size_t ts_sz_;
uint32_t protection_bytes_per_key_;
bool valid_;
bool value_pinned_;
bool arena_mode_;
const bool paranoid_memory_checks_;
const bool allow_data_in_error;
void VerifyEntryChecksum() {
if (protection_bytes_per_key_ > 0 && Valid()) {
@ -1355,7 +1386,18 @@ void MemTable::GetFromTable(const LookupKey& key,
saver.do_merge = do_merge;
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
table_->Get(key, &saver, SaveValue);
if (!moptions_.paranoid_memory_checks) {
table_->Get(key, &saver, SaveValue);
} else {
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
moptions_.allow_data_in_errors);
if (check_s.IsCorruption()) {
*(saver.status) = check_s;
// Should stop searching the LSM.
*(saver.found_final_value) = true;
}
}
assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
*seq = saver.seq;
}

View File

@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
Statistics* statistics;
MergeOperator* merge_operator;
Logger* info_log;
bool allow_data_in_errors;
uint32_t protection_bytes_per_key;
bool allow_data_in_errors;
bool paranoid_memory_checks;
};
// Batched counters to updated when inserting keys in one write batch.
@ -266,6 +267,11 @@ class MemTable {
// If do_merge = false then any Merge Operands encountered for key are simply
// stored in merge_context.operands_list and never actually merged to get a
// final value. The raw Merge Operands are eventually returned to the user.
// @param value If not null and memtable contains a value for key, `value`
// will be set to the result value.
// @param column If not null and memtable contains a value/WideColumn for key,
// `column` will be set to the result value/WideColumn.
// Note: only one of `value` and `column` can be non-nullptr.
// @param immutable_memtable Whether this memtable is immutable. Used
// internally by NewRangeTombstoneIterator(). See comment above
// NewRangeTombstoneIterator() for more detail.

View File

@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
DECLARE_string(last_level_temperature);
DECLARE_string(default_write_temperature);
DECLARE_string(default_temperature);
DECLARE_bool(paranoid_memory_checks);
// Options for transaction dbs.
// Use TransactionDB (a.k.a. Pessimistic Transaction DB)

View File

@ -1448,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
"obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
"normal max");
DEFINE_bool(paranoid_memory_checks,
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
"Sets CF option paranoid_memory_checks.");
#endif // GFLAGS

View File

@ -4055,6 +4055,7 @@ void InitializeOptionsFromFlags(
options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;

View File

@ -1090,6 +1090,13 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through the SetOptions() API.
uint32_t bottommost_file_compaction_delay = 0;
// Enables additional integrity checks during reads/scans.
// Specifically, for skiplist-based memtables, we verify that keys visited
// are in order. This is helpful to detect corrupted memtable keys during
// reads. Enabling this feature incurs a performance overhead due to an
// additional key comparison during memtable lookup.
bool paranoid_memory_checks = false;
// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options

View File

@ -194,6 +194,15 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));
// Same as Get() but additionally validates the integrity of the entries
// visited during the lookup (e.g. key ordering for skiplist-based
// memtables). Returns Corruption when a violation is detected; key bytes
// are included in the error message only when `allow_data_in_error` is
// true. The base implementation returns NotSupported — memtable reps that
// opt into paranoid checks must override this.
virtual Status GetAndValidate(const LookupKey& /* k */,
                              void* /* callback_args */,
                              bool (* /* callback_func */)(void* arg,
                                                           const char* entry),
                              bool /*allow_data_in_error*/) {
  return Status::NotSupported("GetAndValidate() not implemented.");
}
virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
const Slice& /*end_key*/) {
return 0;
@ -235,13 +244,38 @@ class MemTableRep {
// REQUIRES: Valid()
virtual void Next() = 0;
// Advances to the next position and performs integrity validations on the
// skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found. The base implementation returns NotSupported;
// reps that support validation must override it.
// REQUIRES: Valid()
virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
  return Status::NotSupported("NextAndValidate() not implemented.");
}
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Advances to the previous position and performs integrity validations on
// the skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found. The base implementation returns NotSupported;
// reps that support validation must override it.
// REQUIRES: Valid()
virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
  return Status::NotSupported("PrevAndValidate() not implemented.");
}
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
// Seek and perform integrity validations on the skip list.
// Iterator becomes invalid and Corruption is returned if a
// corruption is found. The base implementation returns NotSupported;
// reps that support validation must override it.
virtual Status SeekAndValidate(const Slice& /* internal_key */,
                               const char* /* memtable_key */,
                               bool /* allow_data_in_errors */) {
  return Status::NotSupported("SeekAndValidate() not implemented.");
}
// retreat to the first entry with a key <= target
virtual void SeekForPrev(const Slice& internal_key,
const char* memtable_key) = 0;

View File

@ -52,6 +52,7 @@
#include "port/likely.h"
#include "port/port.h"
#include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/random.h"
@ -169,13 +170,20 @@ class InlineSkipList {
// REQUIRES: Valid()
void Next();
[[nodiscard]] Status NextAndValidate(bool allow_data_in_errors);
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
[[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors);
// Advance to the first entry with a key >= target
void Seek(const char* target);
[[nodiscard]] Status SeekAndValidate(const char* target,
bool allow_data_in_errors);
// Retreat to the last entry with a key <= target
void SeekForPrev(const char* target);
@ -237,21 +245,20 @@ class InlineSkipList {
bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
// Returns the earliest node with a key >= key.
// Return nullptr if there is no such node.
Node* FindGreaterOrEqual(const char* key) const;
// Returns nullptr if there is no such node.
// @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// returned and *out_of_order_node will be set to n2.
Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
// Returns the latest node with a key < key.
// Returns head_ if there is no such node.
// Fills prev[level] with pointer to previous node at "level" for every
// level in [0..max_height_-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev = nullptr) const;
// Return the latest node with a key < key on bottom_level. Start searching
// from root node on the level below top_level.
// Fills prev[level] with pointer to previous node at "level" for every
// level in [bottom_level..top_level-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level,
int bottom_level) const;
// @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// returned and *out_of_order_node will be set to n2.
Node* FindLessThan(const char* key, Node** out_of_order_node) const;
// Return the last node in the list.
// Return head_ if list is empty.
@ -274,6 +281,8 @@ class InlineSkipList {
// lowest_level (inclusive).
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
int recompute_level);
static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
};
// Implementation details follow
@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() {
node_ = node_->Next(0);
}
// Same as Next() but verifies that the key order between the departed node
// and the new node is strictly increasing. On a violation the iterator is
// invalidated (node_ = nullptr) and Corruption is returned; key bytes are
// included in the message only when allow_data_in_errors is true.
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
    bool allow_data_in_errors) {
  assert(Valid());
  Node* prev_node = node_;
  node_ = node_->Next(0);
  // Verify that keys are increasing. head_ carries no key and the end of the
  // list has no successor, so both are excluded from the comparison.
  if (prev_node != list_->head_ && node_ != nullptr &&
      list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
    Node* node = node_;
    // invalidates the iterator
    node_ = nullptr;
    return Corruption(prev_node, node, allow_data_in_errors);
  }
  return Status::OK();
}
template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->Key());
node_ = list_->FindLessThan(node_->Key(), nullptr);
if (node_ == list_->head_) {
node_ = nullptr;
}
}
// Same as Prev(): since nodes have no back links, the predecessor is found
// by re-searching from the top of the list. The search itself validates the
// order of nodes it visits; on a violation the iterator is invalidated and
// Corruption is returned.
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
    const bool allow_data_in_errors) {
  assert(Valid());
  // Skip list validation is done in FindLessThan().
  Node* out_of_order_node = nullptr;
  node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
  if (out_of_order_node) {
    Node* node = node_;
    // Invalidate the iterator before reporting the corruption.
    node_ = nullptr;
    return Corruption(node, out_of_order_node, allow_data_in_errors);
  }
  // FindLessThan() returns head_ when no smaller key exists; map that to the
  // normal "iterator exhausted" state.
  if (node_ == list_->head_) {
    node_ = nullptr;
  }
  return Status::OK();
}
template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
node_ = list_->FindGreaterOrEqual(target);
node_ = list_->FindGreaterOrEqual(target, nullptr);
}
// Same as Seek() but validates the order of the nodes visited during the
// search (done inside FindGreaterOrEqual()). On a violation the iterator is
// invalidated and Corruption is returned; key bytes are included in the
// message only when allow_data_in_errors is true.
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
    const char* target, const bool allow_data_in_errors) {
  Node* out_of_order_node = nullptr;
  node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
  if (out_of_order_node) {
    Node* node = node_;
    // Invalidate the iterator before reporting the corruption.
    node_ = nullptr;
    return Corruption(node, out_of_order_node, allow_data_in_errors);
  }
  return Status::OK();
}
template <class Comparator>
@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() {
rnd->Next() < kScaledInverseBranching_) {
height++;
}
TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
assert(height > 0);
assert(height <= kMaxHeight_);
assert(height <= kMaxPossibleHeight);
@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
InlineSkipList<Comparator>::FindGreaterOrEqual(
const char* key, Node** const out_of_order_node) const {
// Note: It looks like we could reduce duplication by implementing
// this function as FindLessThan(key)->Next(0), but we wouldn't be able
// to exit early on equality and the result wouldn't even be correct.
@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
Node* next = x->Next(level);
if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
}
// Make sure the lists are sorted
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const {
return FindLessThan(key, prev, head_, GetMaxHeight(), 0);
}
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* root, int top_level,
int bottom_level) const {
assert(top_level > bottom_level);
int level = top_level - 1;
Node* x = root;
InlineSkipList<Comparator>::FindLessThan(const char* key,
Node** const out_of_order_node) const {
int level = GetMaxHeight() - 1;
assert(level >= 0);
Node* x = head_;
// KeyIsAfter(key, last_not_after) is definitely false
Node* last_not_after = nullptr;
const DecodedKey key_decoded = compare_.decode_key(key);
@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* next = x->Next(level);
if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
}
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
assert(x == head_ || KeyIsAfterNode(key_decoded, x));
@ -537,10 +599,7 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
assert(next != nullptr);
x = next;
} else {
if (prev != nullptr) {
prev[level] = x;
}
if (level == bottom_level) {
if (level == 0) {
return x;
} else {
// Switch to next list, reuse KeyIsAfterNode() result
@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
template <class Comparator>
bool InlineSkipList<Comparator>::Contains(const char* key) const {
Node* x = FindGreaterOrEqual(key);
Node* x = FindGreaterOrEqual(key, nullptr);
if (x != nullptr && Equal(key, x->Key())) {
return true;
} else {
@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const {
}
}
// Builds the Corruption status reported when an out-of-order node pair
// (prev, next) is found. The offending key bytes (hex-encoded) are appended
// only when the caller permits data in error messages.
template <class Comparator>
Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
                                              bool allow_data_in_errors) {
  std::string msg{"Out-of-order keys found in skiplist."};
  if (allow_data_in_errors) {
    msg += " prev key: ";
    msg += Slice(prev->Key()).ToString(true);
    msg += " next key: ";
    msg += Slice(next->Key()).ToString(true);
  }
  return Status::Corruption(msg);
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep {
}
}
// Same as Get() but verifies the skiplist's key ordering while walking the
// matching entries; stops and returns Corruption on the first violation.
Status GetAndValidate(const LookupKey& k, void* callback_args,
                      bool (*callback_func)(void* arg, const char* entry),
                      bool allow_data_in_errors) override {
  SkipListRep::Iterator it(&skip_list_);
  // A pre-encoded memtable key is supplied, so the user-key slice argument
  // is unused.
  Slice unused_user_key;
  Status s = it.SeekAndValidate(unused_user_key, k.memtable_key().data(),
                                allow_data_in_errors);
  // Keep invoking the callback until it asks to stop, the iterator is
  // exhausted, or a validation failure is detected.
  while (s.ok() && it.Valid() && callback_func(callback_args, it.key())) {
    s = it.NextAndValidate(allow_data_in_errors);
  }
  return s;
}
uint64_t ApproximateNumEntries(const Slice& start_ikey,
const Slice& end_ikey) override {
std::string tmp;
@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep {
// Returns the key at the current position.
// REQUIRES: Valid()
const char* key() const override { return iter_.key(); }
const char* key() const override {
assert(Valid());
return iter_.key();
}
// Advances to the next position.
// REQUIRES: Valid()
void Next() override { iter_.Next(); }
void Next() override {
assert(Valid());
iter_.Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
void Prev() override { iter_.Prev(); }
void Prev() override {
assert(Valid());
iter_.Prev();
}
// Advance to the first entry with a key >= target
void Seek(const Slice& user_key, const char* memtable_key) override {
@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep {
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast() override { iter_.SeekToLast(); }
// Advances to the next entry while verifying skiplist key ordering;
// returns Corruption and invalidates the iterator on a violation.
// REQUIRES: Valid()
Status NextAndValidate(bool allow_data_in_errors) override {
  assert(Valid());
  return iter_.NextAndValidate(allow_data_in_errors);
}
// Same as Seek() but validates node ordering along the search path.
// A pre-encoded memtable_key, when supplied, takes precedence over
// user_key (which is then encoded into tmp_).
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
                       bool allow_data_in_errors) override {
  if (memtable_key != nullptr) {
    return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
  } else {
    return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
                                 allow_data_in_errors);
  }
}
// Moves to the previous entry while verifying skiplist key ordering;
// returns Corruption and invalidates the iterator on a violation.
// REQUIRES: Valid()
// Parameter renamed from `allow_data_in_error` to `allow_data_in_errors`
// for consistency with the sibling *AndValidate() overrides and the base
// class declaration (caller-invisible change).
Status PrevAndValidate(bool allow_data_in_errors) override {
  assert(Valid());
  return iter_.PrevAndValidate(allow_data_in_errors);
}
protected:
std::string tmp_; // For passing to EncodeKey
};

View File

@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
OptionType::kUInt8T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"paranoid_memory_checks",
{offsetof(struct MutableCFOptions, paranoid_memory_checks),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{kOptNameCompOpts,
OptionTypeInfo::Struct(
kOptNameCompOpts, &compression_options_type_info,
@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const {
ttl);
ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64,
periodic_compaction_seconds);
ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d",
paranoid_memory_checks);
std::string result;
char buf[10];
for (const auto m : max_bytes_for_level_multiplier_additional) {

View File

@ -168,6 +168,7 @@ struct MutableCFOptions {
memtable_protection_bytes_per_key(
options.memtable_protection_bytes_per_key),
block_protection_bytes_per_key(options.block_protection_bytes_per_key),
paranoid_memory_checks(options.paranoid_memory_checks),
sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level),
@ -317,6 +318,7 @@ struct MutableCFOptions {
Temperature default_write_temperature;
uint32_t memtable_protection_bytes_per_key;
uint8_t block_protection_bytes_per_key;
bool paranoid_memory_checks;
uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level;

View File

@ -213,6 +213,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
moptions.memtable_protection_bytes_per_key;
cf_opts->block_protection_bytes_per_key =
moptions.block_protection_bytes_per_key;
cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
cf_opts->bottommost_file_compaction_delay =
moptions.bottommost_file_compaction_delay;

View File

@ -568,7 +568,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"block_protection_bytes_per_key=1;"
"memtable_max_range_deletions=999999;"
"bottommost_file_compaction_delay=7200;"
"uncache_aggressiveness=1234;",
"uncache_aggressiveness=1234;"
"paranoid_memory_checks=1;",
new_options));
ASSERT_NE(new_options->blob_cache.get(), nullptr);

View File

@ -1280,6 +1280,9 @@ DEFINE_bool(
auto_readahead_size, false,
"When set true, RocksDB does auto tuning of readahead size during Scans");
DEFINE_bool(paranoid_memory_checks, false,
"Sets CF option paranoid_memory_checks");
static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
const char* ctype) {
assert(ctype);
@ -4739,6 +4742,7 @@ class Benchmark {
FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key =
FLAGS_block_protection_bytes_per_key;
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
}
void InitializeOptionsGeneral(Options* opts) {

View File

@ -340,6 +340,7 @@ default_params = {
"check_multiget_entity_consistency": lambda: random.choice([0, 0, 0, 1]),
"use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
"universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
"paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
}
_TEST_DIR_ENV_VAR = "TEST_TMPDIR"
# If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR

View File

@ -0,0 +1 @@
* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans.