From defd97bc9d5e86d0e003c6cea49fa4062cbc00ff Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Mon, 19 Aug 2024 13:53:25 -0700 Subject: [PATCH] Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. 
For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench --benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with paranoid_memory_checks=false: 1431626 (+%11.3) PR with paranoid_memory_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. 
Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91 --- db/db_memtable_test.cc | 85 +++++++++++ db/memtable.cc | 80 ++++++++--- db/memtable.h | 8 +- db_stress_tool/db_stress_common.h | 1 + db_stress_tool/db_stress_gflags.cc | 4 + db_stress_tool/db_stress_test_base.cc | 1 + include/rocksdb/advanced_options.h | 7 + include/rocksdb/memtablerep.h | 34 +++++ memtable/inlineskiplist.h | 133 +++++++++++++----- memtable/skiplistrep.cc | 49 ++++++- options/cf_options.cc | 6 + options/cf_options.h | 2 + options/options_helper.cc | 1 + options/options_settable_test.cc | 3 +- tools/db_bench_tool.cc | 4 + tools/db_crashtest.py | 1 + .../new_features/memtable-paranoid-checks.md | 1 + 17 files changed, 364 insertions(+), 56 deletions(-) create mode 100644 unreleased_history/new_features/memtable-paranoid-checks.md diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 5c8b6db2ba..3f7b029572 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) { } } +TEST_F(DBMemTableTest, IntegrityChecks) { + // We insert keys key000000, key000001 and key000002 into skiplist at fixed + // height 1 (smallest height). Then we corrupt the second key to aey000001 to + // make it smaller. With `paranoid_memory_checks` set to true, if the + // skip list sees key000000 and then aey000001, then it will report out of + // order keys with corruption status. With `paranoid_memory_checks` set + // to false, read/scan may return wrong results. 
+ for (bool allow_data_in_error : {false, true}) { + Options options = CurrentOptions(); + options.allow_data_in_errors = allow_data_in_error; + options.paranoid_memory_checks = true; + DestroyAndReopen(options); + SyncPoint::GetInstance()->SetCallBack( + "InlineSkipList::RandomHeight::height", [](void* h) { + auto height_ptr = static_cast(h); + *height_ptr = 1; + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(0), "val0")); + ASSERT_OK(Put(Key(2), "val2")); + // p will point to the buffer for encoded key000001 + char* p = nullptr; + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) { + p = const_cast(static_cast(encoded)->data()); + }); + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_TRUE(p); + // Offset 0 is key size, key bytes start at offset 1. + // "key000001 -> aey000001" + p[1] = 'a'; + + ReadOptions rops; + std::string val; + Status s = db_->Get(rops, Key(1), &val); + ASSERT_TRUE(s.IsCorruption()); + std::string key0 = Slice(Key(0)).ToString(true); + ASSERT_EQ(s.ToString().find(key0) != std::string::npos, + allow_data_in_error); + // Without `paranoid_memory_checks`, NotFound will be returned. + // This would fail an assertion in InlineSkipList::FindGreaterOrEqual(). + // If we remove the assertion, this passes. 
+ // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound()); + + std::vector vals; + std::vector statuses = db_->MultiGet( + rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr); + ASSERT_TRUE(statuses[0].IsCorruption()); + ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos, + allow_data_in_error); + + std::unique_ptr iter{db_->NewIterator(rops)}; + ASSERT_OK(iter->status()); + iter->Seek(Key(1)); + ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos, + allow_data_in_error); + + iter->Seek(Key(0)); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + // iterating through skip list at height at 1 should catch out-of-order keys + iter->Next(); + ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos, + allow_data_in_error); + ASSERT_FALSE(iter->Valid()); + + iter->SeekForPrev(Key(2)); + ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos, + allow_data_in_error); + + // Internally DB Iter will iterate backwards (call Prev()) after + // SeekToLast() to find the correct internal key with the last user key. + // Prev() will do integrity checks and catch corruption. 
+ iter->SeekToLast(); + ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos, + allow_data_in_error); + ASSERT_FALSE(iter->Valid()); + } +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/memtable.cc b/db/memtable.cc index b1df2ae9c5..ef1184ded4 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( statistics(ioptions.stats), merge_operator(ioptions.merge_operator.get()), info_log(ioptions.logger), - allow_data_in_errors(ioptions.allow_data_in_errors), protection_bytes_per_key( - mutable_cf_options.memtable_protection_bytes_per_key) {} + mutable_cf_options.memtable_protection_bytes_per_key), + allow_data_in_errors(ioptions.allow_data_in_errors), + paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableOptions& ioptions, @@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator { : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), - valid_(false), seqno_to_time_mapping_(seqno_to_time_mapping), - arena_mode_(arena != nullptr), - value_pinned_( - !mem.GetImmutableMemTableOptions()->inplace_update_support), - protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), status_(Status::OK()), logger_(mem.moptions_.info_log), - ts_sz_(mem.ts_sz_) { + ts_sz_(mem.ts_sz_), + protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), + valid_(false), + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support), + arena_mode_(arena != nullptr), + paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), + allow_data_in_error(mem.moptions_.allow_data_in_errors) { if (use_range_del_table) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && @@ -406,6 +409,7 @@ class 
MemTableIterator : public InternalIterator { } else { delete iter_; } + status_.PermitUncheckedError(); } #ifndef NDEBUG @@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator { PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; #endif - bool Valid() const override { return valid_ && status_.ok(); } + bool Valid() const override { + // If inner iter_ is not valid, then this iter should also not be valid. + assert(iter_->Valid() || !(valid_ && status_.ok())); + return valid_ && status_.ok(); + } + void Seek(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); + status_ = Status::OK(); if (bloom_) { // iterator should only use prefix bloom filter Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); @@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator { } } } - iter_->Seek(k, nullptr); + if (paranoid_memory_checks_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + } else { + iter_->Seek(k, nullptr); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } void SeekForPrev(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); + status_ = Status::OK(); if (bloom_) { Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { @@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator { } } } - iter_->Seek(k, nullptr); + if (paranoid_memory_checks_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + } else { + iter_->Seek(k, nullptr); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); if (!Valid() && status().ok()) { @@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator { } } void SeekToFirst() override { + status_ = Status::OK(); iter_->SeekToFirst(); valid_ = iter_->Valid(); VerifyEntryChecksum(); } void SeekToLast() override { + status_ = Status::OK(); 
iter_->SeekToLast(); valid_ = iter_->Valid(); VerifyEntryChecksum(); @@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator { void Next() override { PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); - iter_->Next(); - TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + if (paranoid_memory_checks_) { + status_ = iter_->NextAndValidate(allow_data_in_error); + } else { + iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } @@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator { void Prev() override { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); - iter_->Prev(); + if (paranoid_memory_checks_) { + status_ = iter_->PrevAndValidate(allow_data_in_error); + } else { + iter_->Prev(); + } valid_ = iter_->Valid(); VerifyEntryChecksum(); } @@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator { const SliceTransform* const prefix_extractor_; const MemTable::KeyComparator comparator_; MemTableRep::Iterator* iter_; - bool valid_; // The seqno to time mapping is owned by the SuperVersion. 
UnownedPtr seqno_to_time_mapping_; - bool arena_mode_; - bool value_pinned_; - uint32_t protection_bytes_per_key_; Status status_; Logger* logger_; size_t ts_sz_; + uint32_t protection_bytes_per_key_; + bool valid_; + bool value_pinned_; + bool arena_mode_; + const bool paranoid_memory_checks_; + const bool allow_data_in_error; void VerifyEntryChecksum() { if (protection_bytes_per_key_ > 0 && Valid()) { @@ -1355,7 +1386,18 @@ void MemTable::GetFromTable(const LookupKey& key, saver.do_merge = do_merge; saver.allow_data_in_errors = moptions_.allow_data_in_errors; saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; - table_->Get(key, &saver, SaveValue); + + if (!moptions_.paranoid_memory_checks) { + table_->Get(key, &saver, SaveValue); + } else { + Status check_s = table_->GetAndValidate(key, &saver, SaveValue, + moptions_.allow_data_in_errors); + if (check_s.IsCorruption()) { + *(saver.status) = check_s; + // Should stop searching the LSM. + *(saver.found_final_value) = true; + } + } assert(s->ok() || s->IsMergeInProgress() || *found_final_value); *seq = saver.seq; } diff --git a/db/memtable.h b/db/memtable.h index a2bf354c59..ca0652bc04 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -60,8 +60,9 @@ struct ImmutableMemTableOptions { Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; - bool allow_data_in_errors; uint32_t protection_bytes_per_key; + bool allow_data_in_errors; + bool paranoid_memory_checks; }; // Batched counters to updated when inserting keys in one write batch. @@ -266,6 +267,11 @@ class MemTable { // If do_merge = false then any Merge Operands encountered for key are simply // stored in merge_context.operands_list and never actually merged to get a // final value. The raw Merge Operands are eventually returned to the user. + // @param value If not null and memtable contains a value for key, `value` + // will be set to the result value. 
+ // @param column If not null and memtable contains a value/WideColumn for key, + // `column` will be set to the result value/WideColumn. + // Note: only one of `value` and `column` can be non-nullptr. // @param immutable_memtable Whether this memtable is immutable. Used // internally by NewRangeTombstoneIterator(). See comment above // NewRangeTombstoneIterator() for more detail. diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 08b529d0da..67f82808fe 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -274,6 +274,7 @@ DECLARE_bool(verification_only); DECLARE_string(last_level_temperature); DECLARE_string(default_write_temperature); DECLARE_string(default_temperature); +DECLARE_bool(paranoid_memory_checks); // Options for transaction dbs. // Use TransactionDB (a.k.a. Pessimistic Transaction DB) diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 160095d476..bb2d9d453e 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -1448,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness, "obsolete. 
0 = disabled, 1 = minimum, 100 = moderate, 10000 = " "normal max"); +DEFINE_bool(paranoid_memory_checks, + ROCKSDB_NAMESPACE::Options().paranoid_memory_checks, + "Sets CF option paranoid_memory_checks."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 11998a98da..b8ab0cc4f5 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -4055,6 +4055,7 @@ void InitializeOptionsFromFlags( options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; + options.paranoid_memory_checks = FLAGS_paranoid_memory_checks; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files; diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index cbe1eb52fc..11f971c242 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -1090,6 +1090,13 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API. uint32_t bottommost_file_compaction_delay = 0; + // Enables additional integrity checks during reads/scans. + // Specifically, for skiplist-based memtables, we verify that keys visited + // are in order. This is helpful to detect corrupted memtable keys during + // reads. Enabling this feature incurs a performance overhead due to an + // additional key comparison during memtable lookup. 
+ bool paranoid_memory_checks = false; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index d109a542fe..fd63f127f4 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -194,6 +194,15 @@ class MemTableRep { virtual void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)); + // Same as Get() but performs data integrity validation. + virtual Status GetAndValidate(const LookupKey& /* k */, + void* /* callback_args */, + bool (* /* callback_func */)(void* arg, + const char* entry), + bool /*allow_data_in_error*/) { + return Status::NotSupported("GetAndValidate() not implemented."); + } + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, const Slice& /*end_key*/) { return 0; @@ -235,13 +244,38 @@ class MemTableRep { // REQUIRES: Valid() virtual void Next() = 0; + // Advances to the next position and performs integrity validations on the + // skip list. Iterator becomes invalid and Corruption is returned if a + // corruption is found. + // REQUIRES: Valid() + virtual Status NextAndValidate(bool /* allow_data_in_errors */) { + return Status::NotSupported("NextAndValidate() not implemented."); + } + // Advances to the previous position. // REQUIRES: Valid() virtual void Prev() = 0; + // Advances to the previous position and performs integrity validations on + // the skip list. Iterator becomes invalid and Corruption is returned if a + // corruption is found. + // REQUIRES: Valid() + virtual Status PrevAndValidate(bool /* allow_data_in_errors */) { + return Status::NotSupported("PrevAndValidate() not implemented."); + } + // Advance to the first entry with a key >= target virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + // Seek and perform integrity validations on the skip list. 
+ // Iterator becomes invalid and Corruption is returned if a + // corruption is found. + virtual Status SeekAndValidate(const Slice& /* internal_key */, + const char* /* memtable_key */, + bool /* allow_data_in_errors */) { + return Status::NotSupported("SeekAndValidate() not implemented."); + } + // retreat to the first entry with a key <= target virtual void SeekForPrev(const Slice& internal_key, const char* memtable_key) = 0; diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 8e2d548b43..ceaa246ae6 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -52,6 +52,7 @@ #include "port/likely.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/random.h" @@ -169,13 +170,20 @@ class InlineSkipList { // REQUIRES: Valid() void Next(); + [[nodiscard]] Status NextAndValidate(bool allow_data_in_errors); + // Advances to the previous position. // REQUIRES: Valid() void Prev(); + [[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors); + // Advance to the first entry with a key >= target void Seek(const char* target); + [[nodiscard]] Status SeekAndValidate(const char* target, + bool allow_data_in_errors); + // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); @@ -237,21 +245,20 @@ class InlineSkipList { bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. - // Return nullptr if there is no such node. - Node* FindGreaterOrEqual(const char* key) const; + // Returns nullptr if there is no such node. + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const; - // Return the latest node with a key < key. 
- // Return head_ if there is no such node. + // Returns the latest node with a key < key. + // Returns head_ if there is no such node. // Fills prev[level] with pointer to previous node at "level" for every // level in [0..max_height_-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev = nullptr) const; - - // Return the latest node with a key < key on bottom_level. Start searching - // from root node on the level below top_level. - // Fills prev[level] with pointer to previous node at "level" for every - // level in [bottom_level..top_level-1], if prev is non-null. - Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level, - int bottom_level) const; + // @param out_of_order_node If not null, will validate the order of visited + // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be + // returned and *out_of_order_node will be set to n2. + Node* FindLessThan(const char* key, Node** out_of_order_node) const; // Return the last node in the list. // Return head_ if list is empty. @@ -274,6 +281,8 @@ class InlineSkipList { // lowest_level (inclusive). void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); + + static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors); }; // Implementation details follow @@ -392,20 +401,68 @@ inline void InlineSkipList::Iterator::Next() { node_ = node_->Next(0); } +template +inline Status InlineSkipList::Iterator::NextAndValidate( + bool allow_data_in_errors) { + assert(Valid()); + Node* prev_node = node_; + node_ = node_->Next(0); + // Verify that keys are increasing. 
+ if (prev_node != list_->head_ && node_ != nullptr && + list_->compare_(prev_node->Key(), node_->Key()) >= 0) { + Node* node = node_; + // invalidates the iterator + node_ = nullptr; + return Corruption(prev_node, node, allow_data_in_errors); + } + return Status::OK(); +} + template inline void InlineSkipList::Iterator::Prev() { // Instead of using explicit "prev" links, we just search for the // last node that falls before key. assert(Valid()); - node_ = list_->FindLessThan(node_->Key()); + node_ = list_->FindLessThan(node_->Key(), nullptr); if (node_ == list_->head_) { node_ = nullptr; } } +template +inline Status InlineSkipList::Iterator::PrevAndValidate( + const bool allow_data_in_errors) { + assert(Valid()); + // Skip list validation is done in FindLessThan(). + Node* out_of_order_node = nullptr; + node_ = list_->FindLessThan(node_->Key(), &out_of_order_node); + if (out_of_order_node) { + Node* node = node_; + node_ = nullptr; + return Corruption(node, out_of_order_node, allow_data_in_errors); + } + if (node_ == list_->head_) { + node_ = nullptr; + } + return Status::OK(); +} + template inline void InlineSkipList::Iterator::Seek(const char* target) { - node_ = list_->FindGreaterOrEqual(target); + node_ = list_->FindGreaterOrEqual(target, nullptr); +} + +template +inline Status InlineSkipList::Iterator::SeekAndValidate( + const char* target, const bool allow_data_in_errors) { + Node* out_of_order_node = nullptr; + node_ = list_->FindGreaterOrEqual(target, &out_of_order_node); + if (out_of_order_node) { + Node* node = node_; + node_ = nullptr; + return Corruption(node, out_of_order_node, allow_data_in_errors); + } + return Status::OK(); } template @@ -448,6 +505,7 @@ int InlineSkipList::RandomHeight() { rnd->Next() < kScaledInverseBranching_) { height++; } + TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height); assert(height > 0); assert(height <= kMaxHeight_); assert(height <= kMaxPossibleHeight); @@ -472,7 +530,8 @@ bool 
InlineSkipList::KeyIsAfterNode(const DecodedKey& key, template typename InlineSkipList::Node* -InlineSkipList::FindGreaterOrEqual(const char* key) const { +InlineSkipList::FindGreaterOrEqual( + const char* key, Node** const out_of_order_node) const { // Note: It looks like we could reduce duplication by implementing // this function as FindLessThan(key)->Next(0), but we wouldn't be able // to exit early on equality and the result wouldn't even be correct. @@ -486,6 +545,11 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); + if (out_of_order_node && x != head_ && + compare_(x->Key(), next->Key()) >= 0) { + *out_of_order_node = next; + return x; + } } // Make sure the lists are sorted assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); @@ -509,18 +573,11 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { template typename InlineSkipList::Node* -InlineSkipList::FindLessThan(const char* key, Node** prev) const { - return FindLessThan(key, prev, head_, GetMaxHeight(), 0); -} - -template -typename InlineSkipList::Node* -InlineSkipList::FindLessThan(const char* key, Node** prev, - Node* root, int top_level, - int bottom_level) const { - assert(top_level > bottom_level); - int level = top_level - 1; - Node* x = root; +InlineSkipList::FindLessThan(const char* key, + Node** const out_of_order_node) const { + int level = GetMaxHeight() - 1; + assert(level >= 0); + Node* x = head_; // KeyIsAfter(key, last_not_after) is definitely false Node* last_not_after = nullptr; const DecodedKey key_decoded = compare_.decode_key(key); @@ -529,6 +586,11 @@ InlineSkipList::FindLessThan(const char* key, Node** prev, Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); + if (out_of_order_node && x != head_ && + compare_(x->Key(), next->Key()) >= 0) { + *out_of_order_node = next; + return x; + } } assert(x == head_ || next == 
nullptr || KeyIsAfterNode(next->Key(), x)); assert(x == head_ || KeyIsAfterNode(key_decoded, x)); @@ -537,10 +599,7 @@ InlineSkipList::FindLessThan(const char* key, Node** prev, assert(next != nullptr); x = next; } else { - if (prev != nullptr) { - prev[level] = x; - } - if (level == bottom_level) { + if (level == 0) { return x; } else { // Switch to next list, reuse KeyIsAfterNode() result @@ -999,7 +1058,7 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, template bool InlineSkipList::Contains(const char* key) const { - Node* x = FindGreaterOrEqual(key); + Node* x = FindGreaterOrEqual(key, nullptr); if (x != nullptr && Equal(key, x->Key())) { return true; } else { @@ -1048,4 +1107,14 @@ void InlineSkipList::TEST_Validate() const { } } +template +Status InlineSkipList::Corruption(Node* prev, Node* next, + bool allow_data_in_errors) { + std::string msg = "Out-of-order keys found in skiplist."; + if (allow_data_in_errors) { + msg.append(" prev key: " + Slice(prev->Key()).ToString(true)); + msg.append(" next key: " + Slice(next->Key()).ToString(true)); + } + return Status::Corruption(msg); +} } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index e615ef9f68..3b2f3f4d8d 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep { } } + Status GetAndValidate(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry), + bool allow_data_in_errors) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(), + allow_data_in_errors); + for (; iter.Valid() && status.ok() && + callback_func(callback_args, iter.key()); + status = iter.NextAndValidate(allow_data_in_errors)) { + } + return status; + } + uint64_t ApproximateNumEntries(const Slice& start_ikey, const Slice& end_ikey) override { std::string tmp; @@ -181,15 +195,24 
@@ class SkipListRep : public MemTableRep { // Returns the key at the current position. // REQUIRES: Valid() - const char* key() const override { return iter_.key(); } + const char* key() const override { + assert(Valid()); + return iter_.key(); + } // Advances to the next position. // REQUIRES: Valid() - void Next() override { iter_.Next(); } + void Next() override { + assert(Valid()); + iter_.Next(); + } // Advances to the previous position. // REQUIRES: Valid() - void Prev() override { iter_.Prev(); } + void Prev() override { + assert(Valid()); + iter_.Prev(); + } // Advance to the first entry with a key >= target void Seek(const Slice& user_key, const char* memtable_key) override { @@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep { // Final state of iterator is Valid() iff list is not empty. void SeekToLast() override { iter_.SeekToLast(); } + Status NextAndValidate(bool allow_data_in_errors) override { + assert(Valid()); + return iter_.NextAndValidate(allow_data_in_errors); + } + + Status SeekAndValidate(const Slice& user_key, const char* memtable_key, + bool allow_data_in_errors) override { + if (memtable_key != nullptr) { + return iter_.SeekAndValidate(memtable_key, allow_data_in_errors); + } else { + return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key), + allow_data_in_errors); + } + } + + Status PrevAndValidate(bool allow_data_in_error) override { + assert(Valid()); + return iter_.PrevAndValidate(allow_data_in_error); + } + protected: std::string tmp_; // For passing to EncodeKey }; diff --git a/options/cf_options.cc b/options/cf_options.cc index cc9e630b9c..7f2cd03132 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -531,6 +531,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), OptionType::kUInt8T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"paranoid_memory_checks", + {offsetof(struct MutableCFOptions, paranoid_memory_checks), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, @@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const { ttl); ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); + ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d", + paranoid_memory_checks); std::string result; char buf[10]; for (const auto m : max_bytes_for_level_multiplier_additional) { diff --git a/options/cf_options.h b/options/cf_options.h index 372a0daf54..3a0c3b09a8 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -168,6 +168,7 @@ struct MutableCFOptions { memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), block_protection_bytes_per_key(options.block_protection_bytes_per_key), + paranoid_memory_checks(options.paranoid_memory_checks), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? compression_per_level(options.compression_per_level), @@ -317,6 +318,7 @@ struct MutableCFOptions { Temperature default_write_temperature; uint32_t memtable_protection_bytes_per_key; uint8_t block_protection_bytes_per_key; + bool paranoid_memory_checks; uint64_t sample_for_compression; std::vector compression_per_level; diff --git a/options/options_helper.cc b/options/options_helper.cc index 5cc13f4fe4..ec62dd1f5c 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -213,6 +213,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.memtable_protection_bytes_per_key; cf_opts->block_protection_bytes_per_key = moptions.block_protection_bytes_per_key; + cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks; cf_opts->bottommost_file_compaction_delay = moptions.bottommost_file_compaction_delay; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 31a07373c4..2bf349b1cb 100644 --- 
a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -568,7 +568,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "block_protection_bytes_per_key=1;" "memtable_max_range_deletions=999999;" "bottommost_file_compaction_delay=7200;" - "uncache_aggressiveness=1234;", + "uncache_aggressiveness=1234;" + "paranoid_memory_checks=1;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b4554bface..d51dbf30ad 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1280,6 +1280,9 @@ DEFINE_bool( auto_readahead_size, false, "When set true, RocksDB does auto tuning of readahead size during Scans"); +DEFINE_bool(paranoid_memory_checks, false, + "Sets CF option paranoid_memory_checks"); + static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { assert(ctype); @@ -4739,6 +4742,7 @@ class Benchmark { FLAGS_memtable_protection_bytes_per_key; options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; + options.paranoid_memory_checks = FLAGS_paranoid_memory_checks; } void InitializeOptionsGeneral(Options* opts) { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 9d320ad2cf..c95851310d 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -340,6 +340,7 @@ default_params = { "check_multiget_entity_consistency": lambda: random.choice([0, 0, 0, 1]), "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]), "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]), + "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]), } _TEST_DIR_ENV_VAR = "TEST_TMPDIR" # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR diff --git a/unreleased_history/new_features/memtable-paranoid-checks.md b/unreleased_history/new_features/memtable-paranoid-checks.md new file mode 100644 index 0000000000..e2a4510dcf --- /dev/null +++ 
b/unreleased_history/new_features/memtable-paranoid-checks.md @@ -0,0 +1 @@ +* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans. \ No newline at end of file