From 4e00748098ce54ba08da43404976755fe5b99320 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 25 Jul 2022 11:33:28 -0700 Subject: [PATCH] Fix a bug in hash linked list (#10401) Summary: In hash linked list, with a bucket of only one record, following sequence can cause users to temporarily miss a record: Thread 1: Fetch the structure bucket x points too, which would be a Node n1 for a key, with next pointer to be null Thread 2: Insert a key to bucket x that is larger than the existing key. This will make n1->next points to a new node n2, and update bucket x to point to n1. Thread 1: see n1->next is not null, so it thinks it is a header of linked list and ignore the key of n1. Fix it by refetch structure that bucket x points to when it sees n1->next is not null. This should work because if n1->next is not null, bucket x should already point to a linked list or skip list header. A related change is to revert th order of testing for linked list and skip list. This is because after refetching the bucket, it might end up with a skip list, rather than linked list. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10401 Test Plan: Run existing tests and make sure at least it doesn't regress. Reviewed By: jay-zhuang Differential Revision: D38064471 fbshipit-source-id: 142bb85e1546c803f47e3357aef3e76debccd8df --- HISTORY.md | 1 + memtable/hash_linklist_rep.cc | 178 +++++++++++++++++++--------------- 2 files changed, 101 insertions(+), 78 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7d4766ce91..4d6b6d46db 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object. * Fix race conditions in `GenericRateLimiter`. * Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow +* Fix data race bug in hash linked list memtable. With this bug, read request might temporarily miss an old record in the memtable in a race condition to the hash bucket. ## 7.5.0 (07/15/2022) ### New Features diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index cf1f1f85fe..f990e89f71 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -209,10 +209,16 @@ class HashLinkListRep : public MemTableRep { bool LinkListContains(Node* head, const Slice& key) const; - SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer) - const; + bool IsEmptyBucket(Pointer& bucket_pointer) const { + return bucket_pointer.load(std::memory_order_acquire) == nullptr; + } - Node* GetLinkListFirstNode(Pointer* first_next_pointer) const; + // Precondition: GetLinkListFirstNode() must have been called first and return + // null so that it must be a skip list bucket + SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const; + + // Returning nullptr indicates it is a skip list bucket. + Node* GetLinkListFirstNode(Pointer& bucket_pointer) const; Slice GetPrefix(const Slice& internal_key) const { return transform_->Transform(ExtractUserKey(internal_key)); @@ -222,11 +228,9 @@ class HashLinkListRep : public MemTableRep { return GetSliceRangedNPHash(slice, bucket_size_); } - Pointer* GetBucket(size_t i) const { - return static_cast(buckets_[i].load(std::memory_order_acquire)); - } + Pointer& GetBucket(size_t i) const { return buckets_[i]; } - Pointer* GetBucket(const Slice& slice) const { + Pointer& GetBucket(const Slice& slice) const { return GetBucket(GetHash(slice)); } @@ -414,30 +418,39 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target void Seek(const Slice& k, const char* memtable_key) override { auto transformed = memtable_rep_.GetPrefix(k); - auto* bucket = memtable_rep_.GetBucket(transformed); + Pointer& bucket = memtable_rep_.GetBucket(transformed); - SkipListBucketHeader* skip_list_header = - memtable_rep_.GetSkipListBucketHeader(bucket); - if (skip_list_header != nullptr) { - // The bucket is organized as a skip list - if (!skip_list_iter_) { - skip_list_iter_.reset( - new MemtableSkipList::Iterator(&skip_list_header->skip_list)); - } else { - skip_list_iter_->SetList(&skip_list_header->skip_list); - } - if (memtable_key != nullptr) { - skip_list_iter_->Seek(memtable_key); - } else { - IterKey encoded_key; - encoded_key.EncodeLengthPrefixedKey(k); - skip_list_iter_->Seek(encoded_key.GetUserKey().data()); - } - } else { - // The bucket is organized as a linked list + if (memtable_rep_.IsEmptyBucket(bucket)) { skip_list_iter_.reset(); - Reset(memtable_rep_.GetLinkListFirstNode(bucket)); - HashLinkListRep::LinkListIterator::Seek(k, memtable_key); + Reset(nullptr); + } else { + Node* first_linked_list_node = + memtable_rep_.GetLinkListFirstNode(bucket); + if (first_linked_list_node != nullptr) { + // The bucket is organized as a linked list + skip_list_iter_.reset(); + Reset(first_linked_list_node); + HashLinkListRep::LinkListIterator::Seek(k, memtable_key); + + } else { + SkipListBucketHeader* skip_list_header = + memtable_rep_.GetSkipListBucketHeader(bucket); + assert(skip_list_header != nullptr); + // The bucket is organized as a skip list + if (!skip_list_iter_) { + skip_list_iter_.reset( + new MemtableSkipList::Iterator(&skip_list_header->skip_list)); + } else { + skip_list_iter_->SetList(&skip_list_header->skip_list); + } + if (memtable_key != nullptr) { + skip_list_iter_->Seek(memtable_key); + } else { + IterKey encoded_key; + encoded_key.EncodeLengthPrefixedKey(k); + skip_list_iter_->Seek(encoded_key.GetUserKey().data()); + } + } } } @@ -528,36 +541,38 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { } SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( - Pointer* first_next_pointer) const { - if (first_next_pointer == nullptr) { - return nullptr; - } - if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { - // Single entry bucket - return nullptr; - } + Pointer& bucket_pointer) const { + Pointer* first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); + assert(first_next_pointer != nullptr); + assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr); + // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); - if (header->IsSkipListBucket()) { - assert(header->GetNumEntries() > threshold_use_skiplist_); - auto* skip_list_bucket_header = - reinterpret_cast(header); - assert(skip_list_bucket_header->Counting_header.next.load( - std::memory_order_relaxed) == header); - return skip_list_bucket_header; - } - assert(header->GetNumEntries() <= threshold_use_skiplist_); - return nullptr; + assert(header->IsSkipListBucket()); + assert(header->GetNumEntries() > threshold_use_skiplist_); + auto* skip_list_bucket_header = + reinterpret_cast(header); + assert(skip_list_bucket_header->Counting_header.next.load( + std::memory_order_relaxed) == header); + return skip_list_bucket_header; } -Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { - if (first_next_pointer == nullptr) { - return nullptr; - } +Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const { + Pointer* first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); + assert(first_next_pointer != nullptr); if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Single entry bucket return reinterpret_cast(first_next_pointer); } + + // It is possible that after we fetch first_next_pointer it is modified + // and the next is not null anymore. In this case, the bucket should have been + // modified to a counting header, so we should reload the first_next_pointer + // to make sure we see the update. + first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); if (!header->IsSkipListBucket()) { @@ -695,17 +710,21 @@ bool HashLinkListRep::Contains(const char* key) const { Slice internal_key = GetLengthPrefixedSlice(key); auto transformed = GetPrefix(internal_key); - auto bucket = GetBucket(transformed); - if (bucket == nullptr) { + Pointer& bucket = GetBucket(transformed); + if (IsEmptyBucket(bucket)) { return false; } + Node* linked_list_node = GetLinkListFirstNode(bucket); + if (linked_list_node != nullptr) { + return LinkListContains(linked_list_node, internal_key); + } + SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); if (skip_list_header != nullptr) { return skip_list_header->skip_list.Contains(key); - } else { - return LinkListContains(GetLinkListFirstNode(bucket), internal_key); } + return false; } size_t HashLinkListRep::ApproximateMemoryUsage() { @@ -716,21 +735,25 @@ size_t HashLinkListRep::ApproximateMemoryUsage() { void HashLinkListRep::Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)) { auto transformed = transform_->Transform(k.user_key()); - auto bucket = GetBucket(transformed); + Pointer& bucket = GetBucket(transformed); - auto* skip_list_header = GetSkipListBucketHeader(bucket); - if (skip_list_header != nullptr) { - // Is a skip list - MemtableSkipList::Iterator iter(&skip_list_header->skip_list); - for (iter.Seek(k.memtable_key().data()); + if (IsEmptyBucket(bucket)) { + return; + } + + auto* link_list_head = GetLinkListFirstNode(bucket); + if (link_list_head != nullptr) { + LinkListIterator iter(this, link_list_head); + for (iter.Seek(k.internal_key(), nullptr); iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { } } else { - auto* link_list_head = GetLinkListFirstNode(bucket); - if (link_list_head != nullptr) { - LinkListIterator iter(this, link_list_head); - for (iter.Seek(k.internal_key(), nullptr); + auto* skip_list_header = GetSkipListBucketHeader(bucket); + if (skip_list_header != nullptr) { + // Is a skip list + MemtableSkipList::Iterator iter(&skip_list_header->skip_list); + for (iter.Seek(k.memtable_key().data()); iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { } @@ -746,25 +769,24 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { for (size_t i = 0; i < bucket_size_; ++i) { int count = 0; - auto* bucket = GetBucket(i); - if (bucket != nullptr) { - auto* skip_list_header = GetSkipListBucketHeader(bucket); - if (skip_list_header != nullptr) { + Pointer& bucket = GetBucket(i); + if (!IsEmptyBucket(bucket)) { + auto* link_list_head = GetLinkListFirstNode(bucket); + if (link_list_head != nullptr) { + LinkListIterator itr(this, link_list_head); + for (itr.SeekToHead(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + count++; + } + } else { + auto* skip_list_header = GetSkipListBucketHeader(bucket); + assert(skip_list_header != nullptr); // Is a skip list MemtableSkipList::Iterator itr(&skip_list_header->skip_list); for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { list->Insert(itr.key()); count++; - } - } else { - auto* link_list_head = GetLinkListFirstNode(bucket); - if (link_list_head != nullptr) { - LinkListIterator itr(this, link_list_head); - for (itr.SeekToHead(); itr.Valid(); itr.Next()) { - list->Insert(itr.key()); - count++; } - } } } if (if_log_bucket_dist_when_flash_) {