Fix a bug in hash linked list (#10401)

Summary:
In the hash linked list memtable, for a bucket containing only one record, the following sequence can cause users to temporarily miss a record:

Thread 1: Fetches the structure that bucket x points to, which is a Node n1 holding a key, with its next pointer being null.
Thread 2: Inserts a key into bucket x that is larger than the existing key. This makes n1->next point to a new node n2, and updates bucket x to point to a counting header rather than directly to n1.
Thread 1: Sees that n1->next is not null, so it treats n1 as the header of a linked list and ignores the key stored in n1.

Fix it by refetching the structure that bucket x points to when the reading thread sees that n1->next is not null. This should work because if n1->next is not null, bucket x should already point to a linked list or skip list header.

A related change is to reverse the order of testing for linked list and skip list. This is because, after refetching the bucket, it might turn out to be a skip list rather than a linked list.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10401

Test Plan: Run existing tests and make sure that, at the very least, nothing regresses.

Reviewed By: jay-zhuang

Differential Revision: D38064471

fbshipit-source-id: 142bb85e1546c803f47e3357aef3e76debccd8df
This commit is contained in:
sdong 2022-07-25 11:33:28 -07:00 committed by Facebook GitHub Bot
parent 6a160e1fec
commit 4e00748098
2 changed files with 101 additions and 78 deletions

View file

@ -13,6 +13,7 @@
* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object. * Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
* Fix race conditions in `GenericRateLimiter`. * Fix race conditions in `GenericRateLimiter`.
* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow * Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
* Fix a data race bug in the hash linked list memtable. With this bug, a read request might temporarily miss an old record in the memtable when it races with a concurrent insert into the same hash bucket.
## 7.5.0 (07/15/2022) ## 7.5.0 (07/15/2022)
### New Features ### New Features

View file

@ -209,10 +209,16 @@ class HashLinkListRep : public MemTableRep {
bool LinkListContains(Node* head, const Slice& key) const; bool LinkListContains(Node* head, const Slice& key) const;
SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer) bool IsEmptyBucket(Pointer& bucket_pointer) const {
const; return bucket_pointer.load(std::memory_order_acquire) == nullptr;
}
Node* GetLinkListFirstNode(Pointer* first_next_pointer) const; // Precondition: GetLinkListFirstNode() must have been called first and return
// null so that it must be a skip list bucket
SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
// Returning nullptr indicates it is a skip list bucket.
Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;
Slice GetPrefix(const Slice& internal_key) const { Slice GetPrefix(const Slice& internal_key) const {
return transform_->Transform(ExtractUserKey(internal_key)); return transform_->Transform(ExtractUserKey(internal_key));
@ -222,11 +228,9 @@ class HashLinkListRep : public MemTableRep {
return GetSliceRangedNPHash(slice, bucket_size_); return GetSliceRangedNPHash(slice, bucket_size_);
} }
Pointer* GetBucket(size_t i) const { Pointer& GetBucket(size_t i) const { return buckets_[i]; }
return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
}
Pointer* GetBucket(const Slice& slice) const { Pointer& GetBucket(const Slice& slice) const {
return GetBucket(GetHash(slice)); return GetBucket(GetHash(slice));
} }
@ -414,30 +418,39 @@ class HashLinkListRep : public MemTableRep {
// Advance to the first entry with a key >= target // Advance to the first entry with a key >= target
void Seek(const Slice& k, const char* memtable_key) override { void Seek(const Slice& k, const char* memtable_key) override {
auto transformed = memtable_rep_.GetPrefix(k); auto transformed = memtable_rep_.GetPrefix(k);
auto* bucket = memtable_rep_.GetBucket(transformed); Pointer& bucket = memtable_rep_.GetBucket(transformed);
SkipListBucketHeader* skip_list_header = if (memtable_rep_.IsEmptyBucket(bucket)) {
memtable_rep_.GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) {
// The bucket is organized as a skip list
if (!skip_list_iter_) {
skip_list_iter_.reset(
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
} else {
skip_list_iter_->SetList(&skip_list_header->skip_list);
}
if (memtable_key != nullptr) {
skip_list_iter_->Seek(memtable_key);
} else {
IterKey encoded_key;
encoded_key.EncodeLengthPrefixedKey(k);
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
}
} else {
// The bucket is organized as a linked list
skip_list_iter_.reset(); skip_list_iter_.reset();
Reset(memtable_rep_.GetLinkListFirstNode(bucket)); Reset(nullptr);
HashLinkListRep::LinkListIterator::Seek(k, memtable_key); } else {
Node* first_linked_list_node =
memtable_rep_.GetLinkListFirstNode(bucket);
if (first_linked_list_node != nullptr) {
// The bucket is organized as a linked list
skip_list_iter_.reset();
Reset(first_linked_list_node);
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
} else {
SkipListBucketHeader* skip_list_header =
memtable_rep_.GetSkipListBucketHeader(bucket);
assert(skip_list_header != nullptr);
// The bucket is organized as a skip list
if (!skip_list_iter_) {
skip_list_iter_.reset(
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
} else {
skip_list_iter_->SetList(&skip_list_header->skip_list);
}
if (memtable_key != nullptr) {
skip_list_iter_->Seek(memtable_key);
} else {
IterKey encoded_key;
encoded_key.EncodeLengthPrefixedKey(k);
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
}
}
} }
} }
@ -528,36 +541,38 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
} }
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
Pointer* first_next_pointer) const { Pointer& bucket_pointer) const {
if (first_next_pointer == nullptr) { Pointer* first_next_pointer =
return nullptr; static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
} assert(first_next_pointer != nullptr);
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);
// Single entry bucket
return nullptr;
}
// Counting header // Counting header
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
if (header->IsSkipListBucket()) { assert(header->IsSkipListBucket());
assert(header->GetNumEntries() > threshold_use_skiplist_); assert(header->GetNumEntries() > threshold_use_skiplist_);
auto* skip_list_bucket_header = auto* skip_list_bucket_header =
reinterpret_cast<SkipListBucketHeader*>(header); reinterpret_cast<SkipListBucketHeader*>(header);
assert(skip_list_bucket_header->Counting_header.next.load( assert(skip_list_bucket_header->Counting_header.next.load(
std::memory_order_relaxed) == header); std::memory_order_relaxed) == header);
return skip_list_bucket_header; return skip_list_bucket_header;
}
assert(header->GetNumEntries() <= threshold_use_skiplist_);
return nullptr;
} }
Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
if (first_next_pointer == nullptr) { Pointer* first_next_pointer =
return nullptr; static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
} assert(first_next_pointer != nullptr);
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
// Single entry bucket // Single entry bucket
return reinterpret_cast<Node*>(first_next_pointer); return reinterpret_cast<Node*>(first_next_pointer);
} }
// It is possible that after we fetch first_next_pointer it is modified
// and the next is not null anymore. In this case, the bucket should have been
// modified to a counting header, so we should reload the first_next_pointer
// to make sure we see the update.
first_next_pointer =
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
// Counting header // Counting header
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
if (!header->IsSkipListBucket()) { if (!header->IsSkipListBucket()) {
@ -695,17 +710,21 @@ bool HashLinkListRep::Contains(const char* key) const {
Slice internal_key = GetLengthPrefixedSlice(key); Slice internal_key = GetLengthPrefixedSlice(key);
auto transformed = GetPrefix(internal_key); auto transformed = GetPrefix(internal_key);
auto bucket = GetBucket(transformed); Pointer& bucket = GetBucket(transformed);
if (bucket == nullptr) { if (IsEmptyBucket(bucket)) {
return false; return false;
} }
Node* linked_list_node = GetLinkListFirstNode(bucket);
if (linked_list_node != nullptr) {
return LinkListContains(linked_list_node, internal_key);
}
SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
if (skip_list_header != nullptr) { if (skip_list_header != nullptr) {
return skip_list_header->skip_list.Contains(key); return skip_list_header->skip_list.Contains(key);
} else {
return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
} }
return false;
} }
size_t HashLinkListRep::ApproximateMemoryUsage() { size_t HashLinkListRep::ApproximateMemoryUsage() {
@ -716,21 +735,25 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
void HashLinkListRep::Get(const LookupKey& k, void* callback_args, void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) { bool (*callback_func)(void* arg, const char* entry)) {
auto transformed = transform_->Transform(k.user_key()); auto transformed = transform_->Transform(k.user_key());
auto bucket = GetBucket(transformed); Pointer& bucket = GetBucket(transformed);
auto* skip_list_header = GetSkipListBucketHeader(bucket); if (IsEmptyBucket(bucket)) {
if (skip_list_header != nullptr) { return;
// Is a skip list }
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
for (iter.Seek(k.memtable_key().data()); auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator iter(this, link_list_head);
for (iter.Seek(k.internal_key(), nullptr);
iter.Valid() && callback_func(callback_args, iter.key()); iter.Valid() && callback_func(callback_args, iter.key());
iter.Next()) { iter.Next()) {
} }
} else { } else {
auto* link_list_head = GetLinkListFirstNode(bucket); auto* skip_list_header = GetSkipListBucketHeader(bucket);
if (link_list_head != nullptr) { if (skip_list_header != nullptr) {
LinkListIterator iter(this, link_list_head); // Is a skip list
for (iter.Seek(k.internal_key(), nullptr); MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
for (iter.Seek(k.memtable_key().data());
iter.Valid() && callback_func(callback_args, iter.key()); iter.Valid() && callback_func(callback_args, iter.key());
iter.Next()) { iter.Next()) {
} }
@ -746,25 +769,24 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
for (size_t i = 0; i < bucket_size_; ++i) { for (size_t i = 0; i < bucket_size_; ++i) {
int count = 0; int count = 0;
auto* bucket = GetBucket(i); Pointer& bucket = GetBucket(i);
if (bucket != nullptr) { if (!IsEmptyBucket(bucket)) {
auto* skip_list_header = GetSkipListBucketHeader(bucket); auto* link_list_head = GetLinkListFirstNode(bucket);
if (skip_list_header != nullptr) { if (link_list_head != nullptr) {
LinkListIterator itr(this, link_list_head);
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
count++;
}
} else {
auto* skip_list_header = GetSkipListBucketHeader(bucket);
assert(skip_list_header != nullptr);
// Is a skip list // Is a skip list
MemtableSkipList::Iterator itr(&skip_list_header->skip_list); MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
list->Insert(itr.key()); list->Insert(itr.key());
count++; count++;
}
} else {
auto* link_list_head = GetLinkListFirstNode(bucket);
if (link_list_head != nullptr) {
LinkListIterator itr(this, link_list_head);
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
count++;
} }
}
} }
} }
if (if_log_bucket_dist_when_flash_) { if (if_log_bucket_dist_when_flash_) {