mirror of https://github.com/facebook/rocksdb.git
Fix a bug in hash linked list (#10401)
Summary: In hash linked list, with a bucket of only one record, following sequence can cause users to temporarily miss a record: Thread 1: Fetch the structure bucket x points too, which would be a Node n1 for a key, with next pointer to be null Thread 2: Insert a key to bucket x that is larger than the existing key. This will make n1->next points to a new node n2, and update bucket x to point to n1. Thread 1: see n1->next is not null, so it thinks it is a header of linked list and ignore the key of n1. Fix it by refetch structure that bucket x points to when it sees n1->next is not null. This should work because if n1->next is not null, bucket x should already point to a linked list or skip list header. A related change is to revert th order of testing for linked list and skip list. This is because after refetching the bucket, it might end up with a skip list, rather than linked list. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10401 Test Plan: Run existing tests and make sure at least it doesn't regress. Reviewed By: jay-zhuang Differential Revision: D38064471 fbshipit-source-id: 142bb85e1546c803f47e3357aef3e76debccd8df
This commit is contained in:
parent
6a160e1fec
commit
4e00748098
|
@ -13,6 +13,7 @@
|
|||
* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
|
||||
* Fix race conditions in `GenericRateLimiter`.
|
||||
* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
|
||||
* Fix data race bug in hash linked list memtable. With this bug, read request might temporarily miss an old record in the memtable in a race condition to the hash bucket.
|
||||
|
||||
## 7.5.0 (07/15/2022)
|
||||
### New Features
|
||||
|
|
|
@ -209,10 +209,16 @@ class HashLinkListRep : public MemTableRep {
|
|||
|
||||
bool LinkListContains(Node* head, const Slice& key) const;
|
||||
|
||||
SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
|
||||
const;
|
||||
bool IsEmptyBucket(Pointer& bucket_pointer) const {
|
||||
return bucket_pointer.load(std::memory_order_acquire) == nullptr;
|
||||
}
|
||||
|
||||
Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;
|
||||
// Precondition: GetLinkListFirstNode() must have been called first and return
|
||||
// null so that it must be a skip list bucket
|
||||
SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
|
||||
|
||||
// Returning nullptr indicates it is a skip list bucket.
|
||||
Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;
|
||||
|
||||
Slice GetPrefix(const Slice& internal_key) const {
|
||||
return transform_->Transform(ExtractUserKey(internal_key));
|
||||
|
@ -222,11 +228,9 @@ class HashLinkListRep : public MemTableRep {
|
|||
return GetSliceRangedNPHash(slice, bucket_size_);
|
||||
}
|
||||
|
||||
Pointer* GetBucket(size_t i) const {
|
||||
return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
|
||||
}
|
||||
Pointer& GetBucket(size_t i) const { return buckets_[i]; }
|
||||
|
||||
Pointer* GetBucket(const Slice& slice) const {
|
||||
Pointer& GetBucket(const Slice& slice) const {
|
||||
return GetBucket(GetHash(slice));
|
||||
}
|
||||
|
||||
|
@ -414,30 +418,39 @@ class HashLinkListRep : public MemTableRep {
|
|||
// Advance to the first entry with a key >= target
|
||||
void Seek(const Slice& k, const char* memtable_key) override {
|
||||
auto transformed = memtable_rep_.GetPrefix(k);
|
||||
auto* bucket = memtable_rep_.GetBucket(transformed);
|
||||
Pointer& bucket = memtable_rep_.GetBucket(transformed);
|
||||
|
||||
SkipListBucketHeader* skip_list_header =
|
||||
memtable_rep_.GetSkipListBucketHeader(bucket);
|
||||
if (skip_list_header != nullptr) {
|
||||
// The bucket is organized as a skip list
|
||||
if (!skip_list_iter_) {
|
||||
skip_list_iter_.reset(
|
||||
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
|
||||
} else {
|
||||
skip_list_iter_->SetList(&skip_list_header->skip_list);
|
||||
}
|
||||
if (memtable_key != nullptr) {
|
||||
skip_list_iter_->Seek(memtable_key);
|
||||
} else {
|
||||
IterKey encoded_key;
|
||||
encoded_key.EncodeLengthPrefixedKey(k);
|
||||
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
|
||||
}
|
||||
} else {
|
||||
// The bucket is organized as a linked list
|
||||
if (memtable_rep_.IsEmptyBucket(bucket)) {
|
||||
skip_list_iter_.reset();
|
||||
Reset(memtable_rep_.GetLinkListFirstNode(bucket));
|
||||
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
|
||||
Reset(nullptr);
|
||||
} else {
|
||||
Node* first_linked_list_node =
|
||||
memtable_rep_.GetLinkListFirstNode(bucket);
|
||||
if (first_linked_list_node != nullptr) {
|
||||
// The bucket is organized as a linked list
|
||||
skip_list_iter_.reset();
|
||||
Reset(first_linked_list_node);
|
||||
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
|
||||
|
||||
} else {
|
||||
SkipListBucketHeader* skip_list_header =
|
||||
memtable_rep_.GetSkipListBucketHeader(bucket);
|
||||
assert(skip_list_header != nullptr);
|
||||
// The bucket is organized as a skip list
|
||||
if (!skip_list_iter_) {
|
||||
skip_list_iter_.reset(
|
||||
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
|
||||
} else {
|
||||
skip_list_iter_->SetList(&skip_list_header->skip_list);
|
||||
}
|
||||
if (memtable_key != nullptr) {
|
||||
skip_list_iter_->Seek(memtable_key);
|
||||
} else {
|
||||
IterKey encoded_key;
|
||||
encoded_key.EncodeLengthPrefixedKey(k);
|
||||
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -528,36 +541,38 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
|
|||
}
|
||||
|
||||
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
|
||||
Pointer* first_next_pointer) const {
|
||||
if (first_next_pointer == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
|
||||
// Single entry bucket
|
||||
return nullptr;
|
||||
}
|
||||
Pointer& bucket_pointer) const {
|
||||
Pointer* first_next_pointer =
|
||||
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||
assert(first_next_pointer != nullptr);
|
||||
assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);
|
||||
|
||||
// Counting header
|
||||
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
||||
if (header->IsSkipListBucket()) {
|
||||
assert(header->GetNumEntries() > threshold_use_skiplist_);
|
||||
auto* skip_list_bucket_header =
|
||||
reinterpret_cast<SkipListBucketHeader*>(header);
|
||||
assert(skip_list_bucket_header->Counting_header.next.load(
|
||||
std::memory_order_relaxed) == header);
|
||||
return skip_list_bucket_header;
|
||||
}
|
||||
assert(header->GetNumEntries() <= threshold_use_skiplist_);
|
||||
return nullptr;
|
||||
assert(header->IsSkipListBucket());
|
||||
assert(header->GetNumEntries() > threshold_use_skiplist_);
|
||||
auto* skip_list_bucket_header =
|
||||
reinterpret_cast<SkipListBucketHeader*>(header);
|
||||
assert(skip_list_bucket_header->Counting_header.next.load(
|
||||
std::memory_order_relaxed) == header);
|
||||
return skip_list_bucket_header;
|
||||
}
|
||||
|
||||
Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
|
||||
if (first_next_pointer == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
|
||||
Pointer* first_next_pointer =
|
||||
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||
assert(first_next_pointer != nullptr);
|
||||
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
|
||||
// Single entry bucket
|
||||
return reinterpret_cast<Node*>(first_next_pointer);
|
||||
}
|
||||
|
||||
// It is possible that after we fetch first_next_pointer it is modified
|
||||
// and the next is not null anymore. In this case, the bucket should have been
|
||||
// modified to a counting header, so we should reload the first_next_pointer
|
||||
// to make sure we see the update.
|
||||
first_next_pointer =
|
||||
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||
// Counting header
|
||||
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
||||
if (!header->IsSkipListBucket()) {
|
||||
|
@ -695,17 +710,21 @@ bool HashLinkListRep::Contains(const char* key) const {
|
|||
Slice internal_key = GetLengthPrefixedSlice(key);
|
||||
|
||||
auto transformed = GetPrefix(internal_key);
|
||||
auto bucket = GetBucket(transformed);
|
||||
if (bucket == nullptr) {
|
||||
Pointer& bucket = GetBucket(transformed);
|
||||
if (IsEmptyBucket(bucket)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Node* linked_list_node = GetLinkListFirstNode(bucket);
|
||||
if (linked_list_node != nullptr) {
|
||||
return LinkListContains(linked_list_node, internal_key);
|
||||
}
|
||||
|
||||
SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||
if (skip_list_header != nullptr) {
|
||||
return skip_list_header->skip_list.Contains(key);
|
||||
} else {
|
||||
return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t HashLinkListRep::ApproximateMemoryUsage() {
|
||||
|
@ -716,21 +735,25 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
|
|||
void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
|
||||
bool (*callback_func)(void* arg, const char* entry)) {
|
||||
auto transformed = transform_->Transform(k.user_key());
|
||||
auto bucket = GetBucket(transformed);
|
||||
Pointer& bucket = GetBucket(transformed);
|
||||
|
||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||
if (skip_list_header != nullptr) {
|
||||
// Is a skip list
|
||||
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
|
||||
for (iter.Seek(k.memtable_key().data());
|
||||
if (IsEmptyBucket(bucket)) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||
if (link_list_head != nullptr) {
|
||||
LinkListIterator iter(this, link_list_head);
|
||||
for (iter.Seek(k.internal_key(), nullptr);
|
||||
iter.Valid() && callback_func(callback_args, iter.key());
|
||||
iter.Next()) {
|
||||
}
|
||||
} else {
|
||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||
if (link_list_head != nullptr) {
|
||||
LinkListIterator iter(this, link_list_head);
|
||||
for (iter.Seek(k.internal_key(), nullptr);
|
||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||
if (skip_list_header != nullptr) {
|
||||
// Is a skip list
|
||||
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
|
||||
for (iter.Seek(k.memtable_key().data());
|
||||
iter.Valid() && callback_func(callback_args, iter.key());
|
||||
iter.Next()) {
|
||||
}
|
||||
|
@ -746,25 +769,24 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
|
|||
|
||||
for (size_t i = 0; i < bucket_size_; ++i) {
|
||||
int count = 0;
|
||||
auto* bucket = GetBucket(i);
|
||||
if (bucket != nullptr) {
|
||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||
if (skip_list_header != nullptr) {
|
||||
Pointer& bucket = GetBucket(i);
|
||||
if (!IsEmptyBucket(bucket)) {
|
||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||
if (link_list_head != nullptr) {
|
||||
LinkListIterator itr(this, link_list_head);
|
||||
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
|
||||
list->Insert(itr.key());
|
||||
count++;
|
||||
}
|
||||
} else {
|
||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||
assert(skip_list_header != nullptr);
|
||||
// Is a skip list
|
||||
MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
|
||||
for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
|
||||
list->Insert(itr.key());
|
||||
count++;
|
||||
}
|
||||
} else {
|
||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||
if (link_list_head != nullptr) {
|
||||
LinkListIterator itr(this, link_list_head);
|
||||
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
|
||||
list->Insert(itr.key());
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (if_log_bucket_dist_when_flash_) {
|
||||
|
|
Loading…
Reference in New Issue