mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-28 05:43:50 +00:00
Fix a bug in hash linked list (#10401)
Summary: In the hash linked list memtable, when a bucket contains only one record, the following sequence can cause readers to temporarily miss a record: Thread 1: Fetches the structure that bucket x points to, which is a Node n1 for a key, with a null next pointer. Thread 2: Inserts a key into bucket x that is larger than the existing key. This makes n1->next point to a new node n2, and updates bucket x to point to n1. Thread 1: Sees that n1->next is not null, so it assumes n1 is the header of a linked list and ignores the key of n1. Fix it by refetching the structure that bucket x points to when n1->next is seen to be not null. This should work because if n1->next is not null, bucket x should already point to a linked list or skip list header. A related change is to reverse the order of testing for linked list and skip list. This is because, after refetching the bucket, it might end up being a skip list rather than a linked list. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10401 Test Plan: Run existing tests and make sure that, at the least, nothing regresses. Reviewed By: jay-zhuang Differential Revision: D38064471 fbshipit-source-id: 142bb85e1546c803f47e3357aef3e76debccd8df
This commit is contained in:
parent
6a160e1fec
commit
4e00748098
|
@ -13,6 +13,7 @@
|
||||||
* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
|
* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
|
||||||
* Fix race conditions in `GenericRateLimiter`.
|
* Fix race conditions in `GenericRateLimiter`.
|
||||||
* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
|
* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
|
||||||
|
* Fix a data race bug in the hash linked list memtable. With this bug, a read request might temporarily miss an old record in the memtable due to a race condition on the hash bucket.
|
||||||
|
|
||||||
## 7.5.0 (07/15/2022)
|
## 7.5.0 (07/15/2022)
|
||||||
### New Features
|
### New Features
|
||||||
|
|
|
@ -209,10 +209,16 @@ class HashLinkListRep : public MemTableRep {
|
||||||
|
|
||||||
bool LinkListContains(Node* head, const Slice& key) const;
|
bool LinkListContains(Node* head, const Slice& key) const;
|
||||||
|
|
||||||
SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
|
bool IsEmptyBucket(Pointer& bucket_pointer) const {
|
||||||
const;
|
return bucket_pointer.load(std::memory_order_acquire) == nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;
|
// Precondition: GetLinkListFirstNode() must have been called first and returned
|
||||||
|
// null so that it must be a skip list bucket
|
||||||
|
SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
|
||||||
|
|
||||||
|
// Returning nullptr indicates it is a skip list bucket.
|
||||||
|
Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;
|
||||||
|
|
||||||
Slice GetPrefix(const Slice& internal_key) const {
|
Slice GetPrefix(const Slice& internal_key) const {
|
||||||
return transform_->Transform(ExtractUserKey(internal_key));
|
return transform_->Transform(ExtractUserKey(internal_key));
|
||||||
|
@ -222,11 +228,9 @@ class HashLinkListRep : public MemTableRep {
|
||||||
return GetSliceRangedNPHash(slice, bucket_size_);
|
return GetSliceRangedNPHash(slice, bucket_size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
Pointer* GetBucket(size_t i) const {
|
Pointer& GetBucket(size_t i) const { return buckets_[i]; }
|
||||||
return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
|
|
||||||
}
|
|
||||||
|
|
||||||
Pointer* GetBucket(const Slice& slice) const {
|
Pointer& GetBucket(const Slice& slice) const {
|
||||||
return GetBucket(GetHash(slice));
|
return GetBucket(GetHash(slice));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -414,30 +418,39 @@ class HashLinkListRep : public MemTableRep {
|
||||||
// Advance to the first entry with a key >= target
|
// Advance to the first entry with a key >= target
|
||||||
void Seek(const Slice& k, const char* memtable_key) override {
|
void Seek(const Slice& k, const char* memtable_key) override {
|
||||||
auto transformed = memtable_rep_.GetPrefix(k);
|
auto transformed = memtable_rep_.GetPrefix(k);
|
||||||
auto* bucket = memtable_rep_.GetBucket(transformed);
|
Pointer& bucket = memtable_rep_.GetBucket(transformed);
|
||||||
|
|
||||||
SkipListBucketHeader* skip_list_header =
|
if (memtable_rep_.IsEmptyBucket(bucket)) {
|
||||||
memtable_rep_.GetSkipListBucketHeader(bucket);
|
|
||||||
if (skip_list_header != nullptr) {
|
|
||||||
// The bucket is organized as a skip list
|
|
||||||
if (!skip_list_iter_) {
|
|
||||||
skip_list_iter_.reset(
|
|
||||||
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
|
|
||||||
} else {
|
|
||||||
skip_list_iter_->SetList(&skip_list_header->skip_list);
|
|
||||||
}
|
|
||||||
if (memtable_key != nullptr) {
|
|
||||||
skip_list_iter_->Seek(memtable_key);
|
|
||||||
} else {
|
|
||||||
IterKey encoded_key;
|
|
||||||
encoded_key.EncodeLengthPrefixedKey(k);
|
|
||||||
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// The bucket is organized as a linked list
|
|
||||||
skip_list_iter_.reset();
|
skip_list_iter_.reset();
|
||||||
Reset(memtable_rep_.GetLinkListFirstNode(bucket));
|
Reset(nullptr);
|
||||||
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
|
} else {
|
||||||
|
Node* first_linked_list_node =
|
||||||
|
memtable_rep_.GetLinkListFirstNode(bucket);
|
||||||
|
if (first_linked_list_node != nullptr) {
|
||||||
|
// The bucket is organized as a linked list
|
||||||
|
skip_list_iter_.reset();
|
||||||
|
Reset(first_linked_list_node);
|
||||||
|
HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
SkipListBucketHeader* skip_list_header =
|
||||||
|
memtable_rep_.GetSkipListBucketHeader(bucket);
|
||||||
|
assert(skip_list_header != nullptr);
|
||||||
|
// The bucket is organized as a skip list
|
||||||
|
if (!skip_list_iter_) {
|
||||||
|
skip_list_iter_.reset(
|
||||||
|
new MemtableSkipList::Iterator(&skip_list_header->skip_list));
|
||||||
|
} else {
|
||||||
|
skip_list_iter_->SetList(&skip_list_header->skip_list);
|
||||||
|
}
|
||||||
|
if (memtable_key != nullptr) {
|
||||||
|
skip_list_iter_->Seek(memtable_key);
|
||||||
|
} else {
|
||||||
|
IterKey encoded_key;
|
||||||
|
encoded_key.EncodeLengthPrefixedKey(k);
|
||||||
|
skip_list_iter_->Seek(encoded_key.GetUserKey().data());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -528,36 +541,38 @@ KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
|
||||||
}
|
}
|
||||||
|
|
||||||
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
|
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
|
||||||
Pointer* first_next_pointer) const {
|
Pointer& bucket_pointer) const {
|
||||||
if (first_next_pointer == nullptr) {
|
Pointer* first_next_pointer =
|
||||||
return nullptr;
|
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||||
}
|
assert(first_next_pointer != nullptr);
|
||||||
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
|
assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);
|
||||||
// Single entry bucket
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
// Counting header
|
// Counting header
|
||||||
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
||||||
if (header->IsSkipListBucket()) {
|
assert(header->IsSkipListBucket());
|
||||||
assert(header->GetNumEntries() > threshold_use_skiplist_);
|
assert(header->GetNumEntries() > threshold_use_skiplist_);
|
||||||
auto* skip_list_bucket_header =
|
auto* skip_list_bucket_header =
|
||||||
reinterpret_cast<SkipListBucketHeader*>(header);
|
reinterpret_cast<SkipListBucketHeader*>(header);
|
||||||
assert(skip_list_bucket_header->Counting_header.next.load(
|
assert(skip_list_bucket_header->Counting_header.next.load(
|
||||||
std::memory_order_relaxed) == header);
|
std::memory_order_relaxed) == header);
|
||||||
return skip_list_bucket_header;
|
return skip_list_bucket_header;
|
||||||
}
|
|
||||||
assert(header->GetNumEntries() <= threshold_use_skiplist_);
|
|
||||||
return nullptr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
|
Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
|
||||||
if (first_next_pointer == nullptr) {
|
Pointer* first_next_pointer =
|
||||||
return nullptr;
|
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||||
}
|
assert(first_next_pointer != nullptr);
|
||||||
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
|
if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
|
||||||
// Single entry bucket
|
// Single entry bucket
|
||||||
return reinterpret_cast<Node*>(first_next_pointer);
|
return reinterpret_cast<Node*>(first_next_pointer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// It is possible that after we fetch first_next_pointer it is modified
|
||||||
|
// and the next is not null anymore. In this case, the bucket should have been
|
||||||
|
// modified to a counting header, so we should reload the first_next_pointer
|
||||||
|
// to make sure we see the update.
|
||||||
|
first_next_pointer =
|
||||||
|
static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
|
||||||
// Counting header
|
// Counting header
|
||||||
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
|
||||||
if (!header->IsSkipListBucket()) {
|
if (!header->IsSkipListBucket()) {
|
||||||
|
@ -695,17 +710,21 @@ bool HashLinkListRep::Contains(const char* key) const {
|
||||||
Slice internal_key = GetLengthPrefixedSlice(key);
|
Slice internal_key = GetLengthPrefixedSlice(key);
|
||||||
|
|
||||||
auto transformed = GetPrefix(internal_key);
|
auto transformed = GetPrefix(internal_key);
|
||||||
auto bucket = GetBucket(transformed);
|
Pointer& bucket = GetBucket(transformed);
|
||||||
if (bucket == nullptr) {
|
if (IsEmptyBucket(bucket)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Node* linked_list_node = GetLinkListFirstNode(bucket);
|
||||||
|
if (linked_list_node != nullptr) {
|
||||||
|
return LinkListContains(linked_list_node, internal_key);
|
||||||
|
}
|
||||||
|
|
||||||
SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
|
SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||||
if (skip_list_header != nullptr) {
|
if (skip_list_header != nullptr) {
|
||||||
return skip_list_header->skip_list.Contains(key);
|
return skip_list_header->skip_list.Contains(key);
|
||||||
} else {
|
|
||||||
return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
|
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t HashLinkListRep::ApproximateMemoryUsage() {
|
size_t HashLinkListRep::ApproximateMemoryUsage() {
|
||||||
|
@ -716,21 +735,25 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
|
||||||
void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
|
void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
|
||||||
bool (*callback_func)(void* arg, const char* entry)) {
|
bool (*callback_func)(void* arg, const char* entry)) {
|
||||||
auto transformed = transform_->Transform(k.user_key());
|
auto transformed = transform_->Transform(k.user_key());
|
||||||
auto bucket = GetBucket(transformed);
|
Pointer& bucket = GetBucket(transformed);
|
||||||
|
|
||||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
if (IsEmptyBucket(bucket)) {
|
||||||
if (skip_list_header != nullptr) {
|
return;
|
||||||
// Is a skip list
|
}
|
||||||
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
|
|
||||||
for (iter.Seek(k.memtable_key().data());
|
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||||
|
if (link_list_head != nullptr) {
|
||||||
|
LinkListIterator iter(this, link_list_head);
|
||||||
|
for (iter.Seek(k.internal_key(), nullptr);
|
||||||
iter.Valid() && callback_func(callback_args, iter.key());
|
iter.Valid() && callback_func(callback_args, iter.key());
|
||||||
iter.Next()) {
|
iter.Next()) {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||||
if (link_list_head != nullptr) {
|
if (skip_list_header != nullptr) {
|
||||||
LinkListIterator iter(this, link_list_head);
|
// Is a skip list
|
||||||
for (iter.Seek(k.internal_key(), nullptr);
|
MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
|
||||||
|
for (iter.Seek(k.memtable_key().data());
|
||||||
iter.Valid() && callback_func(callback_args, iter.key());
|
iter.Valid() && callback_func(callback_args, iter.key());
|
||||||
iter.Next()) {
|
iter.Next()) {
|
||||||
}
|
}
|
||||||
|
@ -746,25 +769,24 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
|
||||||
|
|
||||||
for (size_t i = 0; i < bucket_size_; ++i) {
|
for (size_t i = 0; i < bucket_size_; ++i) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
auto* bucket = GetBucket(i);
|
Pointer& bucket = GetBucket(i);
|
||||||
if (bucket != nullptr) {
|
if (!IsEmptyBucket(bucket)) {
|
||||||
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
auto* link_list_head = GetLinkListFirstNode(bucket);
|
||||||
if (skip_list_header != nullptr) {
|
if (link_list_head != nullptr) {
|
||||||
|
LinkListIterator itr(this, link_list_head);
|
||||||
|
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
|
||||||
|
list->Insert(itr.key());
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto* skip_list_header = GetSkipListBucketHeader(bucket);
|
||||||
|
assert(skip_list_header != nullptr);
|
||||||
// Is a skip list
|
// Is a skip list
|
||||||
MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
|
MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
|
||||||
for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
|
for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
|
||||||
list->Insert(itr.key());
|
list->Insert(itr.key());
|
||||||
count++;
|
count++;
|
||||||
}
|
|
||||||
} else {
|
|
||||||
auto* link_list_head = GetLinkListFirstNode(bucket);
|
|
||||||
if (link_list_head != nullptr) {
|
|
||||||
LinkListIterator itr(this, link_list_head);
|
|
||||||
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
|
|
||||||
list->Insert(itr.key());
|
|
||||||
count++;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (if_log_bucket_dist_when_flash_) {
|
if (if_log_bucket_dist_when_flash_) {
|
||||||
|
|
Loading…
Reference in a new issue