rocksdb/memtable/wbwi_memtable.cc
Changyu Bi 1c7652fcef Introduce a WriteBatchWithIndex-based implementation of ReadOnlyMemTable (#13123)
Summary:
Introduce the class WBWIMemTable, which implements the ReadOnlyMemTable interface with data stored in a WriteBatchWithIndex object.

This PR implements the main read path: Get, MultiGet and Iterator. It only supports Put, Delete and SingleDelete operations for now. All the keys in the WBWIMemTable will be assigned a global sequence number through WBWIMemTable::SetGlobalSequenceNumber().

Planned follow up PRs:
- Create WBWIMemTable with a transaction's WBWI and ingest it into a DB during Transaction::Commit()
- Support for Merge. This will be more complicated since we can have multiple updates with the same user key for Merge.
- Support for other operations like WideColumn and other ReadOnlyMemTable methods.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13123

Test Plan: * A mini-stress test for the read path is added as a new unit test

Reviewed By: jowlyzhang

Differential Revision: D65633419

Pulled By: cbi42

fbshipit-source-id: 0684fe47260b41f51ca39c300eb72ca5bc9c5a3b
2024-11-12 09:27:11 -08:00

148 lines
5.8 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "memtable/wbwi_memtable.h"
#include "db/memtable.h"
namespace ROCKSDB_NAMESPACE {
// Lookup table translating a WriteBatchWithIndex record type (WriteType) into
// the corresponding internal-key ValueType. Defined here as the out-of-class
// definition of the static member declared on WBWIMemTableIterator.
const std::unordered_map<WriteType, ValueType>
WBWIMemTableIterator::WriteTypeToValueTypeMap = {
{kPutRecord, kTypeValue},
{kMergeRecord, kTypeMerge},
{kDeleteRecord, kTypeDeletion},
{kSingleDeleteRecord, kTypeSingleDeletion},
{kDeleteRangeRecord, kTypeRangeDeletion},
{kPutEntityRecord, kTypeWideColumnEntity},
// Only the above record types are added to WBWI.
// The remaining WriteType values are intentionally left unmapped:
// kLogDataRecord, kXIDRecord, kUnknownRecord
};
// Point lookup of `key` in this WriteBatchWithIndex-backed memtable.
//
// Seeks an internal iterator to key.internal_key() and walks the entries whose
// user key equals key.user_key() (compared without timestamp). The first entry
// that is visible to `callback` (every entry is visible when `callback` is
// null) decides the outcome:
//   - kTypeValue: processed by HandleTypeValue().
//   - kTypeDeletion / kTypeSingleDeletion, or any entry whose sequence number
//     is smaller than *max_covering_tombstone_seq (such an entry is treated as
//     kTypeRangeDeletion): processed by HandleTypeDeletion().
//   - any other type: *s is set to Status::Corruption describing the entry.
// Entries that are not visible to `callback` are skipped.
//
// Parameters:
//   key        lookup key; its internal key is the seek target.
//   value      forwarded to HandleTypeValue()/HandleTypeDeletion().
//   columns    must be null for now (wide columns are not supported yet).
//   timestamp  must be null for now (user-defined timestamps not supported).
//   s          in/out status; also receives the iterator status on error.
//   merge_context, do_merge, is_blob_index
//              forwarded to the HandleType*() helpers.
//   max_covering_tombstone_seq
//              in: entries with a smaller seqno are treated as deleted.
//   out_seq    out: reset to kMaxSequenceNumber on entry; when a visible entry
//              is found it is set to
//              max(entry seqno, *max_covering_tombstone_seq).
//   immutable_memtable
//              must be true; only used in the assertion.
//   callback   optional visibility filter on entry sequence numbers.
//
// Returns true when a visible entry was processed (including the Corruption
// case) or when the iterator reported an error, i.e. the caller should stop
// looking further. Returns false when no visible entry for the key exists.
bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
                       PinnableWideColumns* columns, std::string* timestamp,
                       Status* s, MergeContext* merge_context,
                       SequenceNumber* max_covering_tombstone_seq,
                       SequenceNumber* out_seq, const ReadOptions&,
                       bool immutable_memtable, ReadCallback* callback,
                       bool* is_blob_index, bool do_merge) {
  (void)immutable_memtable;
  (void)timestamp;
  (void)columns;
  assert(immutable_memtable);
  assert(!timestamp);  // TODO: support UDT
  assert(!columns);    // TODO: support WideColumn
  assert(global_seqno_ != kMaxSequenceNumber);
  // WBWI does not support DeleteRange yet.
  assert(!wbwi_->GetWriteBatch()->HasDeleteRange());

  // Do not rely on the caller having initialized *out_seq: the loop below
  // reads it before this function would otherwise write it. Starting from
  // kMaxSequenceNumber also gives a defined result when nothing is found.
  *out_seq = kMaxSequenceNumber;

  [[maybe_unused]] SequenceNumber read_seq =
      GetInternalKeySeqno(key.internal_key());
  std::unique_ptr<InternalIterator> iter{NewIterator()};
  iter->Seek(key.internal_key());
  const Slice lookup_user_key = key.user_key();

  while (iter->Valid() && comparator_->EqualWithoutTimestamp(
                              ExtractUserKey(iter->key()), lookup_user_key)) {
    // Decode the sequence number and value type from the internal key footer.
    uint64_t tag = ExtractInternalKeyFooter(iter->key());
    ValueType type;
    SequenceNumber seq;
    UnPackSequenceAndType(tag, &seq, &type);
    // Unsupported operations.
    assert(type != kTypeBlobIndex);
    assert(type != kTypeWideColumnEntity);
    assert(type != kTypeValuePreferredSeqno);
    assert(type != kTypeDeletionWithTimestamp);
    assert(type != kTypeMerge);

    if (!callback || callback->IsVisible(seq)) {
      // Record the sequence number of the first visible entry only.
      if (*out_seq == kMaxSequenceNumber) {
        *out_seq = std::max(seq, *max_covering_tombstone_seq);
      }
      // A newer covering tombstone hides this entry: handle it as a deletion.
      if (*max_covering_tombstone_seq > seq) {
        type = kTypeRangeDeletion;
      }
      switch (type) {
        case kTypeValue: {
          HandleTypeValue(lookup_user_key, iter->value(), iter->IsValuePinned(),
                          do_merge, s->IsMergeInProgress(), merge_context,
                          moptions_.merge_operator, clock_,
                          moptions_.statistics, moptions_.info_log, s, value,
                          columns, is_blob_index);
          assert(seq <= read_seq);
          return /*found_final_value=*/true;
        }
        case kTypeDeletion:
        case kTypeSingleDeletion:
        case kTypeRangeDeletion: {
          HandleTypeDeletion(lookup_user_key, s->IsMergeInProgress(),
                             merge_context, moptions_.merge_operator, clock_,
                             moptions_.statistics, moptions_.info_log, s, value,
                             columns);
          assert(seq <= read_seq);
          return /*found_final_value=*/true;
        }
        default: {
          std::string msg("Unrecognized or unsupported value type: " +
                          std::to_string(static_cast<int>(type)) + ". ");
          msg.append("User key: " +
                     ExtractUserKey(iter->key()).ToString(/*hex=*/true) + ". ");
          msg.append("seq: " + std::to_string(seq) + ".");
          *s = Status::Corruption(msg.c_str());
          return /*found_final_value=*/true;
        }
      }
    }
    // Current key not visible or we read a merge key
    assert(s->IsMergeInProgress() || (callback && !callback->IsVisible(seq)));
    iter->Next();
  }

  // Surface an iterator error unless *s already carries a different error.
  if (!iter->status().ok() &&
      (s->ok() || s->IsMergeInProgress() || s->IsNotFound())) {
    *s = iter->status();
    // stop further look up
    return true;
  }
  return /*found_final_value=*/false;
}
// Batched point lookup: runs Get() once for every key in `range`.
//
// For each key, Get() is called with do_merge == true, a null is_blob_index
// and the key's own status, merge context and covering-tombstone seqno. When
// Get() reports a final result:
//   - if the key's status is OK or NotFound, the value is pinned and its size
//     (or the serialized size of the columns when there is no value slot) is
//     added to the range's accumulated value size;
//   - the key is marked done in `range`;
//   - if the accumulated value size exceeds
//     read_options.value_size_soft_limit, every key still yielded by `range`
//     is marked done with Status::Aborted and the loop stops.
//
// `immutable_memtable` must be true; it is only used in the assertion.
void WBWIMemTable::MultiGet(const ReadOptions& read_options,
                            MultiGetRange* range, ReadCallback* callback,
                            bool immutable_memtable) {
  (void)immutable_memtable;
  // Should only be used as immutable memtable.
  assert(immutable_memtable);
  // TODO: reuse the InternalIterator created in Get().
  for (auto iter = range->begin(); iter != range->end(); ++iter) {
    // Give the scratch sequence number a defined value so it is never read
    // while indeterminate; its final value is not used here.
    SequenceNumber dummy_seq = kMaxSequenceNumber;
    bool found_final_value =
        Get(*iter->lkey, iter->value ? iter->value->GetSelf() : nullptr,
            iter->columns, iter->timestamp, iter->s, &(iter->merge_context),
            &(iter->max_covering_tombstone_seq), &dummy_seq, read_options, true,
            callback, nullptr, true);
    if (found_final_value) {
      if (iter->s->ok() || iter->s->IsNotFound()) {
        if (iter->value) {
          iter->value->PinSelf();
          range->AddValueSize(iter->value->size());
        } else {
          assert(iter->columns);
          range->AddValueSize(iter->columns->serialized_size());
        }
      }
      range->MarkKeyDone(iter);
      if (range->GetValueSize() > read_options.value_size_soft_limit) {
        // Set all remaining keys in range to Abort
        for (auto range_iter = range->begin(); range_iter != range->end();
             ++range_iter) {
          range->MarkKeyDone(range_iter);
          *(range_iter->s) = Status::Aborted();
        }
        break;
      }
    }
  }
}
} // namespace ROCKSDB_NAMESPACE