mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 11:43:49 +00:00
1c7652fcef
Summary: Introduce the class WBWIMemTable, which implements the ReadOnlyMemTable interface with data stored in a WriteBatchWithIndex object. This PR implements the main read path: Get, MultiGet and Iterator. It only supports Put, Delete and SingleDelete operations for now. All the keys in the WBWIMemTable will be assigned a global sequence number through WBWIMemTable::SetGlobalSequenceNumber(). Planned follow-up PRs: (1) Create a WBWIMemTable with a transaction's WBWI and ingest it into a DB during Transaction::Commit(). (2) Support Merge; this will be more complicated since there can be multiple updates with the same user key for Merge. (3) Support other operations like WideColumn and other ReadOnlyMemTable methods. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13123 Test Plan: A mini-stress test for the read path is added as a new unit test. Reviewed By: jowlyzhang Differential Revision: D65633419 Pulled By: cbi42 fbshipit-source-id: 0684fe47260b41f51ca39c300eb72ca5bc9c5a3b
148 lines
5.8 KiB
C++
148 lines
5.8 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "memtable/wbwi_memtable.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
const std::unordered_map<WriteType, ValueType>
|
|
WBWIMemTableIterator::WriteTypeToValueTypeMap = {
|
|
{kPutRecord, kTypeValue},
|
|
{kMergeRecord, kTypeMerge},
|
|
{kDeleteRecord, kTypeDeletion},
|
|
{kSingleDeleteRecord, kTypeSingleDeletion},
|
|
{kDeleteRangeRecord, kTypeRangeDeletion},
|
|
{kPutEntityRecord, kTypeWideColumnEntity},
|
|
// Only the above record types are added to WBWI.
|
|
// kLogDataRecord, kXIDRecord, kUnknownRecord
|
|
};
|
|
|
|
bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
|
|
PinnableWideColumns* columns, std::string* timestamp,
|
|
Status* s, MergeContext* merge_context,
|
|
SequenceNumber* max_covering_tombstone_seq,
|
|
SequenceNumber* out_seq, const ReadOptions&,
|
|
bool immutable_memtable, ReadCallback* callback,
|
|
bool* is_blob_index, bool do_merge) {
|
|
(void)immutable_memtable;
|
|
(void)timestamp;
|
|
(void)columns;
|
|
assert(immutable_memtable);
|
|
assert(!timestamp); // TODO: support UDT
|
|
assert(!columns); // TODO: support WideColumn
|
|
assert(global_seqno_ != kMaxSequenceNumber);
|
|
// WBWI does not support DeleteRange yet.
|
|
assert(!wbwi_->GetWriteBatch()->HasDeleteRange());
|
|
|
|
[[maybe_unused]] SequenceNumber read_seq =
|
|
GetInternalKeySeqno(key.internal_key());
|
|
std::unique_ptr<InternalIterator> iter{NewIterator()};
|
|
iter->Seek(key.internal_key());
|
|
const Slice lookup_user_key = key.user_key();
|
|
|
|
while (iter->Valid() && comparator_->EqualWithoutTimestamp(
|
|
ExtractUserKey(iter->key()), lookup_user_key)) {
|
|
uint64_t tag = ExtractInternalKeyFooter(iter->key());
|
|
ValueType type;
|
|
SequenceNumber seq;
|
|
UnPackSequenceAndType(tag, &seq, &type);
|
|
// Unsupported operations.
|
|
assert(type != kTypeBlobIndex);
|
|
assert(type != kTypeWideColumnEntity);
|
|
assert(type != kTypeValuePreferredSeqno);
|
|
assert(type != kTypeDeletionWithTimestamp);
|
|
assert(type != kTypeMerge);
|
|
if (!callback || callback->IsVisible(seq)) {
|
|
if (*out_seq == kMaxSequenceNumber) {
|
|
*out_seq = std::max(seq, *max_covering_tombstone_seq);
|
|
}
|
|
if (*max_covering_tombstone_seq > seq) {
|
|
type = kTypeRangeDeletion;
|
|
}
|
|
switch (type) {
|
|
case kTypeValue: {
|
|
HandleTypeValue(lookup_user_key, iter->value(), iter->IsValuePinned(),
|
|
do_merge, s->IsMergeInProgress(), merge_context,
|
|
moptions_.merge_operator, clock_,
|
|
moptions_.statistics, moptions_.info_log, s, value,
|
|
columns, is_blob_index);
|
|
assert(seq <= read_seq);
|
|
return /*found_final_value=*/true;
|
|
}
|
|
case kTypeDeletion:
|
|
case kTypeSingleDeletion:
|
|
case kTypeRangeDeletion: {
|
|
HandleTypeDeletion(lookup_user_key, s->IsMergeInProgress(),
|
|
merge_context, moptions_.merge_operator, clock_,
|
|
moptions_.statistics, moptions_.info_log, s, value,
|
|
columns);
|
|
assert(seq <= read_seq);
|
|
return /*found_final_value=*/true;
|
|
}
|
|
default: {
|
|
std::string msg("Unrecognized or unsupported value type: " +
|
|
std::to_string(static_cast<int>(type)) + ". ");
|
|
msg.append("User key: " +
|
|
ExtractUserKey(iter->key()).ToString(/*hex=*/true) + ". ");
|
|
msg.append("seq: " + std::to_string(seq) + ".");
|
|
*s = Status::Corruption(msg.c_str());
|
|
return /*found_final_value=*/true;
|
|
}
|
|
}
|
|
}
|
|
// Current key not visible or we read a merge key
|
|
assert(s->IsMergeInProgress() || (callback && !callback->IsVisible(seq)));
|
|
iter->Next();
|
|
}
|
|
if (!iter->status().ok() &&
|
|
(s->ok() || s->IsMergeInProgress() || s->IsNotFound())) {
|
|
*s = iter->status();
|
|
// stop further look up
|
|
return true;
|
|
}
|
|
return /*found_final_value=*/false;
|
|
}
|
|
|
|
void WBWIMemTable::MultiGet(const ReadOptions& read_options,
|
|
MultiGetRange* range, ReadCallback* callback,
|
|
bool immutable_memtable) {
|
|
(void)immutable_memtable;
|
|
// Should only be used as immutable memtable.
|
|
assert(immutable_memtable);
|
|
// TODO: reuse the InternalIterator created in Get().
|
|
for (auto iter = range->begin(); iter != range->end(); ++iter) {
|
|
SequenceNumber dummy_seq;
|
|
bool found_final_value =
|
|
Get(*iter->lkey, iter->value ? iter->value->GetSelf() : nullptr,
|
|
iter->columns, iter->timestamp, iter->s, &(iter->merge_context),
|
|
&(iter->max_covering_tombstone_seq), &dummy_seq, read_options, true,
|
|
callback, nullptr, true);
|
|
if (found_final_value) {
|
|
if (iter->s->ok() || iter->s->IsNotFound()) {
|
|
if (iter->value) {
|
|
iter->value->PinSelf();
|
|
range->AddValueSize(iter->value->size());
|
|
} else {
|
|
assert(iter->columns);
|
|
range->AddValueSize(iter->columns->serialized_size());
|
|
}
|
|
}
|
|
range->MarkKeyDone(iter);
|
|
if (range->GetValueSize() > read_options.value_size_soft_limit) {
|
|
// Set all remaining keys in range to Abort
|
|
for (auto range_iter = range->begin(); range_iter != range->end();
|
|
++range_iter) {
|
|
range->MarkKeyDone(range_iter);
|
|
*(range_iter->s) = Status::Aborted();
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|