mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 16:30:56 +00:00
06e593376c
Summary: ## Context/Summary Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity. For that, this PR does the following: - Tag different write IOs by passing down and converting WriteOptions to IOOptions - Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS Some related code refactory to make implementation cleaner: - Blob stats - Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info. - Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write. - Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority - Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification - Build table - TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables - Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder. This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more - Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority ## Test ### db bench Flush ``` ./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100 rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377 rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377 rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0 rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0 ``` compaction, db oopen ``` Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1 rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279 rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0 rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213 rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66 ``` blob stats - just to make sure they aren't broken by this PR ``` Integrated Blob DB Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1 pre-PR: rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600 rocksdb.blobdb.blob.file.synced COUNT : 1 rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 post-PR: rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614 - COUNT is higher and values are smaller as it includes header and footer write - COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164 rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same) rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same) ``` ``` Stacked Blob DB Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench pre-PR: rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876 rocksdb.blobdb.blob.file.synced COUNT : 8 rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 post-PR: rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924 - COUNT is higher and values are smaller as it includes header and footer write - COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164 rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same) rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same) ``` ### Rehearsal CI stress test Trigger 3 full runs of all our CI stress tests ### Performance Flush ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000 -- default: 1 thread is used to run benchmark; enable_statistics = true Pre-pr: avg 507515519.3 ns 497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908, Post-pr: avg 511971266.5 ns, regressed 0.88% 502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408, ``` Compaction ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000 -- default: 1 thread is used to run benchmark Pre-pr: avg 495346098.30 ns 492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846 Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97% 502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007 ``` Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats) ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000 -- default: 1 thread is used to run benchmark Pre-pr: avg 3848.10 ns 3814,3838,3839,3848,3854,3854,3854,3860,3860,3860 Post-pr: avg 3874.20 ns, regressed 0.68% 3863,3867,3871,3874,3875,3877,3877,3877,3880,3881 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910 Reviewed By: ajkr Differential Revision: D49788060 Pulled By: hx235 fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
1663 lines
63 KiB
C++
1663 lines
63 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <limits>
|
|
#include <memory>
|
|
|
|
#include "db/dbformat.h"
|
|
#include "db/kv_checksum.h"
|
|
#include "db/merge_context.h"
|
|
#include "db/merge_helper.h"
|
|
#include "db/pinned_iterators_manager.h"
|
|
#include "db/range_tombstone_fragmenter.h"
|
|
#include "db/read_callback.h"
|
|
#include "db/wide/wide_column_serialization.h"
|
|
#include "logging/logging.h"
|
|
#include "memory/arena.h"
|
|
#include "memory/memory_usage.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "monitoring/statistics_impl.h"
|
|
#include "port/lang.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/comparator.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/merge_operator.h"
|
|
#include "rocksdb/slice_transform.h"
|
|
#include "rocksdb/types.h"
|
|
#include "rocksdb/write_buffer_manager.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/iterator_wrapper.h"
|
|
#include "table/merging_iterator.h"
|
|
#include "util/autovector.h"
|
|
#include "util/coding.h"
|
|
#include "util/mutexlock.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
ImmutableMemTableOptions::ImmutableMemTableOptions(
|
|
const ImmutableOptions& ioptions,
|
|
const MutableCFOptions& mutable_cf_options)
|
|
: arena_block_size(mutable_cf_options.arena_block_size),
|
|
memtable_prefix_bloom_bits(
|
|
static_cast<uint32_t>(
|
|
static_cast<double>(mutable_cf_options.write_buffer_size) *
|
|
mutable_cf_options.memtable_prefix_bloom_size_ratio) *
|
|
8u),
|
|
memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
|
|
memtable_whole_key_filtering(
|
|
mutable_cf_options.memtable_whole_key_filtering),
|
|
inplace_update_support(ioptions.inplace_update_support),
|
|
inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
|
|
inplace_callback(ioptions.inplace_callback),
|
|
max_successive_merges(mutable_cf_options.max_successive_merges),
|
|
statistics(ioptions.stats),
|
|
merge_operator(ioptions.merge_operator.get()),
|
|
info_log(ioptions.logger),
|
|
allow_data_in_errors(ioptions.allow_data_in_errors),
|
|
protection_bytes_per_key(
|
|
mutable_cf_options.memtable_protection_bytes_per_key) {}
|
|
|
|
MemTable::MemTable(const InternalKeyComparator& cmp,
|
|
const ImmutableOptions& ioptions,
|
|
const MutableCFOptions& mutable_cf_options,
|
|
WriteBufferManager* write_buffer_manager,
|
|
SequenceNumber latest_seq, uint32_t column_family_id)
|
|
: comparator_(cmp),
|
|
moptions_(ioptions, mutable_cf_options),
|
|
refs_(0),
|
|
kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)),
|
|
mem_tracker_(write_buffer_manager),
|
|
arena_(moptions_.arena_block_size,
|
|
(write_buffer_manager != nullptr &&
|
|
(write_buffer_manager->enabled() ||
|
|
write_buffer_manager->cost_to_cache()))
|
|
? &mem_tracker_
|
|
: nullptr,
|
|
mutable_cf_options.memtable_huge_page_size),
|
|
table_(ioptions.memtable_factory->CreateMemTableRep(
|
|
comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
|
|
ioptions.logger, column_family_id)),
|
|
range_del_table_(SkipListFactory().CreateMemTableRep(
|
|
comparator_, &arena_, nullptr /* transform */, ioptions.logger,
|
|
column_family_id)),
|
|
is_range_del_table_empty_(true),
|
|
data_size_(0),
|
|
num_entries_(0),
|
|
num_deletes_(0),
|
|
num_range_deletes_(0),
|
|
write_buffer_size_(mutable_cf_options.write_buffer_size),
|
|
flush_in_progress_(false),
|
|
flush_completed_(false),
|
|
file_number_(0),
|
|
first_seqno_(0),
|
|
earliest_seqno_(latest_seq),
|
|
creation_seq_(latest_seq),
|
|
mem_next_logfile_number_(0),
|
|
min_prep_log_referenced_(0),
|
|
locks_(moptions_.inplace_update_support
|
|
? moptions_.inplace_update_num_locks
|
|
: 0),
|
|
prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
|
|
flush_state_(FLUSH_NOT_REQUESTED),
|
|
clock_(ioptions.clock),
|
|
insert_with_hint_prefix_extractor_(
|
|
ioptions.memtable_insert_with_hint_prefix_extractor.get()),
|
|
oldest_key_time_(std::numeric_limits<uint64_t>::max()),
|
|
atomic_flush_seqno_(kMaxSequenceNumber),
|
|
approximate_memory_usage_(0),
|
|
memtable_max_range_deletions_(
|
|
mutable_cf_options.memtable_max_range_deletions) {
|
|
UpdateFlushState();
|
|
// something went wrong if we need to flush before inserting anything
|
|
assert(!ShouldScheduleFlush());
|
|
|
|
// use bloom_filter_ for both whole key and prefix bloom filter
|
|
if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
|
|
moptions_.memtable_prefix_bloom_bits > 0) {
|
|
bloom_filter_.reset(
|
|
new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
|
|
6 /* hard coded 6 probes */,
|
|
moptions_.memtable_huge_page_size, ioptions.logger));
|
|
}
|
|
// Initialize cached_range_tombstone_ here since it could
|
|
// be read before it is constructed in MemTable::Add(), which could also lead
|
|
// to a data race on the global mutex table backing atomic shared_ptr.
|
|
auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
|
|
size_t size = cached_range_tombstone_.Size();
|
|
for (size_t i = 0; i < size; ++i) {
|
|
std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
|
|
cached_range_tombstone_.AccessAtCore(i);
|
|
auto new_local_cache_ref = std::make_shared<
|
|
const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
|
|
std::atomic_store_explicit(
|
|
local_cache_ref_ptr,
|
|
std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
|
|
new_cache.get()),
|
|
std::memory_order_relaxed);
|
|
}
|
|
const Comparator* ucmp = cmp.user_comparator();
|
|
assert(ucmp);
|
|
ts_sz_ = ucmp->timestamp_size();
|
|
persist_user_defined_timestamps_ = ioptions.persist_user_defined_timestamps;
|
|
}
|
|
|
|
MemTable::~MemTable() {
|
|
mem_tracker_.FreeMem();
|
|
assert(refs_ == 0);
|
|
}
|
|
|
|
size_t MemTable::ApproximateMemoryUsage() {
|
|
autovector<size_t> usages = {
|
|
arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
|
|
range_del_table_->ApproximateMemoryUsage(),
|
|
ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
|
|
size_t total_usage = 0;
|
|
for (size_t usage : usages) {
|
|
// If usage + total_usage >= kMaxSizet, return kMaxSizet.
|
|
// the following variation is to avoid numeric overflow.
|
|
if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
|
|
return std::numeric_limits<size_t>::max();
|
|
}
|
|
total_usage += usage;
|
|
}
|
|
approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
|
|
// otherwise, return the actual usage
|
|
return total_usage;
|
|
}
|
|
|
|
bool MemTable::ShouldFlushNow() {
|
|
// This is set if memtable_max_range_deletions is > 0,
|
|
// and that many range deletions are done
|
|
if (memtable_max_range_deletions_ > 0 &&
|
|
num_range_deletes_.load(std::memory_order_relaxed) >=
|
|
static_cast<uint64_t>(memtable_max_range_deletions_)) {
|
|
return true;
|
|
}
|
|
|
|
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
|
|
// In a lot of times, we cannot allocate arena blocks that exactly matches the
|
|
// buffer size. Thus we have to decide if we should over-allocate or
|
|
// under-allocate.
|
|
// This constant variable can be interpreted as: if we still have more than
|
|
// "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
|
|
// allocate one more block.
|
|
const double kAllowOverAllocationRatio = 0.6;
|
|
|
|
// If arena still have room for new block allocation, we can safely say it
|
|
// shouldn't flush.
|
|
auto allocated_memory = table_->ApproximateMemoryUsage() +
|
|
range_del_table_->ApproximateMemoryUsage() +
|
|
arena_.MemoryAllocatedBytes();
|
|
|
|
approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
|
|
|
|
// if we can still allocate one more block without exceeding the
|
|
// over-allocation ratio, then we should not flush.
|
|
if (allocated_memory + kArenaBlockSize <
|
|
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
|
return false;
|
|
}
|
|
|
|
// if user keeps adding entries that exceeds write_buffer_size, we need to
|
|
// flush earlier even though we still have much available memory left.
|
|
if (allocated_memory >
|
|
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
|
return true;
|
|
}
|
|
|
|
// In this code path, Arena has already allocated its "last block", which
|
|
// means the total allocatedmemory size is either:
|
|
// (1) "moderately" over allocated the memory (no more than `0.6 * arena
|
|
// block size`. Or,
|
|
// (2) the allocated memory is less than write buffer size, but we'll stop
|
|
// here since if we allocate a new arena block, we'll over allocate too much
|
|
// more (half of the arena block size) memory.
|
|
//
|
|
// In either case, to avoid over-allocate, the last block will stop allocation
|
|
// when its usage reaches a certain ratio, which we carefully choose "0.75
|
|
// full" as the stop condition because it addresses the following issue with
|
|
// great simplicity: What if the next inserted entry's size is
|
|
// bigger than AllocatedAndUnused()?
|
|
//
|
|
// The answer is: if the entry size is also bigger than 0.25 *
|
|
// kArenaBlockSize, a dedicated block will be allocated for it; otherwise
|
|
// arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
|
|
// and regular block. In either case, we *overly* over-allocated.
|
|
//
|
|
// Therefore, setting the last block to be at most "0.75 full" avoids both
|
|
// cases.
|
|
//
|
|
// NOTE: the average percentage of waste space of this approach can be counted
|
|
// as: "arena block size * 0.25 / write buffer size". User who specify a small
|
|
// write buffer size and/or big arena block size may suffer.
|
|
return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
|
|
}
|
|
|
|
void MemTable::UpdateFlushState() {
|
|
auto state = flush_state_.load(std::memory_order_relaxed);
|
|
if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
|
|
// ignore CAS failure, because that means somebody else requested
|
|
// a flush
|
|
flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
|
|
std::memory_order_relaxed,
|
|
std::memory_order_relaxed);
|
|
}
|
|
}
|
|
|
|
void MemTable::UpdateOldestKeyTime() {
|
|
uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
|
|
if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
|
|
int64_t current_time = 0;
|
|
auto s = clock_->GetCurrentTime(¤t_time);
|
|
if (s.ok()) {
|
|
assert(current_time >= 0);
|
|
// If fail, the timestamp is already set.
|
|
oldest_key_time_.compare_exchange_strong(
|
|
oldest_key_time, static_cast<uint64_t>(current_time),
|
|
std::memory_order_relaxed, std::memory_order_relaxed);
|
|
}
|
|
}
|
|
}
|
|
|
|
Status MemTable::VerifyEntryChecksum(const char* entry,
|
|
uint32_t protection_bytes_per_key,
|
|
bool allow_data_in_errors) {
|
|
if (protection_bytes_per_key == 0) {
|
|
return Status::OK();
|
|
}
|
|
uint32_t key_length;
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
|
if (key_ptr == nullptr) {
|
|
return Status::Corruption("Unable to parse internal key length");
|
|
}
|
|
if (key_length < 8) {
|
|
return Status::Corruption("Memtable entry internal key length too short.");
|
|
}
|
|
Slice user_key = Slice(key_ptr, key_length - 8);
|
|
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
|
ValueType type;
|
|
SequenceNumber seq;
|
|
UnPackSequenceAndType(tag, &seq, &type);
|
|
|
|
uint32_t value_length = 0;
|
|
const char* value_ptr = GetVarint32Ptr(
|
|
key_ptr + key_length, key_ptr + key_length + 5, &value_length);
|
|
if (value_ptr == nullptr) {
|
|
return Status::Corruption("Unable to parse internal key value");
|
|
}
|
|
Slice value = Slice(value_ptr, value_length);
|
|
|
|
const char* checksum_ptr = value_ptr + value_length;
|
|
bool match =
|
|
ProtectionInfo64()
|
|
.ProtectKVO(user_key, value, type)
|
|
.ProtectS(seq)
|
|
.Verify(static_cast<uint8_t>(protection_bytes_per_key), checksum_ptr);
|
|
if (!match) {
|
|
std::string msg(
|
|
"Corrupted memtable entry, per key-value checksum verification "
|
|
"failed.");
|
|
if (allow_data_in_errors) {
|
|
msg.append("Unrecognized value type: " +
|
|
std::to_string(static_cast<int>(type)) + ". ");
|
|
msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". ");
|
|
msg.append("seq: " + std::to_string(seq) + ".");
|
|
}
|
|
return Status::Corruption(msg.c_str());
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
|
|
const char* prefix_len_key2) const {
|
|
// Internal keys are encoded as length-prefixed strings.
|
|
Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
|
|
Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
|
|
return comparator.CompareKeySeq(k1, k2);
|
|
}
|
|
|
|
int MemTable::KeyComparator::operator()(
|
|
const char* prefix_len_key, const KeyComparator::DecodedType& key) const {
|
|
// Internal keys are encoded as length-prefixed strings.
|
|
Slice a = GetLengthPrefixedSlice(prefix_len_key);
|
|
return comparator.CompareKeySeq(a, key);
|
|
}
|
|
|
|
void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
|
|
throw std::runtime_error("concurrent insert not supported");
|
|
}
|
|
|
|
Slice MemTableRep::UserKey(const char* key) const {
|
|
Slice slice = GetLengthPrefixedSlice(key);
|
|
return Slice(slice.data(), slice.size() - 8);
|
|
}
|
|
|
|
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
|
|
*buf = allocator_->Allocate(len);
|
|
return static_cast<KeyHandle>(*buf);
|
|
}
|
|
|
|
// Encode a suitable internal key target for "target" and return it.
|
|
// Uses *scratch as scratch space, and the returned pointer will point
|
|
// into this scratch space.
|
|
const char* EncodeKey(std::string* scratch, const Slice& target) {
|
|
scratch->clear();
|
|
PutVarint32(scratch, static_cast<uint32_t>(target.size()));
|
|
scratch->append(target.data(), target.size());
|
|
return scratch->data();
|
|
}
|
|
|
|
class MemTableIterator : public InternalIterator {
|
|
public:
|
|
MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
|
|
Arena* arena, bool use_range_del_table = false)
|
|
: bloom_(nullptr),
|
|
prefix_extractor_(mem.prefix_extractor_),
|
|
comparator_(mem.comparator_),
|
|
valid_(false),
|
|
arena_mode_(arena != nullptr),
|
|
value_pinned_(
|
|
!mem.GetImmutableMemTableOptions()->inplace_update_support),
|
|
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
|
|
status_(Status::OK()),
|
|
logger_(mem.moptions_.info_log),
|
|
ts_sz_(mem.ts_sz_) {
|
|
if (use_range_del_table) {
|
|
iter_ = mem.range_del_table_->GetIterator(arena);
|
|
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
|
|
!read_options.auto_prefix_mode) {
|
|
// Auto prefix mode is not implemented in memtable yet.
|
|
bloom_ = mem.bloom_filter_.get();
|
|
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
|
|
} else {
|
|
iter_ = mem.table_->GetIterator(arena);
|
|
}
|
|
status_.PermitUncheckedError();
|
|
}
|
|
// No copying allowed
|
|
MemTableIterator(const MemTableIterator&) = delete;
|
|
void operator=(const MemTableIterator&) = delete;
|
|
|
|
~MemTableIterator() override {
|
|
#ifndef NDEBUG
|
|
// Assert that the MemTableIterator is never deleted while
|
|
// Pinning is Enabled.
|
|
assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
|
|
#endif
|
|
if (arena_mode_) {
|
|
iter_->~Iterator();
|
|
} else {
|
|
delete iter_;
|
|
}
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
|
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
|
}
|
|
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
|
|
#endif
|
|
|
|
bool Valid() const override { return valid_ && status_.ok(); }
|
|
void Seek(const Slice& k) override {
|
|
PERF_TIMER_GUARD(seek_on_memtable_time);
|
|
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
|
if (bloom_) {
|
|
// iterator should only use prefix bloom filter
|
|
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
|
|
if (prefix_extractor_->InDomain(user_k_without_ts)) {
|
|
if (!bloom_->MayContain(
|
|
prefix_extractor_->Transform(user_k_without_ts))) {
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
valid_ = false;
|
|
return;
|
|
} else {
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
}
|
|
}
|
|
}
|
|
iter_->Seek(k, nullptr);
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
}
|
|
void SeekForPrev(const Slice& k) override {
|
|
PERF_TIMER_GUARD(seek_on_memtable_time);
|
|
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
|
if (bloom_) {
|
|
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
|
|
if (prefix_extractor_->InDomain(user_k_without_ts)) {
|
|
if (!bloom_->MayContain(
|
|
prefix_extractor_->Transform(user_k_without_ts))) {
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
valid_ = false;
|
|
return;
|
|
} else {
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
}
|
|
}
|
|
}
|
|
iter_->Seek(k, nullptr);
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
if (!Valid() && status().ok()) {
|
|
SeekToLast();
|
|
}
|
|
while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
|
|
Prev();
|
|
}
|
|
}
|
|
void SeekToFirst() override {
|
|
iter_->SeekToFirst();
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
}
|
|
void SeekToLast() override {
|
|
iter_->SeekToLast();
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
}
|
|
void Next() override {
|
|
PERF_COUNTER_ADD(next_on_memtable_count, 1);
|
|
assert(Valid());
|
|
iter_->Next();
|
|
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
}
|
|
bool NextAndGetResult(IterateResult* result) override {
|
|
Next();
|
|
bool is_valid = Valid();
|
|
if (is_valid) {
|
|
result->key = key();
|
|
result->bound_check_result = IterBoundCheck::kUnknown;
|
|
result->value_prepared = true;
|
|
}
|
|
return is_valid;
|
|
}
|
|
void Prev() override {
|
|
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
|
|
assert(Valid());
|
|
iter_->Prev();
|
|
valid_ = iter_->Valid();
|
|
VerifyEntryChecksum();
|
|
}
|
|
Slice key() const override {
|
|
assert(Valid());
|
|
return GetLengthPrefixedSlice(iter_->key());
|
|
}
|
|
Slice value() const override {
|
|
assert(Valid());
|
|
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
|
|
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
|
}
|
|
|
|
Status status() const override { return status_; }
|
|
|
|
bool IsKeyPinned() const override {
|
|
// memtable data is always pinned
|
|
return true;
|
|
}
|
|
|
|
bool IsValuePinned() const override {
|
|
// memtable value is always pinned, except if we allow inplace update.
|
|
return value_pinned_;
|
|
}
|
|
|
|
private:
|
|
DynamicBloom* bloom_;
|
|
const SliceTransform* const prefix_extractor_;
|
|
const MemTable::KeyComparator comparator_;
|
|
MemTableRep::Iterator* iter_;
|
|
bool valid_;
|
|
bool arena_mode_;
|
|
bool value_pinned_;
|
|
uint32_t protection_bytes_per_key_;
|
|
Status status_;
|
|
Logger* logger_;
|
|
size_t ts_sz_;
|
|
|
|
void VerifyEntryChecksum() {
|
|
if (protection_bytes_per_key_ > 0 && Valid()) {
|
|
status_ = MemTable::VerifyEntryChecksum(iter_->key(),
|
|
protection_bytes_per_key_);
|
|
if (!status_.ok()) {
|
|
ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState());
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
|
|
Arena* arena) {
|
|
assert(arena != nullptr);
|
|
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
|
|
return new (mem) MemTableIterator(*this, read_options, arena);
|
|
}
|
|
|
|
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
|
|
const ReadOptions& read_options, SequenceNumber read_seq,
|
|
bool immutable_memtable) {
|
|
if (read_options.ignore_range_deletions ||
|
|
is_range_del_table_empty_.load(std::memory_order_relaxed)) {
|
|
return nullptr;
|
|
}
|
|
return NewRangeTombstoneIteratorInternal(read_options, read_seq,
|
|
immutable_memtable);
|
|
}
|
|
|
|
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
|
|
const ReadOptions& read_options, SequenceNumber read_seq,
|
|
bool immutable_memtable) {
|
|
if (immutable_memtable) {
|
|
// Note that caller should already have verified that
|
|
// !is_range_del_table_empty_
|
|
assert(IsFragmentedRangeTombstonesConstructed());
|
|
return new FragmentedRangeTombstoneIterator(
|
|
fragmented_range_tombstone_list_.get(), comparator_.comparator,
|
|
read_seq, read_options.timestamp);
|
|
}
|
|
|
|
// takes current cache
|
|
std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
|
|
std::atomic_load_explicit(cached_range_tombstone_.Access(),
|
|
std::memory_order_relaxed);
|
|
// construct fragmented tombstone list if necessary
|
|
if (!cache->initialized.load(std::memory_order_acquire)) {
|
|
cache->reader_mutex.lock();
|
|
if (!cache->tombstones) {
|
|
auto* unfragmented_iter =
|
|
new MemTableIterator(*this, read_options, nullptr /* arena */,
|
|
true /* use_range_del_table */);
|
|
cache->tombstones.reset(new FragmentedRangeTombstoneList(
|
|
std::unique_ptr<InternalIterator>(unfragmented_iter),
|
|
comparator_.comparator));
|
|
cache->initialized.store(true, std::memory_order_release);
|
|
}
|
|
cache->reader_mutex.unlock();
|
|
}
|
|
|
|
auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
|
|
cache, comparator_.comparator, read_seq, read_options.timestamp);
|
|
return fragmented_iter;
|
|
}
|
|
|
|
void MemTable::ConstructFragmentedRangeTombstones() {
|
|
assert(!IsFragmentedRangeTombstonesConstructed(false));
|
|
// There should be no concurrent Construction
|
|
if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
auto* unfragmented_iter =
|
|
new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
|
|
true /* use_range_del_table */);
|
|
|
|
fragmented_range_tombstone_list_ =
|
|
std::make_unique<FragmentedRangeTombstoneList>(
|
|
std::unique_ptr<InternalIterator>(unfragmented_iter),
|
|
comparator_.comparator);
|
|
}
|
|
}
|
|
|
|
port::RWMutex* MemTable::GetLock(const Slice& key) {
|
|
return &locks_[GetSliceRangedNPHash(key, locks_.size())];
|
|
}
|
|
|
|
MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
|
|
const Slice& end_ikey) {
|
|
uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
|
|
entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
|
|
if (entry_count == 0) {
|
|
return {0, 0};
|
|
}
|
|
uint64_t n = num_entries_.load(std::memory_order_relaxed);
|
|
if (n == 0) {
|
|
return {0, 0};
|
|
}
|
|
if (entry_count > n) {
|
|
// (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
|
|
// be larger than actual entries we have. Cap it to entries we have to limit
|
|
// the inaccuracy.
|
|
entry_count = n;
|
|
}
|
|
uint64_t data_size = data_size_.load(std::memory_order_relaxed);
|
|
return {entry_count * (data_size / n), entry_count};
|
|
}
|
|
|
|
Status MemTable::VerifyEncodedEntry(Slice encoded,
|
|
const ProtectionInfoKVOS64& kv_prot_info) {
|
|
uint32_t ikey_len = 0;
|
|
if (!GetVarint32(&encoded, &ikey_len)) {
|
|
return Status::Corruption("Unable to parse internal key length");
|
|
}
|
|
if (ikey_len < 8 + ts_sz_) {
|
|
return Status::Corruption("Internal key length too short");
|
|
}
|
|
if (ikey_len > encoded.size()) {
|
|
return Status::Corruption("Internal key length too long");
|
|
}
|
|
uint32_t value_len = 0;
|
|
const size_t user_key_len = ikey_len - 8;
|
|
Slice key(encoded.data(), user_key_len);
|
|
encoded.remove_prefix(user_key_len);
|
|
|
|
uint64_t packed = DecodeFixed64(encoded.data());
|
|
ValueType value_type = kMaxValue;
|
|
SequenceNumber sequence_number = kMaxSequenceNumber;
|
|
UnPackSequenceAndType(packed, &sequence_number, &value_type);
|
|
encoded.remove_prefix(8);
|
|
|
|
if (!GetVarint32(&encoded, &value_len)) {
|
|
return Status::Corruption("Unable to parse value length");
|
|
}
|
|
if (value_len < encoded.size()) {
|
|
return Status::Corruption("Value length too short");
|
|
}
|
|
if (value_len > encoded.size()) {
|
|
return Status::Corruption("Value length too long");
|
|
}
|
|
Slice value(encoded.data(), value_len);
|
|
|
|
return kv_prot_info.StripS(sequence_number)
|
|
.StripKVO(key, value, value_type)
|
|
.GetStatus();
|
|
}
|
|
|
|
void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
|
|
const Slice& key, const Slice& value,
|
|
ValueType type, SequenceNumber s,
|
|
char* checksum_ptr) {
|
|
if (moptions_.protection_bytes_per_key == 0) {
|
|
return;
|
|
}
|
|
|
|
if (kv_prot_info == nullptr) {
|
|
ProtectionInfo64()
|
|
.ProtectKVO(key, value, type)
|
|
.ProtectS(s)
|
|
.Encode(static_cast<uint8_t>(moptions_.protection_bytes_per_key),
|
|
checksum_ptr);
|
|
} else {
|
|
kv_prot_info->Encode(
|
|
static_cast<uint8_t>(moptions_.protection_bytes_per_key), checksum_ptr);
|
|
}
|
|
}
|
|
|
|
Status MemTable::Add(SequenceNumber s, ValueType type,
|
|
const Slice& key, /* user key */
|
|
const Slice& value,
|
|
const ProtectionInfoKVOS64* kv_prot_info,
|
|
bool allow_concurrent,
|
|
MemTablePostProcessInfo* post_process_info, void** hint) {
|
|
// Format of an entry is concatenation of:
|
|
// key_size : varint32 of internal_key.size()
|
|
// key bytes : char[internal_key.size()]
|
|
// value_size : varint32 of value.size()
|
|
// value bytes : char[value.size()]
|
|
// checksum : char[moptions_.protection_bytes_per_key]
|
|
uint32_t key_size = static_cast<uint32_t>(key.size());
|
|
uint32_t val_size = static_cast<uint32_t>(value.size());
|
|
uint32_t internal_key_size = key_size + 8;
|
|
const uint32_t encoded_len = VarintLength(internal_key_size) +
|
|
internal_key_size + VarintLength(val_size) +
|
|
val_size + moptions_.protection_bytes_per_key;
|
|
char* buf = nullptr;
|
|
std::unique_ptr<MemTableRep>& table =
|
|
type == kTypeRangeDeletion ? range_del_table_ : table_;
|
|
KeyHandle handle = table->Allocate(encoded_len, &buf);
|
|
|
|
char* p = EncodeVarint32(buf, internal_key_size);
|
|
memcpy(p, key.data(), key_size);
|
|
Slice key_slice(p, key_size);
|
|
p += key_size;
|
|
uint64_t packed = PackSequenceAndType(s, type);
|
|
EncodeFixed64(p, packed);
|
|
p += 8;
|
|
p = EncodeVarint32(p, val_size);
|
|
memcpy(p, value.data(), val_size);
|
|
assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) ==
|
|
(unsigned)encoded_len);
|
|
|
|
UpdateEntryChecksum(kv_prot_info, key, value, type, s,
|
|
buf + encoded_len - moptions_.protection_bytes_per_key);
|
|
Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key);
|
|
if (kv_prot_info != nullptr) {
|
|
TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded);
|
|
Status status = VerifyEncodedEntry(encoded, *kv_prot_info);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz_);
|
|
|
|
if (!allow_concurrent) {
|
|
// Extract prefix for insert with hint.
|
|
if (insert_with_hint_prefix_extractor_ != nullptr &&
|
|
insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
|
|
Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
|
|
bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
|
|
if (UNLIKELY(!res)) {
|
|
return Status::TryAgain("key+seq exists");
|
|
}
|
|
} else {
|
|
bool res = table->InsertKey(handle);
|
|
if (UNLIKELY(!res)) {
|
|
return Status::TryAgain("key+seq exists");
|
|
}
|
|
}
|
|
|
|
// this is a bit ugly, but is the way to avoid locked instructions
|
|
// when incrementing an atomic
|
|
num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
|
|
std::memory_order_relaxed);
|
|
data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
|
|
std::memory_order_relaxed);
|
|
if (type == kTypeDeletion || type == kTypeSingleDeletion ||
|
|
type == kTypeDeletionWithTimestamp) {
|
|
num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
|
|
std::memory_order_relaxed);
|
|
} else if (type == kTypeRangeDeletion) {
|
|
uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1;
|
|
num_range_deletes_.store(val, std::memory_order_relaxed);
|
|
}
|
|
|
|
if (bloom_filter_ && prefix_extractor_ &&
|
|
prefix_extractor_->InDomain(key_without_ts)) {
|
|
bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
|
|
}
|
|
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
|
|
bloom_filter_->Add(key_without_ts);
|
|
}
|
|
|
|
// The first sequence number inserted into the memtable
|
|
assert(first_seqno_ == 0 || s >= first_seqno_);
|
|
if (first_seqno_ == 0) {
|
|
first_seqno_.store(s, std::memory_order_relaxed);
|
|
|
|
if (earliest_seqno_ == kMaxSequenceNumber) {
|
|
earliest_seqno_.store(GetFirstSequenceNumber(),
|
|
std::memory_order_relaxed);
|
|
}
|
|
assert(first_seqno_.load() >= earliest_seqno_.load());
|
|
}
|
|
assert(post_process_info == nullptr);
|
|
// TODO(yuzhangyu): support updating newest UDT for when `allow_concurrent`
|
|
// is true.
|
|
MaybeUpdateNewestUDT(key_slice);
|
|
UpdateFlushState();
|
|
} else {
|
|
bool res = (hint == nullptr)
|
|
? table->InsertKeyConcurrently(handle)
|
|
: table->InsertKeyWithHintConcurrently(handle, hint);
|
|
if (UNLIKELY(!res)) {
|
|
return Status::TryAgain("key+seq exists");
|
|
}
|
|
|
|
assert(post_process_info != nullptr);
|
|
post_process_info->num_entries++;
|
|
post_process_info->data_size += encoded_len;
|
|
if (type == kTypeDeletion) {
|
|
post_process_info->num_deletes++;
|
|
}
|
|
|
|
if (bloom_filter_ && prefix_extractor_ &&
|
|
prefix_extractor_->InDomain(key_without_ts)) {
|
|
bloom_filter_->AddConcurrently(
|
|
prefix_extractor_->Transform(key_without_ts));
|
|
}
|
|
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
|
|
bloom_filter_->AddConcurrently(key_without_ts);
|
|
}
|
|
|
|
// atomically update first_seqno_ and earliest_seqno_.
|
|
uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
|
|
while ((cur_seq_num == 0 || s < cur_seq_num) &&
|
|
!first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
|
|
}
|
|
uint64_t cur_earliest_seqno =
|
|
earliest_seqno_.load(std::memory_order_relaxed);
|
|
while (
|
|
(cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
|
|
!earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
|
|
}
|
|
}
|
|
if (type == kTypeRangeDeletion) {
|
|
auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
|
|
size_t size = cached_range_tombstone_.Size();
|
|
if (allow_concurrent) {
|
|
post_process_info->num_range_deletes++;
|
|
range_del_mutex_.lock();
|
|
}
|
|
for (size_t i = 0; i < size; ++i) {
|
|
std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
|
|
cached_range_tombstone_.AccessAtCore(i);
|
|
auto new_local_cache_ref = std::make_shared<
|
|
const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
|
|
// It is okay for some reader to load old cache during invalidation as
|
|
// the new sequence number is not published yet.
|
|
// Each core will have a shared_ptr to a shared_ptr to the cached
|
|
// fragmented range tombstones, so that ref count is maintianed locally
|
|
// per-core using the per-core shared_ptr.
|
|
std::atomic_store_explicit(
|
|
local_cache_ref_ptr,
|
|
std::shared_ptr<FragmentedRangeTombstoneListCache>(
|
|
new_local_cache_ref, new_cache.get()),
|
|
std::memory_order_relaxed);
|
|
}
|
|
|
|
if (allow_concurrent) {
|
|
range_del_mutex_.unlock();
|
|
}
|
|
is_range_del_table_empty_.store(false, std::memory_order_relaxed);
|
|
}
|
|
UpdateOldestKeyTime();
|
|
|
|
TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded);
|
|
return Status::OK();
|
|
}
|
|
|
|
// Callback from MemTable::Get()
|
|
namespace {
|
|
|
|
struct Saver {
|
|
Status* status;
|
|
const LookupKey* key;
|
|
bool* found_final_value; // Is value set correctly? Used by KeyMayExist
|
|
bool* merge_in_progress;
|
|
std::string* value;
|
|
PinnableWideColumns* columns;
|
|
SequenceNumber seq;
|
|
std::string* timestamp;
|
|
const MergeOperator* merge_operator;
|
|
// the merge operations encountered;
|
|
MergeContext* merge_context;
|
|
SequenceNumber max_covering_tombstone_seq;
|
|
MemTable* mem;
|
|
Logger* logger;
|
|
Statistics* statistics;
|
|
bool inplace_update_support;
|
|
bool do_merge;
|
|
SystemClock* clock;
|
|
|
|
ReadCallback* callback_;
|
|
bool* is_blob_index;
|
|
bool allow_data_in_errors;
|
|
uint32_t protection_bytes_per_key;
|
|
bool CheckCallback(SequenceNumber _seq) {
|
|
if (callback_) {
|
|
return callback_->IsVisible(_seq);
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
} // anonymous namespace
|
|
|
|
static bool SaveValue(void* arg, const char* entry) {
|
|
TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry);
|
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
|
assert(s != nullptr);
|
|
assert(!s->value || !s->columns);
|
|
|
|
if (s->protection_bytes_per_key > 0) {
|
|
*(s->status) = MemTable::VerifyEntryChecksum(
|
|
entry, s->protection_bytes_per_key, s->allow_data_in_errors);
|
|
if (!s->status->ok()) {
|
|
ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
|
|
// Memtable entry corrupted
|
|
return false;
|
|
}
|
|
}
|
|
|
|
MergeContext* merge_context = s->merge_context;
|
|
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
|
|
const MergeOperator* merge_operator = s->merge_operator;
|
|
|
|
assert(merge_context != nullptr);
|
|
|
|
// Refer to comments under MemTable::Add() for entry format.
|
|
// Check that it belongs to same user key.
|
|
uint32_t key_length = 0;
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
|
assert(key_length >= 8);
|
|
Slice user_key_slice = Slice(key_ptr, key_length - 8);
|
|
const Comparator* user_comparator =
|
|
s->mem->GetInternalKeyComparator().user_comparator();
|
|
size_t ts_sz = user_comparator->timestamp_size();
|
|
if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) {
|
|
// timestamp should already be set to range tombstone timestamp
|
|
assert(s->timestamp->size() == ts_sz);
|
|
}
|
|
if (user_comparator->EqualWithoutTimestamp(user_key_slice,
|
|
s->key->user_key())) {
|
|
// Correct user key
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
|
ValueType type;
|
|
SequenceNumber seq;
|
|
UnPackSequenceAndType(tag, &seq, &type);
|
|
// If the value is not in the snapshot, skip it
|
|
if (!s->CheckCallback(seq)) {
|
|
return true; // to continue to the next seq
|
|
}
|
|
|
|
if (s->seq == kMaxSequenceNumber) {
|
|
s->seq = seq;
|
|
if (s->seq > max_covering_tombstone_seq) {
|
|
if (ts_sz && s->timestamp != nullptr) {
|
|
// `timestamp` was set to range tombstone's timestamp before
|
|
// `SaveValue` is ever called. This key has a higher sequence number
|
|
// than range tombstone, and is the key with the highest seqno across
|
|
// all keys with this user_key, so we update timestamp here.
|
|
Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
|
|
s->timestamp->assign(ts.data(), ts_sz);
|
|
}
|
|
} else {
|
|
s->seq = max_covering_tombstone_seq;
|
|
}
|
|
}
|
|
|
|
if (ts_sz > 0 && s->timestamp != nullptr) {
|
|
if (!s->timestamp->empty()) {
|
|
assert(ts_sz == s->timestamp->size());
|
|
}
|
|
// TODO optimize for smaller size ts
|
|
const std::string kMaxTs(ts_sz, '\xff');
|
|
if (s->timestamp->empty() ||
|
|
user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) {
|
|
Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
|
|
s->timestamp->assign(ts.data(), ts_sz);
|
|
}
|
|
}
|
|
|
|
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex ||
|
|
type == kTypeWideColumnEntity || type == kTypeDeletion ||
|
|
type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) &&
|
|
max_covering_tombstone_seq > seq) {
|
|
type = kTypeRangeDeletion;
|
|
}
|
|
switch (type) {
|
|
case kTypeBlobIndex: {
|
|
if (!s->do_merge) {
|
|
*(s->status) = Status::NotSupported(
|
|
"GetMergeOperands not supported by stacked BlobDB");
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
|
|
if (*(s->merge_in_progress)) {
|
|
*(s->status) = Status::NotSupported(
|
|
"Merge operator not supported by stacked BlobDB");
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
|
|
if (s->is_blob_index == nullptr) {
|
|
ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index.");
|
|
*(s->status) = Status::NotSupported(
|
|
"Encountered unexpected blob index. Please open DB with "
|
|
"ROCKSDB_NAMESPACE::blob_db::BlobDB.");
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadLock();
|
|
}
|
|
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
*(s->status) = Status::OK();
|
|
|
|
if (s->value) {
|
|
s->value->assign(v.data(), v.size());
|
|
} else if (s->columns) {
|
|
s->columns->SetPlainValue(v);
|
|
}
|
|
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadUnlock();
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
|
*(s->is_blob_index) = true;
|
|
|
|
return false;
|
|
}
|
|
case kTypeValue: {
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadLock();
|
|
}
|
|
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
*(s->status) = Status::OK();
|
|
|
|
if (!s->do_merge) {
|
|
// Preserve the value with the goal of returning it as part of
|
|
// raw merge operands to the user
|
|
// TODO(yanqin) update MergeContext so that timestamps information
|
|
// can also be retained.
|
|
|
|
merge_context->PushOperand(
|
|
v, s->inplace_update_support == false /* operand_pinned */);
|
|
} else if (*(s->merge_in_progress)) {
|
|
assert(s->do_merge);
|
|
|
|
if (s->value || s->columns) {
|
|
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
// nullptr) since a failure must be propagated regardless of its
|
|
// value.
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
merge_operator, s->key->user_key(),
|
|
MergeHelper::kPlainBaseValue, v, merge_context->GetOperands(),
|
|
s->logger, s->statistics, s->clock,
|
|
/* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
|
|
s->value, s->columns);
|
|
}
|
|
} else if (s->value) {
|
|
s->value->assign(v.data(), v.size());
|
|
} else if (s->columns) {
|
|
s->columns->SetPlainValue(v);
|
|
}
|
|
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadUnlock();
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
|
|
|
if (s->is_blob_index != nullptr) {
|
|
*(s->is_blob_index) = false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
case kTypeWideColumnEntity: {
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadLock();
|
|
}
|
|
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
*(s->status) = Status::OK();
|
|
|
|
if (!s->do_merge) {
|
|
// Preserve the value with the goal of returning it as part of
|
|
// raw merge operands to the user
|
|
|
|
Slice value_of_default;
|
|
*(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
|
|
v, value_of_default);
|
|
|
|
if (s->status->ok()) {
|
|
merge_context->PushOperand(
|
|
value_of_default,
|
|
s->inplace_update_support == false /* operand_pinned */);
|
|
}
|
|
} else if (*(s->merge_in_progress)) {
|
|
assert(s->do_merge);
|
|
|
|
if (s->value || s->columns) {
|
|
// `op_failure_scope` (an output parameter) is not provided (set
|
|
// to nullptr) since a failure must be propagated regardless of
|
|
// its value.
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
merge_operator, s->key->user_key(), MergeHelper::kWideBaseValue,
|
|
v, merge_context->GetOperands(), s->logger, s->statistics,
|
|
s->clock, /* update_num_ops_stats */ true,
|
|
/* op_failure_scope */ nullptr, s->value, s->columns);
|
|
}
|
|
} else if (s->value) {
|
|
Slice value_of_default;
|
|
*(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
|
|
v, value_of_default);
|
|
if (s->status->ok()) {
|
|
s->value->assign(value_of_default.data(), value_of_default.size());
|
|
}
|
|
} else if (s->columns) {
|
|
*(s->status) = s->columns->SetWideColumnValue(v);
|
|
}
|
|
|
|
if (s->inplace_update_support) {
|
|
s->mem->GetLock(s->key->user_key())->ReadUnlock();
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
|
|
|
if (s->is_blob_index != nullptr) {
|
|
*(s->is_blob_index) = false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
case kTypeDeletion:
|
|
case kTypeDeletionWithTimestamp:
|
|
case kTypeSingleDeletion:
|
|
case kTypeRangeDeletion: {
|
|
if (*(s->merge_in_progress)) {
|
|
if (s->value || s->columns) {
|
|
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
// nullptr) since a failure must be propagated regardless of its
|
|
// value.
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue,
|
|
merge_context->GetOperands(), s->logger, s->statistics,
|
|
s->clock, /* update_num_ops_stats */ true,
|
|
/* op_failure_scope */ nullptr, s->value, s->columns);
|
|
} else {
|
|
// We have found a final value (a base deletion) and have newer
|
|
// merge operands that we do not intend to merge. Nothing remains
|
|
// to be done so assign status to OK.
|
|
*(s->status) = Status::OK();
|
|
}
|
|
} else {
|
|
*(s->status) = Status::NotFound();
|
|
}
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
case kTypeMerge: {
|
|
if (!merge_operator) {
|
|
*(s->status) = Status::InvalidArgument(
|
|
"merge_operator is not properly initialized.");
|
|
// Normally we continue the loop (return true) when we see a merge
|
|
// operand. But in case of an error, we should stop the loop
|
|
// immediately and pretend we have found the value to stop further
|
|
// seek. Otherwise, the later call will override this error status.
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
*(s->merge_in_progress) = true;
|
|
merge_context->PushOperand(
|
|
v, s->inplace_update_support == false /* operand_pinned */);
|
|
PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1);
|
|
|
|
if (s->do_merge && merge_operator->ShouldMerge(
|
|
merge_context->GetOperandsDirectionBackward())) {
|
|
if (s->value || s->columns) {
|
|
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
// nullptr) since a failure must be propagated regardless of its
|
|
// value.
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue,
|
|
merge_context->GetOperands(), s->logger, s->statistics,
|
|
s->clock, /* update_num_ops_stats */ true,
|
|
/* op_failure_scope */ nullptr, s->value, s->columns);
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
default: {
|
|
std::string msg("Corrupted value not expected.");
|
|
if (s->allow_data_in_errors) {
|
|
msg.append("Unrecognized value type: " +
|
|
std::to_string(static_cast<int>(type)) + ". ");
|
|
msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) +
|
|
". ");
|
|
msg.append("seq: " + std::to_string(seq) + ".");
|
|
}
|
|
*(s->status) = Status::Corruption(msg.c_str());
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// s->state could be Corrupt, merge or notfound
|
|
return false;
|
|
}
|
|
|
|
bool MemTable::Get(const LookupKey& key, std::string* value,
|
|
PinnableWideColumns* columns, std::string* timestamp,
|
|
Status* s, MergeContext* merge_context,
|
|
SequenceNumber* max_covering_tombstone_seq,
|
|
SequenceNumber* seq, const ReadOptions& read_opts,
|
|
bool immutable_memtable, ReadCallback* callback,
|
|
bool* is_blob_index, bool do_merge) {
|
|
// The sequence number is updated synchronously in version_set.h
|
|
if (IsEmpty()) {
|
|
// Avoiding recording stats for speed.
|
|
return false;
|
|
}
|
|
|
|
PERF_TIMER_GUARD(get_from_memtable_time);
|
|
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
NewRangeTombstoneIterator(read_opts,
|
|
GetInternalKeySeqno(key.internal_key()),
|
|
immutable_memtable));
|
|
if (range_del_iter != nullptr) {
|
|
SequenceNumber covering_seq =
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key());
|
|
if (covering_seq > *max_covering_tombstone_seq) {
|
|
*max_covering_tombstone_seq = covering_seq;
|
|
if (timestamp) {
|
|
// Will be overwritten in SaveValue() if there is a point key with
|
|
// a higher seqno.
|
|
timestamp->assign(range_del_iter->timestamp().data(),
|
|
range_del_iter->timestamp().size());
|
|
}
|
|
}
|
|
}
|
|
|
|
bool found_final_value = false;
|
|
bool merge_in_progress = s->IsMergeInProgress();
|
|
bool may_contain = true;
|
|
Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz_);
|
|
bool bloom_checked = false;
|
|
if (bloom_filter_) {
|
|
// when both memtable_whole_key_filtering and prefix_extractor_ are set,
|
|
// only do whole key filtering for Get() to save CPU
|
|
if (moptions_.memtable_whole_key_filtering) {
|
|
may_contain = bloom_filter_->MayContain(user_key_without_ts);
|
|
bloom_checked = true;
|
|
} else {
|
|
assert(prefix_extractor_);
|
|
if (prefix_extractor_->InDomain(user_key_without_ts)) {
|
|
may_contain = bloom_filter_->MayContain(
|
|
prefix_extractor_->Transform(user_key_without_ts));
|
|
bloom_checked = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (bloom_filter_ && !may_contain) {
|
|
// iter is null if prefix bloom says the key does not exist
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
*seq = kMaxSequenceNumber;
|
|
} else {
|
|
if (bloom_checked) {
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
}
|
|
GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
|
|
is_blob_index, value, columns, timestamp, s, merge_context,
|
|
seq, &found_final_value, &merge_in_progress);
|
|
}
|
|
|
|
// No change to value, since we have not yet found a Put/Delete
|
|
// Propagate corruption error
|
|
if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
|
|
*s = Status::MergeInProgress();
|
|
}
|
|
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
|
return found_final_value;
|
|
}
|
|
|
|
void MemTable::GetFromTable(const LookupKey& key,
|
|
SequenceNumber max_covering_tombstone_seq,
|
|
bool do_merge, ReadCallback* callback,
|
|
bool* is_blob_index, std::string* value,
|
|
PinnableWideColumns* columns,
|
|
std::string* timestamp, Status* s,
|
|
MergeContext* merge_context, SequenceNumber* seq,
|
|
bool* found_final_value, bool* merge_in_progress) {
|
|
Saver saver;
|
|
saver.status = s;
|
|
saver.found_final_value = found_final_value;
|
|
saver.merge_in_progress = merge_in_progress;
|
|
saver.key = &key;
|
|
saver.value = value;
|
|
saver.columns = columns;
|
|
saver.timestamp = timestamp;
|
|
saver.seq = kMaxSequenceNumber;
|
|
saver.mem = this;
|
|
saver.merge_context = merge_context;
|
|
saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
|
|
saver.merge_operator = moptions_.merge_operator;
|
|
saver.logger = moptions_.info_log;
|
|
saver.inplace_update_support = moptions_.inplace_update_support;
|
|
saver.statistics = moptions_.statistics;
|
|
saver.clock = clock_;
|
|
saver.callback_ = callback;
|
|
saver.is_blob_index = is_blob_index;
|
|
saver.do_merge = do_merge;
|
|
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
|
|
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
|
|
table_->Get(key, &saver, SaveValue);
|
|
*seq = saver.seq;
|
|
}
|
|
|
|
void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
ReadCallback* callback, bool immutable_memtable) {
|
|
// The sequence number is updated synchronously in version_set.h
|
|
if (IsEmpty()) {
|
|
// Avoiding recording stats for speed.
|
|
return;
|
|
}
|
|
PERF_TIMER_GUARD(get_from_memtable_time);
|
|
|
|
// For now, memtable Bloom filter is effectively disabled if there are any
|
|
// range tombstones. This is the simplest way to ensure range tombstones are
|
|
// handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
|
|
bool no_range_del = read_options.ignore_range_deletions ||
|
|
is_range_del_table_empty_.load(std::memory_order_relaxed);
|
|
MultiGetRange temp_range(*range, range->begin(), range->end());
|
|
if (bloom_filter_ && no_range_del) {
|
|
bool whole_key =
|
|
!prefix_extractor_ || moptions_.memtable_whole_key_filtering;
|
|
std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
|
|
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
|
|
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
|
|
int num_keys = 0;
|
|
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
|
|
if (whole_key) {
|
|
bloom_keys[num_keys] = iter->ukey_without_ts;
|
|
range_indexes[num_keys++] = iter.index();
|
|
} else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
|
|
bloom_keys[num_keys] =
|
|
prefix_extractor_->Transform(iter->ukey_without_ts);
|
|
range_indexes[num_keys++] = iter.index();
|
|
}
|
|
}
|
|
bloom_filter_->MayContain(num_keys, bloom_keys.data(), may_match.data());
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
if (!may_match[i]) {
|
|
temp_range.SkipIndex(range_indexes[i]);
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
} else {
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
}
|
|
}
|
|
}
|
|
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
|
|
bool found_final_value{false};
|
|
bool merge_in_progress = iter->s->IsMergeInProgress();
|
|
if (!no_range_del) {
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
NewRangeTombstoneIteratorInternal(
|
|
read_options, GetInternalKeySeqno(iter->lkey->internal_key()),
|
|
immutable_memtable));
|
|
SequenceNumber covering_seq =
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key());
|
|
if (covering_seq > iter->max_covering_tombstone_seq) {
|
|
iter->max_covering_tombstone_seq = covering_seq;
|
|
if (iter->timestamp) {
|
|
// Will be overwritten in SaveValue() if there is a point key with
|
|
// a higher seqno.
|
|
iter->timestamp->assign(range_del_iter->timestamp().data(),
|
|
range_del_iter->timestamp().size());
|
|
}
|
|
}
|
|
}
|
|
SequenceNumber dummy_seq;
|
|
GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
|
|
callback, &iter->is_blob_index,
|
|
iter->value ? iter->value->GetSelf() : nullptr, iter->columns,
|
|
iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
|
|
&found_final_value, &merge_in_progress);
|
|
|
|
if (!found_final_value && merge_in_progress) {
|
|
*(iter->s) = Status::MergeInProgress();
|
|
}
|
|
|
|
if (found_final_value) {
|
|
if (iter->value) {
|
|
iter->value->PinSelf();
|
|
range->AddValueSize(iter->value->size());
|
|
} else {
|
|
assert(iter->columns);
|
|
range->AddValueSize(iter->columns->serialized_size());
|
|
}
|
|
|
|
range->MarkKeyDone(iter);
|
|
RecordTick(moptions_.statistics, MEMTABLE_HIT);
|
|
if (range->GetValueSize() > read_options.value_size_soft_limit) {
|
|
// Set all remaining keys in range to Abort
|
|
for (auto range_iter = range->begin(); range_iter != range->end();
|
|
++range_iter) {
|
|
range->MarkKeyDone(range_iter);
|
|
*(range_iter->s) = Status::Aborted();
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
|
}
|
|
|
|
Status MemTable::Update(SequenceNumber seq, ValueType value_type,
|
|
const Slice& key, const Slice& value,
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
|
LookupKey lkey(key, seq);
|
|
Slice mem_key = lkey.memtable_key();
|
|
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
|
table_->GetDynamicPrefixIterator());
|
|
iter->Seek(lkey.internal_key(), mem_key.data());
|
|
|
|
if (iter->Valid()) {
|
|
// Refer to comments under MemTable::Add() for entry format.
|
|
// Check that it belongs to same user key. We do not check the
|
|
// sequence number since the Seek() call above should have skipped
|
|
// all entries with overly large sequence numbers.
|
|
const char* entry = iter->key();
|
|
uint32_t key_length = 0;
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
|
if (comparator_.comparator.user_comparator()->Equal(
|
|
Slice(key_ptr, key_length - 8), lkey.user_key())) {
|
|
// Correct user key
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
|
ValueType type;
|
|
SequenceNumber existing_seq;
|
|
UnPackSequenceAndType(tag, &existing_seq, &type);
|
|
assert(existing_seq != seq);
|
|
if (type == value_type) {
|
|
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
|
uint32_t new_size = static_cast<uint32_t>(value.size());
|
|
|
|
// Update value, if new value size <= previous value size
|
|
if (new_size <= prev_size) {
|
|
char* p =
|
|
EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
|
|
WriteLock wl(GetLock(lkey.user_key()));
|
|
memcpy(p, value.data(), value.size());
|
|
assert((unsigned)((p + value.size()) - entry) ==
|
|
(unsigned)(VarintLength(key_length) + key_length +
|
|
VarintLength(value.size()) + value.size()));
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
|
|
if (kv_prot_info != nullptr) {
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
|
// `seq` is swallowed and `existing_seq` prevails.
|
|
updated_kv_prot_info.UpdateS(seq, existing_seq);
|
|
UpdateEntryChecksum(&updated_kv_prot_info, key, value, type,
|
|
existing_seq, p + value.size());
|
|
Slice encoded(entry, p + value.size() - entry);
|
|
return VerifyEncodedEntry(encoded, updated_kv_prot_info);
|
|
} else {
|
|
UpdateEntryChecksum(nullptr, key, value, type, existing_seq,
|
|
p + value.size());
|
|
}
|
|
return Status::OK();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// The latest value is not value_type or key doesn't exist
|
|
return Add(seq, value_type, key, value, kv_prot_info);
|
|
}
|
|
|
|
Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
|
|
const Slice& delta,
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
|
LookupKey lkey(key, seq);
|
|
Slice memkey = lkey.memtable_key();
|
|
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
|
table_->GetDynamicPrefixIterator());
|
|
iter->Seek(lkey.internal_key(), memkey.data());
|
|
|
|
if (iter->Valid()) {
|
|
// Refer to comments under MemTable::Add() for entry format.
|
|
// Check that it belongs to same user key. We do not check the
|
|
// sequence number since the Seek() call above should have skipped
|
|
// all entries with overly large sequence numbers.
|
|
const char* entry = iter->key();
|
|
uint32_t key_length = 0;
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
|
if (comparator_.comparator.user_comparator()->Equal(
|
|
Slice(key_ptr, key_length - 8), lkey.user_key())) {
|
|
// Correct user key
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
|
ValueType type;
|
|
uint64_t existing_seq;
|
|
UnPackSequenceAndType(tag, &existing_seq, &type);
|
|
if (type == kTypeValue) {
|
|
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
|
|
|
char* prev_buffer = const_cast<char*>(prev_value.data());
|
|
uint32_t new_prev_size = prev_size;
|
|
|
|
std::string str_value;
|
|
WriteLock wl(GetLock(lkey.user_key()));
|
|
auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
|
|
delta, &str_value);
|
|
if (status == UpdateStatus::UPDATED_INPLACE) {
|
|
// Value already updated by callback.
|
|
assert(new_prev_size <= prev_size);
|
|
if (new_prev_size < prev_size) {
|
|
// overwrite the new prev_size
|
|
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
|
|
new_prev_size);
|
|
if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
|
|
// shift the value buffer as well.
|
|
memcpy(p, prev_buffer, new_prev_size);
|
|
prev_buffer = p;
|
|
}
|
|
}
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
|
|
UpdateFlushState();
|
|
Slice new_value(prev_buffer, new_prev_size);
|
|
if (kv_prot_info != nullptr) {
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
|
// `seq` is swallowed and `existing_seq` prevails.
|
|
updated_kv_prot_info.UpdateS(seq, existing_seq);
|
|
updated_kv_prot_info.UpdateV(delta, new_value);
|
|
Slice encoded(entry, prev_buffer + new_prev_size - entry);
|
|
UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type,
|
|
existing_seq, prev_buffer + new_prev_size);
|
|
return VerifyEncodedEntry(encoded, updated_kv_prot_info);
|
|
} else {
|
|
UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq,
|
|
prev_buffer + new_prev_size);
|
|
}
|
|
return Status::OK();
|
|
} else if (status == UpdateStatus::UPDATED) {
|
|
Status s;
|
|
if (kv_prot_info != nullptr) {
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
|
updated_kv_prot_info.UpdateV(delta, str_value);
|
|
s = Add(seq, kTypeValue, key, Slice(str_value),
|
|
&updated_kv_prot_info);
|
|
} else {
|
|
s = Add(seq, kTypeValue, key, Slice(str_value),
|
|
nullptr /* kv_prot_info */);
|
|
}
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
|
|
UpdateFlushState();
|
|
return s;
|
|
} else if (status == UpdateStatus::UPDATE_FAILED) {
|
|
// `UPDATE_FAILED` is named incorrectly. It indicates no update
|
|
// happened. It does not indicate a failure happened.
|
|
UpdateFlushState();
|
|
return Status::OK();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// The latest value is not `kTypeValue` or key doesn't exist
|
|
return Status::NotFound();
|
|
}
|
|
|
|
size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
|
|
Slice memkey = key.memtable_key();
|
|
|
|
// A total ordered iterator is costly for some memtablerep (prefix aware
|
|
// reps). By passing in the user key, we allow efficient iterator creation.
|
|
// The iterator only needs to be ordered within the same user key.
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
|
table_->GetDynamicPrefixIterator());
|
|
iter->Seek(key.internal_key(), memkey.data());
|
|
|
|
size_t num_successive_merges = 0;
|
|
|
|
for (; iter->Valid(); iter->Next()) {
|
|
const char* entry = iter->key();
|
|
uint32_t key_length = 0;
|
|
const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
|
if (!comparator_.comparator.user_comparator()->Equal(
|
|
Slice(iter_key_ptr, key_length - 8), key.user_key())) {
|
|
break;
|
|
}
|
|
|
|
const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
|
|
ValueType type;
|
|
uint64_t unused;
|
|
UnPackSequenceAndType(tag, &unused, &type);
|
|
if (type != kTypeMerge) {
|
|
break;
|
|
}
|
|
|
|
++num_successive_merges;
|
|
}
|
|
|
|
return num_successive_merges;
|
|
}
|
|
|
|
void MemTableRep::Get(const LookupKey& k, void* callback_args,
|
|
bool (*callback_func)(void* arg, const char* entry)) {
|
|
auto iter = GetDynamicPrefixIterator();
|
|
for (iter->Seek(k.internal_key(), k.memtable_key().data());
|
|
iter->Valid() && callback_func(callback_args, iter->key());
|
|
iter->Next()) {
|
|
}
|
|
}
|
|
|
|
void MemTable::RefLogContainingPrepSection(uint64_t log) {
|
|
assert(log > 0);
|
|
auto cur = min_prep_log_referenced_.load();
|
|
while ((log < cur || cur == 0) &&
|
|
!min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
|
|
cur = min_prep_log_referenced_.load();
|
|
}
|
|
}
|
|
|
|
uint64_t MemTable::GetMinLogContainingPrepSection() {
|
|
return min_prep_log_referenced_.load();
|
|
}
|
|
|
|
void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) {
|
|
if (ts_sz_ == 0 || persist_user_defined_timestamps_) {
|
|
return;
|
|
}
|
|
const Comparator* ucmp = GetInternalKeyComparator().user_comparator();
|
|
Slice udt = ExtractTimestampFromUserKey(user_key, ts_sz_);
|
|
if (newest_udt_.empty() || ucmp->CompareTimestamp(udt, newest_udt_) > 0) {
|
|
newest_udt_ = udt;
|
|
}
|
|
}
|
|
|
|
const Slice& MemTable::GetNewestUDT() const {
|
|
// This path should not be invoked for MemTables that does not enable the UDT
|
|
// in Memtable only feature.
|
|
assert(ts_sz_ > 0 && !persist_user_defined_timestamps_);
|
|
return newest_udt_;
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|