rocksdb/db/write_batch_internal.h
Changyu Bi 0ff7713112 Handoff checksum during WAL replay (#10212)
Summary:
Added checksum protection for write batch content read from WAL to when per key-value checksum is computed on the write batch. This gives full coverage on write batch integrity of WAL replay to memtable.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10212

Test Plan:
- Added unit test and the existing tests (replay code path covers the change in this PR): `make -j32 check`
- Stress test: ran `db_stress` for 30min.
- Perf regression:
```
# setup
TEST_TMPDIR=/dev/shm/100MB_WAL_DB/ ./db_bench -benchmarks=fillrandom -write_buffer_size=1048576000
# benchmark db open time
TEST_TMPDIR=/dev/shm/100MB_WAL_DB/ /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=overwrite -write_buffer_size=1048576000 -writes=1 -report_open_timing=true

For 20 runs, pre-PR avg: 3734.31ms, post-PR avg: 3790.06 ms (~1.5% regression).

Pre-PR
OpenDb:     3714.36 milliseconds
OpenDb:     3622.71 milliseconds
OpenDb:     3591.17 milliseconds
OpenDb:     3674.7 milliseconds
OpenDb:     3615.79 milliseconds
OpenDb:     3982.83 milliseconds
OpenDb:     3650.6 milliseconds
OpenDb:     3809.26 milliseconds
OpenDb:     3576.44 milliseconds
OpenDb:     3638.12 milliseconds
OpenDb:     3845.68 milliseconds
OpenDb:     3677.32 milliseconds
OpenDb:     3659.64 milliseconds
OpenDb:     3837.55 milliseconds
OpenDb:     3899.64 milliseconds
OpenDb:     3840.72 milliseconds
OpenDb:     3802.71 milliseconds
OpenDb:     3573.27 milliseconds
OpenDb:     3895.76 milliseconds
OpenDb:     3778.02 milliseconds

Post-PR:
OpenDb:     3880.46 milliseconds
OpenDb:     3709.02 milliseconds
OpenDb:     3954.67 milliseconds
OpenDb:     3955.64 milliseconds
OpenDb:     3958.64 milliseconds
OpenDb:     3631.28 milliseconds
OpenDb:     3721 milliseconds
OpenDb:     3729.89 milliseconds
OpenDb:     3730.55 milliseconds
OpenDb:     3966.32 milliseconds
OpenDb:     3685.54 milliseconds
OpenDb:     3573.17 milliseconds
OpenDb:     3703.75 milliseconds
OpenDb:     3873.62 milliseconds
OpenDb:     3704.4 milliseconds
OpenDb:     3820.98 milliseconds
OpenDb:     3721.62 milliseconds
OpenDb:     3770.86 milliseconds
OpenDb:     3949.78 milliseconds
OpenDb:     3760.07 milliseconds
```

Reviewed By: ajkr

Differential Revision: D37302092

Pulled By: cbi42

fbshipit-source-id: 7346e625f453ce4c0e5d708776cd1fb2af6b068b
2022-07-05 15:44:35 -07:00

396 lines
14 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <array>
#include <vector>
#include "db/flush_scheduler.h"
#include "db/kv_checksum.h"
#include "db/trim_history_scheduler.h"
#include "db/write_thread.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include "util/autovector.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
class MemTable;
class FlushScheduler;
class ColumnFamilyData;
class ColumnFamilyMemTables {
public:
virtual ~ColumnFamilyMemTables() {}
virtual bool Seek(uint32_t column_family_id) = 0;
// returns true if the update to memtable should be ignored
// (useful when recovering from log whose updates have already
// been processed)
virtual uint64_t GetLogNumber() const = 0;
virtual MemTable* GetMemTable() const = 0;
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
virtual ColumnFamilyData* current() { return nullptr; }
};
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
public:
explicit ColumnFamilyMemTablesDefault(MemTable* mem)
: ok_(false), mem_(mem) {}
bool Seek(uint32_t column_family_id) override {
ok_ = (column_family_id == 0);
return ok_;
}
uint64_t GetLogNumber() const override { return 0; }
MemTable* GetMemTable() const override {
assert(ok_);
return mem_;
}
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
private:
bool ok_;
MemTable* mem_;
};
struct WriteBatch::ProtectionInfo {
// `WriteBatch` usually doesn't contain a huge number of keys so protecting
// with a fixed, non-configurable eight bytes per key may work well enough.
autovector<ProtectionInfoKVOC64> entries_;
size_t GetBytesPerKey() const { return 8; }
};
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
public:
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static constexpr size_t kHeader = 12;
// WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
static Status Put(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static Status Put(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key, const SliceParts& value);
static Status PutEntity(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const WideColumns& columns);
static Status Delete(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key);
static Status Delete(WriteBatch* batch, uint32_t column_family_id,
const Slice& key);
static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key);
static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
const Slice& key);
static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
const Slice& begin_key, const Slice& end_key);
static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
const SliceParts& begin_key,
const SliceParts& end_key);
static Status Merge(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static Status Merge(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key, const SliceParts& value);
static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
const bool write_after_commit = true,
const bool unprepared_batch = false);
static Status MarkRollback(WriteBatch* batch, const Slice& xid);
static Status MarkCommit(WriteBatch* batch, const Slice& xid);
static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid,
const Slice& commit_ts);
static Status InsertNoop(WriteBatch* batch);
// Return the number of entries in the batch.
static uint32_t Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, uint32_t n);
// Return the sequence number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the sequence number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
// Returns the offset of the first entry in the batch.
// This offset is only valid if the batch is not empty.
static size_t GetFirstOffset(WriteBatch* batch);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static Status SetContents(WriteBatch* batch, const Slice& contents);
static Status CheckSlicePartsLength(const SliceParts& key,
const SliceParts& value);
// Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
//
// If ignore_missing_column_families == true. WriteBatch
// referencing non-existing column family will be ignored.
// If ignore_missing_column_families == false, processing of the
// batches will be stopped if a reference is found to a non-existing
// column family and InvalidArgument() will be returned. The writes
// in batches may be only partially applied at that point.
//
// If log_number is non-zero, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number.
//
// If flush_scheduler is non-null, it will be invoked if the memtable
// should be flushed.
//
// Under concurrent use, the caller is responsible for making sure that
// the memtables object itself is thread-local.
static Status InsertInto(
WriteThread::WriteGroup& write_group, SequenceNumber sequence,
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
TrimHistoryScheduler* trim_history_scheduler,
bool ignore_missing_column_families = false, uint64_t log_number = 0,
DB* db = nullptr, bool concurrent_memtable_writes = false,
bool seq_per_batch = false, bool batch_per_txn = true);
// Convenience form of InsertInto when you have only one batch
// next_seq returns the seq after last sequence number used in MemTable insert
static Status InsertInto(
const WriteBatch* batch, ColumnFamilyMemTables* memtables,
FlushScheduler* flush_scheduler,
TrimHistoryScheduler* trim_history_scheduler,
bool ignore_missing_column_families = false, uint64_t log_number = 0,
DB* db = nullptr, bool concurrent_memtable_writes = false,
SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
bool seq_per_batch = false, bool batch_per_txn = true);
static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
ColumnFamilyMemTables* memtables,
FlushScheduler* flush_scheduler,
TrimHistoryScheduler* trim_history_scheduler,
bool ignore_missing_column_families = false,
uint64_t log_number = 0, DB* db = nullptr,
bool concurrent_memtable_writes = false,
bool seq_per_batch = false, size_t batch_cnt = 0,
bool batch_per_txn = true,
bool hint_per_batch = false);
// Appends src write batch to dst write batch and updates count in dst
// write batch. Returns OK if the append is successful. Checks number of
// checksum against count in dst and src write batches, and returns Corruption
// if the count is inconsistent.
static Status Append(WriteBatch* dst, const WriteBatch* src,
const bool WAL_only = false);
// Returns the byte size of appending a WriteBatch with ByteSize
// leftByteSize and a WriteBatch with ByteSize rightByteSize
static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
// Iterate over [begin, end) range of a write batch
static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
size_t begin, size_t end);
// This write batch includes the latest state that should be persisted. Such
// state meant to be used only during recovery.
static void SetAsLatestPersistentState(WriteBatch* b);
static bool IsLatestPersistentState(const WriteBatch* b);
static std::tuple<Status, uint32_t, size_t> GetColumnFamilyIdAndTimestampSize(
WriteBatch* b, ColumnFamilyHandle* column_family);
static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
return wb.needs_in_place_update_ts_;
}
static bool HasKeyWithTimestamp(const WriteBatch& wb) {
return wb.has_key_with_ts_;
}
// Update per-key value protection information on this write batch.
// If checksum is provided, the batch content is verfied against the checksum.
static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
uint64_t* checksum = nullptr);
};
// LocalSavePoint is similar to a scope guard
class LocalSavePoint {
public:
explicit LocalSavePoint(WriteBatch* batch)
: batch_(batch),
savepoint_(batch->GetDataSize(), batch->Count(),
batch->content_flags_.load(std::memory_order_relaxed))
#ifndef NDEBUG
,
committed_(false)
#endif
{
}
#ifndef NDEBUG
~LocalSavePoint() { assert(committed_); }
#endif
Status commit() {
#ifndef NDEBUG
committed_ = true;
#endif
if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
batch_->rep_.resize(savepoint_.size);
WriteBatchInternal::SetCount(batch_, savepoint_.count);
if (batch_->prot_info_ != nullptr) {
batch_->prot_info_->entries_.resize(savepoint_.count);
}
batch_->content_flags_.store(savepoint_.content_flags,
std::memory_order_relaxed);
return Status::MemoryLimit();
}
return Status::OK();
}
private:
WriteBatch* batch_;
SavePoint savepoint_;
#ifndef NDEBUG
bool committed_;
#endif
};
template <typename TimestampSizeFuncType>
class TimestampUpdater : public WriteBatch::Handler {
public:
explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
: prot_info_(prot_info),
ts_sz_func_(std::move(ts_sz_func)),
timestamp_(ts) {
assert(!timestamp_.empty());
}
~TimestampUpdater() override {}
Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
return UpdateTimestamp(cf, key);
}
Status DeleteCF(uint32_t cf, const Slice& key) override {
return UpdateTimestamp(cf, key);
}
Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
return UpdateTimestamp(cf, key);
}
Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
const Slice&) override {
return UpdateTimestamp(cf, begin_key);
}
Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
return UpdateTimestamp(cf, key);
}
Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
return UpdateTimestamp(cf, key);
}
Status MarkBeginPrepare(bool) override { return Status::OK(); }
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
Status MarkCommit(const Slice&) override { return Status::OK(); }
Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
return Status::OK();
}
Status MarkRollback(const Slice&) override { return Status::OK(); }
Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
private:
Status UpdateTimestamp(uint32_t cf, const Slice& key) {
Status s = UpdateTimestampImpl(cf, key, idx_);
++idx_;
return s;
}
Status UpdateTimestampImpl(uint32_t cf, const Slice& key, size_t /*idx*/) {
if (timestamp_.empty()) {
return Status::InvalidArgument("Timestamp is empty");
}
size_t cf_ts_sz = ts_sz_func_(cf);
if (0 == cf_ts_sz) {
// Skip this column family.
return Status::OK();
} else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
// Column family timestamp info not found.
return Status::NotFound();
} else if (cf_ts_sz != timestamp_.size()) {
return Status::InvalidArgument("timestamp size mismatch");
}
UpdateProtectionInformationIfNeeded(key, timestamp_);
char* ptr = const_cast<char*>(key.data() + key.size() - cf_ts_sz);
assert(ptr);
memcpy(ptr, timestamp_.data(), timestamp_.size());
return Status::OK();
}
void UpdateProtectionInformationIfNeeded(const Slice& key, const Slice& ts) {
if (prot_info_ != nullptr) {
const size_t ts_sz = ts.size();
SliceParts old_key(&key, 1);
Slice key_no_ts(key.data(), key.size() - ts_sz);
std::array<Slice, 2> new_key_cmpts{{key_no_ts, ts}};
SliceParts new_key(new_key_cmpts.data(), 2);
prot_info_->entries_[idx_].UpdateK(old_key, new_key);
}
}
// No copy or move.
TimestampUpdater(const TimestampUpdater&) = delete;
TimestampUpdater(TimestampUpdater&&) = delete;
TimestampUpdater& operator=(const TimestampUpdater&) = delete;
TimestampUpdater& operator=(TimestampUpdater&&) = delete;
WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
const TimestampSizeFuncType ts_sz_func_{};
const Slice timestamp_;
size_t idx_ = 0;
};
} // namespace ROCKSDB_NAMESPACE