rocksdb/db/db_kv_checksum_test.cc
Andrew Kryczka 933ac0e05c Fix locking for ColumnFamilyOptions::inplace_update_support (#12624)
Summary:
In `SaveValue()`, the read lock needs to be obtained before `VerifyEntryChecksum()` because the KV checksum verification reads the entire value metadata+data, which is all mutable when `ColumnFamilyOptions::inplace_update_support == true`.

In `MemTable::Update()`, the write lock needs to be obtained before mutating the value metadata (changing the value size) because it can be read concurrently.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12624

Test Plan:
```
$ make COMPILE_WITH_TSAN=1 -j56 db_stress
...
$ python3 tools/db_crashtest.py blackbox --simple --max_key=10 --inplace_update_support=1 --interval=10 --allow_concurrent_memtable_write=0
```

Reviewed By: cbi42

Differential Revision: D57034571

Pulled By: ajkr

fbshipit-source-id: 3dddf881ad87923143acdf6bfec12ce47bb13a48
2024-05-08 08:30:12 -07:00

884 lines
36 KiB
C++

// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/blob/blob_index.h"
#include "db/db_test_util.h"
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
enum class WriteBatchOpType {
kPut = 0,
kDelete,
kSingleDelete,
kMerge,
kPutEntity,
kDeleteRange,
kNum,
};
// Integer addition is needed for `::testing::Range()` to take the enum type.
WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
using T = std::underlying_type<WriteBatchOpType>::type;
return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
}
enum class WriteMode {
// `Write()` a `WriteBatch` constructed with `protection_bytes_per_key = 0`
// and `WriteOptions::protection_bytes_per_key = 0`
kWriteUnprotectedBatch = 0,
// `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
kWriteProtectedBatch,
// `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
// Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
kWriteOptionProtectedBatch,
// TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
kNum,
};
// Integer addition is needed for `::testing::Range()` to take the enum type.
WriteMode operator+(WriteMode lhs, const int rhs) {
using T = std::underlying_type<WriteMode>::type;
return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
}
std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
size_t protection_bytes_per_key,
WriteBatchOpType op_type) {
Status s;
WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
protection_bytes_per_key, 0 /* default_cf_ts_sz */);
switch (op_type) {
case WriteBatchOpType::kPut:
s = wb.Put(cf_handle, "key", "val");
break;
case WriteBatchOpType::kDelete:
s = wb.Delete(cf_handle, "key");
break;
case WriteBatchOpType::kSingleDelete:
s = wb.SingleDelete(cf_handle, "key");
break;
case WriteBatchOpType::kDeleteRange:
s = wb.DeleteRange(cf_handle, "begin", "end");
break;
case WriteBatchOpType::kMerge:
s = wb.Merge(cf_handle, "key", "val");
break;
case WriteBatchOpType::kPutEntity:
s = wb.PutEntity(cf_handle, "key",
{{"attr_name1", "foo"}, {"attr_name2", "bar"}});
break;
case WriteBatchOpType::kNum:
assert(false);
}
return {std::move(wb), std::move(s)};
}
class DbKvChecksumTestBase : public DBTestBase {
public:
DbKvChecksumTestBase(const std::string& path, bool env_do_fsync)
: DBTestBase(path, env_do_fsync) {}
ColumnFamilyHandle* GetCFHandleToUse(ColumnFamilyHandle* column_family,
WriteBatchOpType op_type) const {
// Note: PutEntity cannot be called without column family
if (op_type == WriteBatchOpType::kPutEntity && !column_family) {
return db_->DefaultColumnFamily();
}
return column_family;
}
};
class DbKvChecksumTest
: public DbKvChecksumTestBase,
public ::testing::WithParamInterface<
std::tuple<WriteBatchOpType, char, WriteMode,
uint32_t /* memtable_protection_bytes_per_key */>> {
public:
DbKvChecksumTest()
: DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
op_type_ = std::get<0>(GetParam());
corrupt_byte_addend_ = std::get<1>(GetParam());
write_mode_ = std::get<2>(GetParam());
memtable_protection_bytes_per_key_ = std::get<3>(GetParam());
}
Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
switch (write_mode_) {
case WriteMode::kWriteUnprotectedBatch: {
auto batch_and_status =
GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
0 /* protection_bytes_per_key */, op_type_);
assert(batch_and_status.second.ok());
// Default write option has protection_bytes_per_key = 0
return db_->Write(WriteOptions(), &batch_and_status.first);
}
case WriteMode::kWriteProtectedBatch: {
auto batch_and_status =
GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
8 /* protection_bytes_per_key */, op_type_);
assert(batch_and_status.second.ok());
return db_->Write(WriteOptions(), &batch_and_status.first);
}
case WriteMode::kWriteOptionProtectedBatch: {
auto batch_and_status =
GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
0 /* protection_bytes_per_key */, op_type_);
assert(batch_and_status.second.ok());
WriteOptions write_opts;
write_opts.protection_bytes_per_key = 8;
return db_->Write(write_opts, &batch_and_status.first);
}
case WriteMode::kNum:
assert(false);
}
return Status::NotSupported("WriteMode " +
std::to_string(static_cast<int>(write_mode_)));
}
void CorruptNextByteCallBack(void* arg) {
Slice encoded = *static_cast<Slice*>(arg);
if (entry_len_ == std::numeric_limits<size_t>::max()) {
// We learn the entry size on the first attempt
entry_len_ = encoded.size();
}
char* buf = const_cast<char*>(encoded.data());
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_;
}
bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; }
protected:
WriteBatchOpType op_type_;
char corrupt_byte_addend_;
WriteMode write_mode_;
uint32_t memtable_protection_bytes_per_key_;
size_t corrupt_byte_offset_ = 0;
size_t entry_len_ = std::numeric_limits<size_t>::max();
};
std::string GetOpTypeString(const WriteBatchOpType& op_type) {
switch (op_type) {
case WriteBatchOpType::kPut:
return "Put";
case WriteBatchOpType::kDelete:
return "Delete";
case WriteBatchOpType::kSingleDelete:
return "SingleDelete";
case WriteBatchOpType::kDeleteRange:
return "DeleteRange";
case WriteBatchOpType::kMerge:
return "Merge";
case WriteBatchOpType::kPutEntity:
return "PutEntity";
case WriteBatchOpType::kNum:
assert(false);
}
assert(false);
return "";
}
std::string GetWriteModeString(const WriteMode& mode) {
switch (mode) {
case WriteMode::kWriteUnprotectedBatch:
return "WriteUnprotectedBatch";
case WriteMode::kWriteProtectedBatch:
return "WriteProtectedBatch";
case WriteMode::kWriteOptionProtectedBatch:
return "kWriteOptionProtectedBatch";
case WriteMode::kNum:
assert(false);
}
return "";
}
INSTANTIATE_TEST_CASE_P(
DbKvChecksumTest, DbKvChecksumTest,
::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
WriteBatchOpType::kNum),
::testing::Values(2, 103, 251),
::testing::Range(WriteMode::kWriteProtectedBatch,
WriteMode::kNum),
::testing::Values(0)),
[](const testing::TestParamInfo<
std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
std::ostringstream oss;
oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
<< static_cast<int>(
static_cast<unsigned char>(std::get<1>(args.param)))
<< GetWriteModeString(std::get<2>(args.param))
<< static_cast<uint32_t>(std::get<3>(args.param));
return oss.str();
});
// TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
// corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
// This test repeatedly attempts to write `WriteBatch`es containing a single
// entry of type `op_type_`. Each attempt has one byte corrupted in its
// memtable entry by adding `corrupt_byte_addend_` to its original value. The
// test repeats until an attempt has been made on each byte in the encoded
// memtable entry. All attempts are expected to fail with `Status::Corruption`
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:Encoded",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
while (MoreBytesToCorrupt()) {
// Failed memtable insert always leads to read-only mode, so we have to
// reopen for every attempt.
Options options = CurrentOptions();
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
Reopen(options);
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
SyncPoint::GetInstance()->DisableProcessing();
// In case the above callback is not invoked, this test will run
// numeric_limits<size_t>::max() times until it reports an error (or will
// exhaust disk space). Added this assert to report error early.
ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
}
}
TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
// This test repeatedly attempts to write `WriteBatch`es containing a single
// entry of type `op_type_` to a non-default column family. Each attempt has
// one byte corrupted in its memtable entry by adding `corrupt_byte_addend_`
// to its original value. The test repeats until an attempt has been made on
// each byte in the encoded memtable entry. All attempts are expected to fail
// with `Status::Corruption`.
Options options = CurrentOptions();
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
CreateAndReopenWithCF({"pikachu"}, options);
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:Encoded",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
while (MoreBytesToCorrupt()) {
// Failed memtable insert always leads to read-only mode, so we have to
// reopen for every attempt.
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
SyncPoint::GetInstance()->DisableProcessing();
// In case the above callback is not invoked, this test will run
// numeric_limits<size_t>::max() times until it reports an error (or will
// exhaust disk space). Added this assert to report error early.
ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
}
}
TEST_P(DbKvChecksumTest, NoCorruptionCase) {
// If this test fails, we may have found a piece of malfunctioned hardware
auto batch_and_status =
GetWriteBatch(GetCFHandleToUse(nullptr, op_type_),
8 /* protection_bytes_per_key */, op_type_);
ASSERT_OK(batch_and_status.second);
ASSERT_OK(batch_and_status.first.VerifyChecksum());
}
TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
// This test repeatedly attempts to write `WriteBatch`es containing a single
// entry of type `op_type_`. Each attempt has one byte corrupted by adding
// `corrupt_byte_addend_` to its original value. The test repeats until an
// attempt has been made on each byte in the encoded write batch. All attempts
// are expected to fail with `Status::Corruption`
Options options = CurrentOptions();
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::WriteToWAL:log_entry",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
// First 8 bytes are for sequence number which is not protected in write batch
corrupt_byte_offset_ = 8;
while (MoreBytesToCorrupt()) {
// Corrupted write batch leads to read-only mode, so we have to
// reopen for every attempt.
Reopen(options);
auto log_size_pre_write = dbfull()->TEST_total_log_size();
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
// Confirm that nothing was written to WAL
ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
SyncPoint::GetInstance()->DisableProcessing();
// In case the above callback is not invoked, this test will run
// numeric_limits<size_t>::max() times until it reports an error (or will
// exhaust disk space). Added this assert to report error early.
ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
}
}
TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
// This test repeatedly attempts to write `WriteBatch`es containing a single
// entry of type `op_type_`. Each attempt has one byte corrupted by adding
// `corrupt_byte_addend_` to its original value. The test repeats until an
// attempt has been made on each byte in the encoded write batch. All attempts
// are expected to fail with `Status::Corruption`
Options options = CurrentOptions();
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
CreateAndReopenWithCF({"pikachu"}, options);
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::WriteToWAL:log_entry",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
// First 8 bytes are for sequence number which is not protected in write batch
corrupt_byte_offset_ = 8;
while (MoreBytesToCorrupt()) {
// Corrupted write batch leads to read-only mode, so we have to
// reopen for every attempt.
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
auto log_size_pre_write = dbfull()->TEST_total_log_size();
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
// Confirm that nothing was written to WAL
ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
SyncPoint::GetInstance()->DisableProcessing();
// In case the above callback is not invoked, this test will run
// numeric_limits<size_t>::max() times until it reports an error (or will
// exhaust disk space). Added this assert to report error early.
ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
}
}
class DbKvChecksumTestMergedBatch
: public DbKvChecksumTestBase,
public ::testing::WithParamInterface<
std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
public:
DbKvChecksumTestMergedBatch()
: DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
op_type1_ = std::get<0>(GetParam());
op_type2_ = std::get<1>(GetParam());
corrupt_byte_addend_ = std::get<2>(GetParam());
}
protected:
WriteBatchOpType op_type1_;
WriteBatchOpType op_type2_;
char corrupt_byte_addend_;
};
void CorruptWriteBatch(Slice* content, size_t offset,
char corrupt_byte_addend) {
ASSERT_TRUE(offset < content->size());
char* buf = const_cast<char*>(content->data());
buf[offset] += corrupt_byte_addend;
}
TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
// Veirfy write batch checksum after write batch append
auto batch1 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
8 /* protection_bytes_per_key */, op_type1_);
ASSERT_OK(batch1.second);
auto batch2 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
8 /* protection_bytes_per_key */, op_type2_);
ASSERT_OK(batch2.second);
ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
ASSERT_OK(batch1.first.VerifyChecksum());
}
TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
// This test has two writers repeatedly attempt to write `WriteBatch`es
// containing a single entry of type op_type1_ and op_type2_ respectively. The
// leader of the write group writes the batch containinng the entry of type
// op_type1_. One byte of the pre-merged write batches is corrupted by adding
// `corrupt_byte_addend_` to the batch's original value during each attempt.
// The test repeats until an attempt has been made on each byte in both
// pre-merged write batches. All attempts are expected to fail with
// `Status::Corruption`.
Options options = CurrentOptions();
if (op_type1_ == WriteBatchOpType::kMerge ||
op_type2_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
auto leader_batch_and_status =
GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
8 /* protection_bytes_per_key */, op_type1_);
ASSERT_OK(leader_batch_and_status.second);
auto follower_batch_and_status =
GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
8 /* protection_bytes_per_key */, op_type2_);
size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
size_t total_bytes =
leader_batch_size + follower_batch_and_status.first.GetDataSize();
// First 8 bytes are for sequence number which is not protected in write batch
size_t corrupt_byte_offset = 8;
std::atomic<bool> follower_joined{false};
std::atomic<int> leader_count{0};
port::Thread follower_thread;
// This callback should only be called by the leader thread
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
auto* leader = static_cast<WriteThread::Writer*>(arg_leader);
ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
// This callback should only be called by the follower thread
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
auto* follower = static_cast<WriteThread::Writer*>(arg_follower);
// The leader thread will wait on this bool and hence wait until
// this writer joins the write group
ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
if (corrupt_byte_offset >= leader_batch_size) {
Slice batch_content = follower->batch->Data();
CorruptWriteBatch(&batch_content,
corrupt_byte_offset - leader_batch_size,
corrupt_byte_addend_);
}
// Leader busy waits on this flag
follower_joined = true;
// So the follower does not enter the outer callback at
// WriteThread::JoinBatchGroup:Wait2
SyncPoint::GetInstance()->DisableProcessing();
});
// Start the other writer thread which will join the write group as
// follower
follower_thread = port::Thread([&]() {
follower_batch_and_status =
GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
8 /* protection_bytes_per_key */, op_type2_);
ASSERT_OK(follower_batch_and_status.second);
ASSERT_TRUE(
db_->Write(WriteOptions(), &follower_batch_and_status.first)
.IsCorruption());
});
ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
if (corrupt_byte_offset < leader_batch_size) {
Slice batch_content = leader->batch->Data();
CorruptWriteBatch(&batch_content, corrupt_byte_offset,
corrupt_byte_addend_);
}
leader_count++;
while (!follower_joined) {
// busy waiting
}
});
while (corrupt_byte_offset < total_bytes) {
// Reopen DB since it failed WAL write which lead to read-only mode
Reopen(options);
SyncPoint::GetInstance()->EnableProcessing();
auto log_size_pre_write = dbfull()->TEST_total_log_size();
leader_batch_and_status =
GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
8 /* protection_bytes_per_key */, op_type1_);
ASSERT_OK(leader_batch_and_status.second);
ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
.IsCorruption());
follower_thread.join();
// Prevent leader thread from entering this callback
SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
ASSERT_EQ(1, leader_count);
// Nothing should have been written to WAL
ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
corrupt_byte_offset++;
if (corrupt_byte_offset == leader_batch_size) {
// skip over the sequence number part of follower's write batch
corrupt_byte_offset += 8;
}
follower_joined = false;
leader_count = 0;
}
SyncPoint::GetInstance()->DisableProcessing();
}
TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
// This test has two writers repeatedly attempt to write `WriteBatch`es
// containing a single entry of type op_type1_ and op_type2_ respectively. The
// leader of the write group writes the batch containinng the entry of type
// op_type1_. One byte of the pre-merged write batches is corrupted by adding
// `corrupt_byte_addend_` to the batch's original value during each attempt.
// The test repeats until an attempt has been made on each byte in both
// pre-merged write batches. All attempts are expected to fail with
// `Status::Corruption`.
Options options = CurrentOptions();
if (op_type1_ == WriteBatchOpType::kMerge ||
op_type2_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
CreateAndReopenWithCF({"ramen"}, options);
auto leader_batch_and_status =
GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
8 /* protection_bytes_per_key */, op_type1_);
ASSERT_OK(leader_batch_and_status.second);
auto follower_batch_and_status =
GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
8 /* protection_bytes_per_key */, op_type2_);
size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
size_t total_bytes =
leader_batch_size + follower_batch_and_status.first.GetDataSize();
// First 8 bytes are for sequence number which is not protected in write batch
size_t corrupt_byte_offset = 8;
std::atomic<bool> follower_joined{false};
std::atomic<int> leader_count{0};
port::Thread follower_thread;
// This callback should only be called by the leader thread
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
auto* leader = static_cast<WriteThread::Writer*>(arg_leader);
ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
// This callback should only be called by the follower thread
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
auto* follower = static_cast<WriteThread::Writer*>(arg_follower);
// The leader thread will wait on this bool and hence wait until
// this writer joins the write group
ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
if (corrupt_byte_offset >= leader_batch_size) {
Slice batch_content =
WriteBatchInternal::Contents(follower->batch);
CorruptWriteBatch(&batch_content,
corrupt_byte_offset - leader_batch_size,
corrupt_byte_addend_);
}
follower_joined = true;
// So the follower does not enter the outer callback at
// WriteThread::JoinBatchGroup:Wait2
SyncPoint::GetInstance()->DisableProcessing();
});
// Start the other writer thread which will join the write group as
// follower
follower_thread = port::Thread([&]() {
follower_batch_and_status =
GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
8 /* protection_bytes_per_key */, op_type2_);
ASSERT_OK(follower_batch_and_status.second);
ASSERT_TRUE(
db_->Write(WriteOptions(), &follower_batch_and_status.first)
.IsCorruption());
});
ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
if (corrupt_byte_offset < leader_batch_size) {
Slice batch_content = WriteBatchInternal::Contents(leader->batch);
CorruptWriteBatch(&batch_content, corrupt_byte_offset,
corrupt_byte_addend_);
}
leader_count++;
while (!follower_joined) {
// busy waiting
}
});
SyncPoint::GetInstance()->EnableProcessing();
while (corrupt_byte_offset < total_bytes) {
// Reopen DB since it failed WAL write which lead to read-only mode
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
SyncPoint::GetInstance()->EnableProcessing();
auto log_size_pre_write = dbfull()->TEST_total_log_size();
leader_batch_and_status =
GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
8 /* protection_bytes_per_key */, op_type1_);
ASSERT_OK(leader_batch_and_status.second);
ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
.IsCorruption());
follower_thread.join();
// Prevent leader thread from entering this callback
SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
ASSERT_EQ(1, leader_count);
// Nothing should have been written to WAL
ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
corrupt_byte_offset++;
if (corrupt_byte_offset == leader_batch_size) {
// skip over the sequence number part of follower's write batch
corrupt_byte_offset += 8;
}
follower_joined = false;
leader_count = 0;
}
SyncPoint::GetInstance()->DisableProcessing();
}
INSTANTIATE_TEST_CASE_P(
DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
WriteBatchOpType::kNum),
::testing::Range(static_cast<WriteBatchOpType>(0),
WriteBatchOpType::kNum),
::testing::Values(2, 103, 251)),
[](const testing::TestParamInfo<
std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
std::ostringstream oss;
oss << GetOpTypeString(std::get<0>(args.param))
<< GetOpTypeString(std::get<1>(args.param)) << "Add"
<< static_cast<int>(
static_cast<unsigned char>(std::get<2>(args.param)));
return oss.str();
});
// TODO: add test for transactions
// TODO: add test for corrupted write batch with WAL disabled
class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
public:
DbKVChecksumWALToWriteBatchTest()
: DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
};
TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
Options options = CurrentOptions();
Reopen(options);
ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
std::string content;
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
[&](void* batch_ptr) {
WriteBatch* batch = static_cast<WriteBatch*>(batch_ptr);
content.assign(batch->Data().data(), batch->GetDataSize());
Slice batch_content = batch->Data();
// Corrupt first bit
CorruptWriteBatch(&batch_content, 0, 1);
});
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
[&](void* checksum_ptr) {
// Verify that checksum is produced on the batch content
uint64_t checksum = *static_cast<uint64_t*>(checksum_ptr);
ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_TRUE(TryReopen(options).IsCorruption());
SyncPoint::GetInstance()->DisableProcessing();
};
// TODO (cbi): add DeleteRange coverage once it is implemented
class DbMemtableKVChecksumTest : public DbKvChecksumTest {
public:
DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
protected:
// Indices in the memtable entry that we will not corrupt.
// For memtable entry format, see comments in MemTable::Add().
// We do not corrupt key length and value length fields in this test
// case since it causes segfault and ASAN will complain.
// For this test case, key and value are all of length 3, so
// key length field is at index 0 and value length field is at index 12.
const std::set<size_t> index_not_to_corrupt{0, 12};
void SkipNotToCorruptEntry() {
if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
index_not_to_corrupt.end()) {
corrupt_byte_offset_++;
}
}
};
INSTANTIATE_TEST_CASE_P(
DbMemtableKVChecksumTest, DbMemtableKVChecksumTest,
::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
WriteBatchOpType::kDeleteRange),
::testing::Values(2, 103, 251),
::testing::Range(static_cast<WriteMode>(0),
WriteMode::kWriteOptionProtectedBatch),
// skip 1 byte checksum as it makes test flaky
::testing::Values(2, 4, 8)),
[](const testing::TestParamInfo<
std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
std::ostringstream oss;
oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
<< static_cast<int>(
static_cast<unsigned char>(std::get<1>(args.param)))
<< GetWriteModeString(std::get<2>(args.param))
<< static_cast<uint32_t>(std::get<3>(args.param));
return oss.str();
});
TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
// Record memtable entry size.
// Not corrupting memtable entry here since it will segfault
// or fail some asserts inside memtablerep implementation
// e.g., when key_len is corrupted.
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
Slice encoded = *static_cast<Slice*>(arg);
entry_len_ = encoded.size();
});
SyncPoint::GetInstance()->SetCallBack(
"Memtable::SaveValue:Found:entry", [&](void* entry) {
char* buf = *static_cast<char**>(entry);
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_;
});
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
memtable_protection_bytes_per_key_;
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr));
std::string val;
ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
Destroy(options);
SkipNotToCorruptEntry();
}
}
TEST_P(DbMemtableKVChecksumTest,
GetWithColumnFamilyCorruptAfterMemtableInsert) {
// Record memtable entry size.
// Not corrupting memtable entry here since it will segfault
// or fail some asserts inside memtablerep implementation
// e.g., when key_len is corrupted.
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
Slice encoded = *static_cast<Slice*>(arg);
entry_len_ = encoded.size();
});
SyncPoint::GetInstance()->SetCallBack(
"Memtable::SaveValue:Found:entry", [&](void* entry) {
char* buf = *static_cast<char**>(entry);
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_;
});
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
memtable_protection_bytes_per_key_;
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
CreateAndReopenWithCF({"pikachu"}, options);
ASSERT_OK(ExecuteWrite(handles_[1]));
std::string val;
ASSERT_TRUE(
db_->Get(ReadOptions(), handles_[1], "key", &val).IsCorruption());
Destroy(options);
SkipNotToCorruptEntry();
}
}
TEST_P(DbMemtableKVChecksumTest, IteratorWithCorruptAfterMemtableInsert) {
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
memtable_protection_bytes_per_key_;
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr));
Iterator* it = db_->NewIterator(ReadOptions());
it->SeekToFirst();
ASSERT_FALSE(it->Valid());
ASSERT_TRUE(it->status().IsCorruption());
delete it;
Destroy(options);
SkipNotToCorruptEntry();
}
}
TEST_P(DbMemtableKVChecksumTest,
IteratorWithColumnFamilyCorruptAfterMemtableInsert) {
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
memtable_protection_bytes_per_key_;
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
CreateAndReopenWithCF({"pikachu"}, options);
ASSERT_OK(ExecuteWrite(handles_[1]));
Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
it->SeekToFirst();
ASSERT_FALSE(it->Valid());
ASSERT_TRUE(it->status().IsCorruption());
delete it;
Destroy(options);
SkipNotToCorruptEntry();
}
}
TEST_P(DbMemtableKVChecksumTest, FlushWithCorruptAfterMemtableInsert) {
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded",
std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
std::placeholders::_1));
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
memtable_protection_bytes_per_key_;
if (op_type_ == WriteBatchOpType::kMerge) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
SkipNotToCorruptEntry();
// Not corruping each byte like other tests since Flush() is relatively slow.
Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr));
ASSERT_TRUE(Flush().IsCorruption());
// DB enters read-only state when flush reads corrupted data
ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
Destroy(options);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}