rocksdb/db/write_callback_test.cc
anand76 97991960e9 Retry DB::Open upon a corruption detected while reading the MANIFEST (#12518)
Summary:
This PR is a counterpart of https://github.com/facebook/rocksdb/issues/12427 . On file systems that support storage level data checksum and reconstruction, retry opening the DB if a corruption is detected when reading the MANIFEST. This could be done in `log::Reader`, but its a little complicated since the sequential file would have to be reopened in order to re-read the same data, and we may miss some subtle corruptions that don't result in checksum mismatch. The approach chosen here instead is to make the decision to retry in `DBImpl::Recover`, based on either an explicit corruption in the MANIFEST file, or missing SST files due to bad data in the MANIFEST.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12518

Reviewed By: ajkr

Differential Revision: D55932155

Pulled By: anand1976

fbshipit-source-id: 51755a29b3eb14b9d8e98534adb2e7d54b12ced9
2024-04-18 17:36:33 -07:00

455 lines
14 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/write_callback.h"
#include <atomic>
#include <functional>
#include <string>
#include <utility>
#include <vector>
#include "db/db_impl/db_impl.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"
#include "util/random.h"
using std::string;
namespace ROCKSDB_NAMESPACE {
class WriteCallbackTest : public testing::Test {
public:
string dbname;
WriteCallbackTest() {
dbname = test::PerThreadDBPath("write_callback_testdb");
}
};
class WriteCallbackTestWriteCallback1 : public WriteCallback {
public:
bool was_called = false;
Status Callback(DB* db) override {
was_called = true;
// Make sure db is a DBImpl
DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
if (db_impl == nullptr) {
return Status::InvalidArgument("");
}
return Status::OK();
}
bool AllowWriteBatching() override { return true; }
};
class WriteCallbackTestWriteCallback2 : public WriteCallback {
public:
Status Callback(DB* /*db*/) override { return Status::Busy(); }
bool AllowWriteBatching() override { return true; }
};
class MockWriteCallback : public WriteCallback {
public:
bool should_fail_ = false;
bool allow_batching_ = false;
std::atomic<bool> was_called_{false};
MockWriteCallback() = default;
MockWriteCallback(const MockWriteCallback& other) {
should_fail_ = other.should_fail_;
allow_batching_ = other.allow_batching_;
was_called_.store(other.was_called_.load());
}
Status Callback(DB* /*db*/) override {
was_called_.store(true);
if (should_fail_) {
return Status::Busy();
} else {
return Status::OK();
}
}
bool AllowWriteBatching() override { return allow_batching_; }
};
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
class WriteCallbackPTest
: public WriteCallbackTest,
public ::testing::WithParamInterface<
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
public:
WriteCallbackPTest() {
std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_,
allow_batching_, enable_WAL_, enable_pipelined_write_) =
GetParam();
}
protected:
bool unordered_write_;
bool seq_per_batch_;
bool two_queues_;
bool allow_parallel_;
bool allow_batching_;
bool enable_WAL_;
bool enable_pipelined_write_;
};
TEST_P(WriteCallbackPTest, WriteWithCallbackTest) {
struct WriteOP {
WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
void Put(const string& key, const string& val) {
kvs_.emplace_back(key, val);
ASSERT_OK(write_batch_.Put(key, val));
}
void Clear() {
kvs_.clear();
write_batch_.Clear();
callback_.was_called_.store(false);
}
MockWriteCallback callback_;
WriteBatch write_batch_;
std::vector<std::pair<string, string>> kvs_;
};
// In each scenario we'll launch multiple threads to write.
// The size of each array equals to number of threads, and
// each boolean in it denote whether callback of corresponding
// thread should succeed or fail.
std::vector<std::vector<WriteOP>> write_scenarios = {
{true},
{false},
{false, false},
{true, true},
{true, false},
{false, true},
{false, false, false},
{true, true, true},
{false, true, false},
{true, false, true},
{true, false, false, false, false},
{false, false, false, false, true},
{false, false, true, false, true},
};
for (auto& write_group : write_scenarios) {
Options options;
options.create_if_missing = true;
options.unordered_write = unordered_write_;
options.allow_concurrent_memtable_write = allow_parallel_;
options.enable_pipelined_write = enable_pipelined_write_;
options.two_write_queues = two_queues_;
// Skip unsupported combinations
if (options.enable_pipelined_write && seq_per_batch_) {
continue;
}
if (options.enable_pipelined_write && options.two_write_queues) {
continue;
}
if (options.unordered_write && !options.allow_concurrent_memtable_write) {
continue;
}
if (options.unordered_write && options.enable_pipelined_write) {
continue;
}
ReadOptions read_options;
DB* db;
DBImpl* db_impl;
ASSERT_OK(DestroyDB(dbname, options));
DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
std::vector<ColumnFamilyHandle*> handles;
auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles,
&db, seq_per_batch_, true /* batch_per_txn */,
false /* is_retry */, nullptr /* can_retry */);
ASSERT_OK(open_s);
assert(handles.size() == 1);
delete handles[0];
db_impl = dynamic_cast<DBImpl*>(db);
ASSERT_TRUE(db_impl);
// Writers that have called JoinBatchGroup.
std::atomic<uint64_t> threads_joining(0);
// Writers that have linked to the queue
std::atomic<uint64_t> threads_linked(0);
// Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
std::atomic<uint64_t> threads_verified(0);
std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Start", [&](void*) {
uint64_t cur_threads_joining = threads_joining.fetch_add(1);
// Wait for the last joined writer to link to the queue.
// In this way the writers link to the queue one by one.
// This allows us to confidently detect the first writer
// who increases threads_linked as the leader.
while (threads_linked.load() < cur_threads_joining) {
}
});
// Verification once writers call JoinBatchGroup.
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
uint64_t cur_threads_linked = threads_linked.fetch_add(1);
bool is_leader = false;
bool is_last = false;
// who am i
is_leader = (cur_threads_linked == 0);
is_last = (cur_threads_linked == write_group.size() - 1);
// check my state
auto* writer = static_cast<WriteThread::Writer*>(arg);
if (is_leader) {
ASSERT_TRUE(writer->state ==
WriteThread::State::STATE_GROUP_LEADER);
} else {
ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT);
}
// (meta test) the first WriteOP should indeed be the first
// and the last should be the last (all others can be out of
// order)
if (is_leader) {
ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
!write_group.front().callback_.should_fail_);
} else if (is_last) {
ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
!write_group.back().callback_.should_fail_);
}
threads_verified.fetch_add(1);
// Wait here until all verification in this sync-point
// callback finish for all writers.
while (threads_verified.load() < write_group.size()) {
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
// check my state
auto* writer = static_cast<WriteThread::Writer*>(arg);
if (!allow_batching_) {
// no batching so everyone should be a leader
ASSERT_TRUE(writer->state ==
WriteThread::State::STATE_GROUP_LEADER);
} else if (!allow_parallel_) {
ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED ||
(enable_pipelined_write_ &&
writer->state ==
WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
}
});
std::atomic<uint32_t> thread_num(0);
std::atomic<char> dummy_key(0);
// Each write thread create a random write batch and write to DB
// with a write callback.
std::function<void()> write_with_callback_func = [&]() {
uint32_t i = thread_num.fetch_add(1);
Random rnd(i);
// leaders gotta lead
while (i > 0 && threads_verified.load() < 1) {
}
// loser has to lose
while (i == write_group.size() - 1 &&
threads_verified.load() < write_group.size() - 1) {
}
auto& write_op = write_group.at(i);
write_op.Clear();
write_op.callback_.allow_batching_ = allow_batching_;
// insert some keys
for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
// grab unique key
char my_key = dummy_key.fetch_add(1);
string skey(5, my_key);
string sval(10, my_key);
write_op.Put(skey, sval);
if (!write_op.callback_.should_fail_ && !seq_per_batch_) {
seq.fetch_add(1);
}
}
if (!write_op.callback_.should_fail_ && seq_per_batch_) {
seq.fetch_add(1);
}
WriteOptions woptions;
woptions.disableWAL = !enable_WAL_;
woptions.sync = enable_WAL_;
if (woptions.protection_bytes_per_key > 0) {
ASSERT_OK(WriteBatchInternal::UpdateProtectionInfo(
&write_op.write_batch_, woptions.protection_bytes_per_key));
}
Status s;
if (seq_per_batch_) {
class PublishSeqCallback : public PreReleaseCallback {
public:
PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {}
Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t,
size_t /*index*/, size_t /*total*/) override {
db_impl_->SetLastPublishedSequence(last_seq);
return Status::OK();
}
DBImpl* db_impl_;
} publish_seq_callback(db_impl);
// seq_per_batch_ requires a natural batch separator or Noop
ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_));
const size_t ONE_BATCH = 1;
s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
&write_op.callback_, nullptr, 0, false, nullptr,
ONE_BATCH,
two_queues_ ? &publish_seq_callback : nullptr);
} else {
s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_,
&write_op.callback_);
}
if (write_op.callback_.should_fail_) {
ASSERT_TRUE(s.IsBusy());
} else {
ASSERT_OK(s);
}
};
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
// do all the writes
std::vector<port::Thread> threads;
for (uint32_t i = 0; i < write_group.size(); i++) {
threads.emplace_back(write_with_callback_func);
}
for (auto& t : threads) {
t.join();
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
// check for keys
string value;
for (auto& w : write_group) {
ASSERT_TRUE(w.callback_.was_called_.load());
for (auto& kvp : w.kvs_) {
if (w.callback_.should_fail_) {
ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound());
} else {
ASSERT_OK(db->Get(read_options, kvp.first, &value));
ASSERT_EQ(value, kvp.second);
}
}
}
ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
delete db;
ASSERT_OK(DestroyDB(dbname, options));
}
}
INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest,
::testing::Combine(::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(),
::testing::Bool()));
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
TEST_F(WriteCallbackTest, WriteCallBackTest) {
Options options;
WriteOptions write_options;
ReadOptions read_options;
string value;
DB* db;
DBImpl* db_impl;
ASSERT_OK(DestroyDB(dbname, options));
options.create_if_missing = true;
Status s = DB::Open(options, dbname, &db);
ASSERT_OK(s);
db_impl = dynamic_cast<DBImpl*>(db);
ASSERT_TRUE(db_impl);
WriteBatch wb;
ASSERT_OK(wb.Put("a", "value.a"));
ASSERT_OK(wb.Delete("x"));
// Test a simple Write
s = db->Write(write_options, &wb);
ASSERT_OK(s);
s = db->Get(read_options, "a", &value);
ASSERT_OK(s);
ASSERT_EQ("value.a", value);
// Test WriteWithCallback
WriteCallbackTestWriteCallback1 callback1;
WriteBatch wb2;
ASSERT_OK(wb2.Put("a", "value.a2"));
s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
ASSERT_OK(s);
ASSERT_TRUE(callback1.was_called);
s = db->Get(read_options, "a", &value);
ASSERT_OK(s);
ASSERT_EQ("value.a2", value);
// Test WriteWithCallback for a callback that fails
WriteCallbackTestWriteCallback2 callback2;
WriteBatch wb3;
ASSERT_OK(wb3.Put("a", "value.a3"));
s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
ASSERT_NOK(s);
s = db->Get(read_options, "a", &value);
ASSERT_OK(s);
ASSERT_EQ("value.a2", value);
delete db;
ASSERT_OK(DestroyDB(dbname, options));
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}