Fix duplicate WAL entries caused by write after error recovery (#12873)

Summary:
**Context/Summary:**
We recently discovered a case where a write of a key, issued right after error recovery of a previous failed write of the same key finishes, produces two identical WAL entries, violating our assertion. This happens because we do not advance the seqno on a failed write, and we reuse the WAL containing the failed write for the new write if the memtable is empty at that time.

This PR reuses the empty-memtable flush path in error recovery flush to switch the WAL and update the minimum WAL number to keep, and it also updates the related INFO log messages for clarity.

```
2024/07/17-15:01:32.271789 327757 (Original Log Time 2024/07/17-15:01:25.942234) [/flush_job.cc:1017] [default] [JOB 2] Level-0 flush table #9: 0 bytes OK It's an empty SST file from a successful flush so won't be kept in the DB
2024/07/17-15:01:32.271798 327757 (Original Log Time 2024/07/17-15:01:32.269954) [/memtable_list.cc:560] [default] Level-0 commit flush result of table #9 started
2024/07/17-15:01:32.271802 327757 (Original Log Time 2024/07/17-15:01:32.271217) [/memtable_list.cc:760] [default] Level-0 commit flush result of table #9: memtable #1 done
```
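
For illustration, here is a condensed, hypothetical sketch of the problematic sequence against the public RocksDB API; the new unit test in this PR is the authoritative repro. The retryable IO error on the first `Put()` is assumed to be injected (the test forces it with `FaultInjectionTestFS`), and the DB path is arbitrary:

```
// Sketch only: without fault injection the first Put() simply succeeds and
// the duplicate-WAL-entry scenario is not exercised.
#include <chrono>
#include <thread>

#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.avoid_flush_during_shutdown = true;
  if (!rocksdb::DB::Open(options, "/tmp/recovery_flush_demo", &db).ok()) {
    return 1;
  }

  rocksdb::WriteOptions wo;
  wo.sync = true;
  // Assume this write fails with a retryable IO error: the seqno is not
  // advanced and the failed entry remains in the current WAL.
  rocksdb::Status s = db->Put(wo, "k", "old_v");

  // Retry until automatic error recovery finishes and a write succeeds.
  // Before this fix, the successful write could be appended to the same WAL
  // with the same seqno as the failed one, creating duplicate WAL entries.
  while (!s.ok()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    s = db->Put(rocksdb::WriteOptions(), "k", "new_v");
  }

  delete db;
  // Reopening replays the WAL; with duplicate entries this used to trip the
  // `seq_per_batch_` assertion during recovery.
  rocksdb::Status reopen =
      rocksdb::DB::Open(options, "/tmp/recovery_flush_demo", &db);
  delete db;
  return reopen.ok() ? 0 : 1;
}
```

With the fix, the recovery flush switches to a fresh WAL even though the memtable is empty, so the reopened DB replays only the successful write.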

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12873

Test Plan:
New UT that failed before this PR with the following assertion failure (i.e., due to duplicate WAL entries) and passes after:
```
db_wal_test: db/write_batch.cc:2254: rocksdb::Status rocksdb::{anonymous}::MemTableInserter::PutCFImpl(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::ValueType, RebuildTxnOp, const ProtectionInfoKVOS64*) [with RebuildTxnOp = rocksdb::{anonymous}::MemTableInserter::PutCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)::<lambda(rocksdb::WriteBatch*, uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)>; uint32_t = unsigned int; rocksdb::ProtectionInfoKVOS64 = rocksdb::ProtectionInfoKVOS<long unsigned int>]: Assertion `seq_per_batch_' failed.
```

Reviewed By: anand1976

Differential Revision: D59884468

Pulled By: hx235

fbshipit-source-id: 5d854b719092552c69727a979f269fb7f6c39756
Author: Hui Xiao
Date: 2024-07-22 12:40:25 -07:00
Committed by: Facebook GitHub Bot
Parent: c064ac3bc5
Commit: 349b1ec08f
7 changed files with 73 additions and 13 deletions

View File

@@ -1888,6 +1888,10 @@ class DBImpl : public DB {
     Env::Priority compaction_pri_;
   };
 
+  static bool IsRecoveryFlush(FlushReason flush_reason) {
+    return flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
+           flush_reason == FlushReason::kErrorRecovery;
+  }
   // Initialize the built-in column family for persistent stats. Depending on
   // whether on-disk persistent stats have been enabled before, it may either
   // create a new column family and column family handle or just a column family
@@ -2032,7 +2036,8 @@
   // REQUIRES: mutex held
   void SelectColumnFamiliesForAtomicFlush(
       autovector<ColumnFamilyData*>* selected_cfds,
-      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
+      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
+      FlushReason flush_reason = FlushReason::kOthers);
 
   // Force current memtable contents to be flushed.
   Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,

View File

@@ -2306,13 +2306,15 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
     }
     WaitForPendingWrites();
 
-    if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+    if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       s = SwitchMemtable(cfd, &context);
     }
     const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
     if (s.ok()) {
       if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-          !cached_recoverable_state_empty_.load()) {
+          !cached_recoverable_state_empty_.load() ||
+          IsRecoveryFlush(flush_reason)) {
         FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}};
         flush_reqs.emplace_back(std::move(req));
         memtable_ids_to_wait.emplace_back(
@@ -2489,7 +2491,7 @@ Status DBImpl::AtomicFlushMemTables(
     }
     WaitForPendingWrites();
 
-    SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds);
+    SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds, flush_reason);
 
     // Unref the newly generated candidate cfds (when not provided) in
     // `candidate_cfds`
@@ -2500,7 +2502,8 @@
     }
 
     for (auto cfd : cfds) {
-      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load() &&
+          !IsRecoveryFlush(flush_reason)) {
        continue;
       }
       cfd->Ref();

View File

@@ -1650,7 +1650,8 @@ Status DBImpl::WriteRecoverableState() {
 
 void DBImpl::SelectColumnFamiliesForAtomicFlush(
     autovector<ColumnFamilyData*>* selected_cfds,
-    const autovector<ColumnFamilyData*>& provided_candidate_cfds) {
+    const autovector<ColumnFamilyData*>& provided_candidate_cfds,
+    FlushReason flush_reason) {
   mutex_.AssertHeld();
   assert(selected_cfds);
 
@@ -1673,7 +1674,8 @@ void DBImpl::SelectColumnFamiliesForAtomicFlush(
       continue;
     }
     if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-        !cached_recoverable_state_empty_.load()) {
+        !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       selected_cfds->push_back(cfd);
     }
   }

View File

@@ -2757,6 +2757,49 @@ TEST_F(DBWALTest, EmptyWalReopenTest) {
   }
 }
 
+TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
+  Options options = CurrentOptions();
+  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  options.env = fault_fs_env.get();
+  options.avoid_flush_during_shutdown = true;
+  DestroyAndReopen(options);
+
+  fault_fs->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataWrite, 7 /* seed*/, 1 /* one_in */,
+      true /* retryable */, false /* has_data_loss*/);
+  fault_fs->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  WriteOptions wo;
+  wo.sync = true;
+  Status s = Put("k", "old_v", wo);
+  ASSERT_TRUE(s.IsIOError());
+  // To verify the key is not in memtable nor SST
+  ASSERT_TRUE(static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+                  ->cfd()
+                  ->mem()
+                  ->IsEmpty());
+  ASSERT_EQ("NOT_FOUND", Get("k"));
+
+  fault_fs->DisableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  // Keep trying write until recovery of the previous IO error finishes
+  while (!s.ok()) {
+    options.env->SleepForMicroseconds(1000);
+    s = Put("k", "new_v");
+  }
+
+  // If recovery flush didn't switch WAL, we will end up having two duplicate
+  // WAL entries with same seqno and same key that violate assertion during WAL
+  // recovery and fail DB reopen
+  options.avoid_flush_during_recovery = false;
+  Reopen(options);
+  ASSERT_EQ("new_v", Get("k"));
+
+  Destroy(options);
+}
 } // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

View File

@@ -235,7 +235,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
   AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
   if (mems_.empty()) {
-    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] No memtable to flush",
                      cfd_->GetName().c_str());
     return Status::OK();
   }
 
@@ -1017,10 +1017,15 @@ Status FlushJob::WriteLevel0Table() {
     ROCKS_LOG_BUFFER(log_buffer_,
                      "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
                      " bytes %s"
-                     "%s",
+                     " %s"
+                     " %s",
                      cfd_->GetName().c_str(), job_context_->job_id,
                      meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
                      s.ToString().c_str(),
+                     s.ok() && meta_.fd.GetFileSize() == 0
+                         ? "It's an empty SST file from a successful flush so "
+                           "won't be kept in the DB"
+                         : "",
                      meta_.marked_for_compaction ? " (needs compaction)" : "");
 
   if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {

View File

@@ -558,11 +558,12 @@ Status MemTableList::TryInstallMemtableFlushResults(
       batch_file_number = m->file_number_;
       if (m->edit_.GetBlobFileAdditions().empty()) {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64 " started",
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
+                         " started",
                          cfd->GetName().c_str(), m->file_number_);
       } else {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
                          " (+%zu blob files) started",
                          cfd->GetName().c_str(), m->file_number_,
                          m->edit_.GetBlobFileAdditions().size());
@@ -757,12 +758,12 @@ void MemTableList::RemoveMemTablesOrRestoreFlags(
     MemTable* m = current_->memlist_.back();
     if (m->edit_.GetBlobFileAdditions().empty()) {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_, mem_id);
     } else {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        " (+%zu blob files)"
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_,

View File

@@ -0,0 +1 @@
+Fixed a bug where successful write right after error recovery for last failed write finishes causes duplicate WAL entries