Fix duplicate WAL entries caused by write after error recovery (#12873)

Summary:
**Context/Summary:**
We recently discovered a case where a write of a key, issued right after error recovery of a previous failed write of the same key finishes, produces two identical WAL entries, violating our assertion. This happens because we do not advance the seqno on a failed write, and we reuse the WAL containing the failed write for the new write if the memtable is empty at that time.

This PR reuses the empty-memtable flush path in error recovery flush to switch the WAL and update the minimum WAL number to keep, and it also updates the related INFO log messages for clarity.

```
2024/07/17-15:01:32.271789 327757 (Original Log Time 2024/07/17-15:01:25.942234) [/flush_job.cc:1017] [default] [JOB 2] Level-0 flush table #9: 0 bytes OK It's an empty SST file from a successful flush so won't be kept in the DB
2024/07/17-15:01:32.271798 327757 (Original Log Time 2024/07/17-15:01:32.269954) [/memtable_list.cc:560] [default] Level-0 commit flush result of table #9 started
2024/07/17-15:01:32.271802 327757 (Original Log Time 2024/07/17-15:01:32.271217) [/memtable_list.cc:760] [default] Level-0 commit flush result of table #9: memtable #1 done
```
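
For illustration, here is a condensed, hypothetical sketch of the problematic sequence against the public RocksDB API; the new unit test in this PR is the authoritative repro. The retryable IO error on the first `Put()` is assumed to be injected (the test forces it with `FaultInjectionTestFS`), and the DB path is arbitrary:

```
// Sketch only: without fault injection the first Put() simply succeeds and
// the duplicate-WAL-entry scenario is not exercised.
#include <chrono>
#include <thread>

#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.avoid_flush_during_shutdown = true;
  if (!rocksdb::DB::Open(options, "/tmp/recovery_flush_demo", &db).ok()) {
    return 1;
  }

  rocksdb::WriteOptions wo;
  wo.sync = true;
  // Assume this write fails with a retryable IO error: the seqno is not
  // advanced and the failed entry remains in the current WAL.
  rocksdb::Status s = db->Put(wo, "k", "old_v");

  // Retry until automatic error recovery finishes and a write succeeds.
  // Before this fix, the successful write could be appended to the same WAL
  // with the same seqno as the failed one, creating duplicate WAL entries.
  while (!s.ok()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    s = db->Put(rocksdb::WriteOptions(), "k", "new_v");
  }

  delete db;
  // Reopening replays the WAL; with duplicate entries this used to trip the
  // `seq_per_batch_` assertion during recovery.
  rocksdb::Status reopen =
      rocksdb::DB::Open(options, "/tmp/recovery_flush_demo", &db);
  delete db;
  return reopen.ok() ? 0 : 1;
}
```

With the fix, the recovery flush switches to a fresh WAL even though the memtable is empty, so the reopened DB replays only the successful write.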

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12873

Test Plan:
New UT that failed before this PR with the following assertion failure (i.e., due to duplicate WAL entries) and passes after:
```
db_wal_test: db/write_batch.cc:2254: rocksdb::Status rocksdb::{anonymous}::MemTableInserter::PutCFImpl(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::ValueType, RebuildTxnOp, const ProtectionInfoKVOS64*) [with RebuildTxnOp = rocksdb::{anonymous}::MemTableInserter::PutCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)::<lambda(rocksdb::WriteBatch*, uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)>; uint32_t = unsigned int; rocksdb::ProtectionInfoKVOS64 = rocksdb::ProtectionInfoKVOS<long unsigned int>]: Assertion `seq_per_batch_' failed.
```

Reviewed By: anand1976

Differential Revision: D59884468

Pulled By: hx235

fbshipit-source-id: 5d854b719092552c69727a979f269fb7f6c39756
Author: Hui Xiao
Date: 2024-07-22 12:40:25 -07:00
Committed by: Facebook GitHub Bot
Parent: c064ac3bc5
Commit: 349b1ec08f
7 changed files with 73 additions and 13 deletions

View File

@@ -1888,6 +1888,10 @@ class DBImpl : public DB {
     Env::Priority compaction_pri_;
   };
 
+  static bool IsRecoveryFlush(FlushReason flush_reason) {
+    return flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
+           flush_reason == FlushReason::kErrorRecovery;
+  }
   // Initialize the built-in column family for persistent stats. Depending on
   // whether on-disk persistent stats have been enabled before, it may either
   // create a new column family and column family handle or just a column family
@@ -2032,7 +2036,8 @@
   // REQUIRES: mutex held
   void SelectColumnFamiliesForAtomicFlush(
       autovector<ColumnFamilyData*>* selected_cfds,
-      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
+      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
+      FlushReason flush_reason = FlushReason::kOthers);
 
   // Force current memtable contents to be flushed.
   Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,

View File

@@ -2306,13 +2306,15 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
     }
     WaitForPendingWrites();
 
-    if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+    if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       s = SwitchMemtable(cfd, &context);
     }
     const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
     if (s.ok()) {
       if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-          !cached_recoverable_state_empty_.load()) {
+          !cached_recoverable_state_empty_.load() ||
+          IsRecoveryFlush(flush_reason)) {
         FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}};
         flush_reqs.emplace_back(std::move(req));
         memtable_ids_to_wait.emplace_back(
@@ -2489,7 +2491,7 @@ Status DBImpl::AtomicFlushMemTables(
     }
     WaitForPendingWrites();
 
-    SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds);
+    SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds, flush_reason);
 
     // Unref the newly generated candidate cfds (when not provided) in
     // `candidate_cfds`
@@ -2500,7 +2502,8 @@
     }
 
     for (auto cfd : cfds) {
-      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load() &&
+          !IsRecoveryFlush(flush_reason)) {
        continue;
       }
       cfd->Ref();

View File

@@ -1650,7 +1650,8 @@ Status DBImpl::WriteRecoverableState() {
 
 void DBImpl::SelectColumnFamiliesForAtomicFlush(
     autovector<ColumnFamilyData*>* selected_cfds,
-    const autovector<ColumnFamilyData*>& provided_candidate_cfds) {
+    const autovector<ColumnFamilyData*>& provided_candidate_cfds,
+    FlushReason flush_reason) {
   mutex_.AssertHeld();
   assert(selected_cfds);
 
@@ -1673,7 +1674,8 @@ void DBImpl::SelectColumnFamiliesForAtomicFlush(
       continue;
     }
     if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-        !cached_recoverable_state_empty_.load()) {
+        !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       selected_cfds->push_back(cfd);
     }
   }

View File

@@ -2757,6 +2757,49 @@ TEST_F(DBWALTest, EmptyWalReopenTest) {
   }
 }
 
+TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
+  Options options = CurrentOptions();
+  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  options.env = fault_fs_env.get();
+  options.avoid_flush_during_shutdown = true;
+  DestroyAndReopen(options);
+
+  fault_fs->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataWrite, 7 /* seed*/, 1 /* one_in */,
+      true /* retryable */, false /* has_data_loss*/);
+  fault_fs->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  WriteOptions wo;
+  wo.sync = true;
+  Status s = Put("k", "old_v", wo);
+  ASSERT_TRUE(s.IsIOError());
+  // To verify the key is not in memtable nor SST
+  ASSERT_TRUE(static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+                  ->cfd()
+                  ->mem()
+                  ->IsEmpty());
+  ASSERT_EQ("NOT_FOUND", Get("k"));
+
+  fault_fs->DisableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  // Keep trying write until recovery of the previous IO error finishes
+  while (!s.ok()) {
+    options.env->SleepForMicroseconds(1000);
+    s = Put("k", "new_v");
+  }
+
+  // If recovery flush didn't switch WAL, we will end up having two duplicate
+  // WAL entries with same seqno and same key that violate assertion during WAL
+  // recovery and fail DB reopen
+  options.avoid_flush_during_recovery = false;
+  Reopen(options);
+  ASSERT_EQ("new_v", Get("k"));
+
+  Destroy(options);
+}
 } // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

View File

@@ -235,7 +235,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
   AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
   if (mems_.empty()) {
-    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] No memtable to flush",
                      cfd_->GetName().c_str());
     return Status::OK();
   }
 
@@ -1017,10 +1017,15 @@ Status FlushJob::WriteLevel0Table() {
     ROCKS_LOG_BUFFER(log_buffer_,
                      "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
                      " bytes %s"
-                     "%s",
+                     " %s"
+                     " %s",
                      cfd_->GetName().c_str(), job_context_->job_id,
                      meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
                      s.ToString().c_str(),
+                     s.ok() && meta_.fd.GetFileSize() == 0
+                         ? "It's an empty SST file from a successful flush so "
+                           "won't be kept in the DB"
+                         : "",
                      meta_.marked_for_compaction ? " (needs compaction)" : "");
 
   if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {

View File

@@ -558,11 +558,12 @@ Status MemTableList::TryInstallMemtableFlushResults(
       batch_file_number = m->file_number_;
       if (m->edit_.GetBlobFileAdditions().empty()) {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64 " started",
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
+                         " started",
                          cfd->GetName().c_str(), m->file_number_);
       } else {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
                          " (+%zu blob files) started",
                          cfd->GetName().c_str(), m->file_number_,
                          m->edit_.GetBlobFileAdditions().size());
@@ -757,12 +758,12 @@ void MemTableList::RemoveMemTablesOrRestoreFlags(
     MemTable* m = current_->memlist_.back();
     if (m->edit_.GetBlobFileAdditions().empty()) {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_, mem_id);
     } else {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        " (+%zu blob files)"
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_,

View File

@@ -0,0 +1 @@
+Fixed a bug where successful write right after error recovery for last failed write finishes causes duplicate WAL entries