From e36b0a2da40c28461b602c4a340d9dcccafb96c6 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 29 Apr 2024 12:25:00 -0700 Subject: [PATCH] Fix corruption bug when recycle_log_file_num changed from 0 (#12591) Summary: When `recycle_log_file_num` is changed from 0 to non-zero and the DB is reopened, any log files from the previous session that are still alive get reused. However, the WAL records in those files are not in the recyclable format. If one of those files is reused and is empty, a subsequent re-open, in `RecoverLogFiles`, can replay those records and insert stale data into the memtable. Another manifestation of this is an assertion failure `first_seqno_ == 0 || s >= first_seqno_` in `rocksdb::MemTable::Add`. We could fix this by either 1) Writing a special record when reusing a log file, or 2) Implement more rigorous checking in `RecoverLogFiles` to ensure we don't replay stale records, or 3) Not reuse files created by a previous DB session. We choose option 3 as its the simplest, and flipping `recycle_log_file_num` is expected to be a rare event. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12591 Test Plan: 1. Add a unit test to verify the bug and fix Reviewed By: jowlyzhang Differential Revision: D56655812 Pulled By: anand1976 fbshipit-source-id: aa3a26b4a5e892d39a54b5a0658233cbebebac87 --- db/db_impl/db_impl.h | 7 +++++ db/db_impl/db_impl_files.cc | 5 +++- db/db_impl/db_impl_open.cc | 5 ++++ db/db_write_test.cc | 26 +++++++++++++++++++ .../bug_fixes/recycle_logs_toggle_bug.md | 1 + 5 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 unreleased_history/bug_fixes/recycle_logs_toggle_bug.md diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 504d7ec608..3e93c3e2d4 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -821,6 +821,8 @@ class DBImpl : public DB { uint64_t MinLogNumberToKeep(); + uint64_t MinLogNumberToRecycle(); + // Returns the lower bound file number for SSTs that won't be deleted, even if // they're obsolete. This lower bound is used internally to prevent newly // created flush/compaction output files from being deleted before they're @@ -2482,6 +2484,11 @@ class DBImpl : public DB { uint64_t logfile_number_; // Log files that we can recycle. Must be protected by db mutex_. std::deque log_recycle_files_; + // The minimum log file number taht can be recycled, if log recycling is + // enabled. This is used to ensure that log files created by previous + // instances of the database are not recycled, as we cannot be sure they + // were created in the recyclable format. + uint64_t min_log_number_to_recycle_; // Protected by log_write_mutex_. bool log_dir_synced_; // Without two_write_queues, read and writes to log_empty_ are protected by diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index e1ba41e4ac..e11b012622 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -28,6 +28,8 @@ uint64_t DBImpl::MinLogNumberToKeep() { return versions_->min_log_number_to_keep(); } +uint64_t DBImpl::MinLogNumberToRecycle() { return min_log_number_to_recycle_; } + uint64_t DBImpl::MinObsoleteSstNumberToKeep() { mutex_.AssertHeld(); if (!pending_outputs_.empty()) { @@ -298,7 +300,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, while (alive_log_files_.begin()->number < min_log_number) { auto& earliest = *alive_log_files_.begin(); if (immutable_db_options_.recycle_log_file_num > - log_recycle_files_.size()) { + log_recycle_files_.size() && + earliest.number >= MinLogNumberToRecycle()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "adding log %" PRIu64 " to recycle list\n", earliest.number); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 5765549b5f..cff4a8bd5d 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -2068,6 +2068,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->GetWalPreallocateBlockSize(max_write_buffer_size); s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/, preallocate_block_size, &new_log); + if (s.ok()) { + // Prevent log files created by previous instance from being recycled. + // They might be in alive_log_file_, and might get recycled otherwise. + impl->min_log_number_to_recycle_ = new_log_number; + } if (s.ok()) { InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; diff --git a/db/db_write_test.cc b/db/db_write_test.cc index abb34da1f0..e074dd6b42 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -913,6 +913,32 @@ TEST_P(DBWriteTest, RecycleLogTestCFAheadOfWAL) { Status::Corruption()); } +TEST_P(DBWriteTest, RecycleLogToggleTest) { + Options options = GetOptions(); + options.recycle_log_file_num = 0; + options.avoid_flush_during_recovery = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + Destroy(options); + Reopen(options); + // After opening, a new log gets created, say 1.log + ASSERT_OK(Put(Key(1), "val1")); + + options.recycle_log_file_num = 1; + Reopen(options); + // 1.log is added to alive_log_files_ + ASSERT_OK(Put(Key(2), "val1")); + ASSERT_OK(Flush()); + // 1.log should be deleted and not recycled, since it + // was created by the previous Reopen + ASSERT_OK(Put(Key(1), "val2")); + ASSERT_OK(Flush()); + + options.recycle_log_file_num = 1; + Reopen(options); + ASSERT_EQ(Get(Key(1)), "val2"); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/unreleased_history/bug_fixes/recycle_logs_toggle_bug.md b/unreleased_history/bug_fixes/recycle_logs_toggle_bug.md new file mode 100644 index 0000000000..2cfd8c4666 --- /dev/null +++ b/unreleased_history/bug_fixes/recycle_logs_toggle_bug.md @@ -0,0 +1 @@ +Fixed a bug when the recycle_log_file_num in DBOptions is changed from 0 to non-zero when a DB is reopened. On a subsequent reopen, if a log file created when recycle_log_file_num==0 was reused previously, is alive and is empty, we could end up inserting stale WAL records into the memtable.