mirror of https://github.com/facebook/rocksdb.git
Fix corruption bug when recycle_log_file_num changed from 0 (#12591)
Summary: When `recycle_log_file_num` is changed from 0 to non-zero and the DB is reopened, any log files from the previous session that are still alive get reused. However, the WAL records in those files are not in the recyclable format. If one of those files is reused and is empty, a subsequent re-open, in `RecoverLogFiles`, can replay those records and insert stale data into the memtable. Another manifestation of this is an assertion failure `first_seqno_ == 0 || s >= first_seqno_` in `rocksdb::MemTable::Add`. We could fix this by either 1) Writing a special record when reusing a log file, or 2) Implement more rigorous checking in `RecoverLogFiles` to ensure we don't replay stale records, or 3) Not reuse files created by a previous DB session. We choose option 3 as its the simplest, and flipping `recycle_log_file_num` is expected to be a rare event. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12591 Test Plan: 1. Add a unit test to verify the bug and fix Reviewed By: jowlyzhang Differential Revision: D56655812 Pulled By: anand1976 fbshipit-source-id: aa3a26b4a5e892d39a54b5a0658233cbebebac87
This commit is contained in:
parent
d80e1d99bc
commit
e36b0a2da4
|
@ -821,6 +821,8 @@ class DBImpl : public DB {
|
|||
|
||||
uint64_t MinLogNumberToKeep();
|
||||
|
||||
uint64_t MinLogNumberToRecycle();
|
||||
|
||||
// Returns the lower bound file number for SSTs that won't be deleted, even if
|
||||
// they're obsolete. This lower bound is used internally to prevent newly
|
||||
// created flush/compaction output files from being deleted before they're
|
||||
|
@ -2482,6 +2484,11 @@ class DBImpl : public DB {
|
|||
uint64_t logfile_number_;
|
||||
// Log files that we can recycle. Must be protected by db mutex_.
|
||||
std::deque<uint64_t> log_recycle_files_;
|
||||
// The minimum log file number taht can be recycled, if log recycling is
|
||||
// enabled. This is used to ensure that log files created by previous
|
||||
// instances of the database are not recycled, as we cannot be sure they
|
||||
// were created in the recyclable format.
|
||||
uint64_t min_log_number_to_recycle_;
|
||||
// Protected by log_write_mutex_.
|
||||
bool log_dir_synced_;
|
||||
// Without two_write_queues, read and writes to log_empty_ are protected by
|
||||
|
|
|
@ -28,6 +28,8 @@ uint64_t DBImpl::MinLogNumberToKeep() {
|
|||
return versions_->min_log_number_to_keep();
|
||||
}
|
||||
|
||||
uint64_t DBImpl::MinLogNumberToRecycle() { return min_log_number_to_recycle_; }
|
||||
|
||||
uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
|
||||
mutex_.AssertHeld();
|
||||
if (!pending_outputs_.empty()) {
|
||||
|
@ -298,7 +300,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|||
while (alive_log_files_.begin()->number < min_log_number) {
|
||||
auto& earliest = *alive_log_files_.begin();
|
||||
if (immutable_db_options_.recycle_log_file_num >
|
||||
log_recycle_files_.size()) {
|
||||
log_recycle_files_.size() &&
|
||||
earliest.number >= MinLogNumberToRecycle()) {
|
||||
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
||||
"adding log %" PRIu64 " to recycle list\n",
|
||||
earliest.number);
|
||||
|
|
|
@ -2068,6 +2068,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||
impl->GetWalPreallocateBlockSize(max_write_buffer_size);
|
||||
s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/,
|
||||
preallocate_block_size, &new_log);
|
||||
if (s.ok()) {
|
||||
// Prevent log files created by previous instance from being recycled.
|
||||
// They might be in alive_log_file_, and might get recycled otherwise.
|
||||
impl->min_log_number_to_recycle_ = new_log_number;
|
||||
}
|
||||
if (s.ok()) {
|
||||
InstrumentedMutexLock wl(&impl->log_write_mutex_);
|
||||
impl->logfile_number_ = new_log_number;
|
||||
|
|
|
@ -913,6 +913,32 @@ TEST_P(DBWriteTest, RecycleLogTestCFAheadOfWAL) {
|
|||
Status::Corruption());
|
||||
}
|
||||
|
||||
TEST_P(DBWriteTest, RecycleLogToggleTest) {
|
||||
Options options = GetOptions();
|
||||
options.recycle_log_file_num = 0;
|
||||
options.avoid_flush_during_recovery = true;
|
||||
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
||||
|
||||
Destroy(options);
|
||||
Reopen(options);
|
||||
// After opening, a new log gets created, say 1.log
|
||||
ASSERT_OK(Put(Key(1), "val1"));
|
||||
|
||||
options.recycle_log_file_num = 1;
|
||||
Reopen(options);
|
||||
// 1.log is added to alive_log_files_
|
||||
ASSERT_OK(Put(Key(2), "val1"));
|
||||
ASSERT_OK(Flush());
|
||||
// 1.log should be deleted and not recycled, since it
|
||||
// was created by the previous Reopen
|
||||
ASSERT_OK(Put(Key(1), "val2"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
options.recycle_log_file_num = 1;
|
||||
Reopen(options);
|
||||
ASSERT_EQ(Get(Key(1)), "val2");
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
|
||||
testing::Values(DBTestBase::kDefault,
|
||||
DBTestBase::kConcurrentWALWrites,
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Fixed a bug when the recycle_log_file_num in DBOptions is changed from 0 to non-zero when a DB is reopened. On a subsequent reopen, if a log file created when recycle_log_file_num==0 was reused previously, is alive and is empty, we could end up inserting stale WAL records into the memtable.
|
Loading…
Reference in New Issue