From 7177dc46a13332c96332d524b20f14b7e1372d07 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 11 Jun 2019 13:04:59 -0700 Subject: [PATCH] Handle missing WAL in secondary mode (#5323) Summary: In secondary mode, it is possible that the secondary lists the primary's WAL directory, finds a WAL and tries to open it. The primary may delete the WAL after the secondary lists the directory but before the secondary opens it. Then the secondary will fail to open the WAL file with a PathNotFound status. In this case, we can return OK without replaying the WAL and optionally replay more of the MANIFEST. Test Plan (on my dev machine): Without this PR, the following will fail several times out of 100 runs. ``` ~/gtest-parallel/gtest-parallel -r 100 -w 16 ./db_secondary_test --gtest_filter=DBSecondaryTest.SwitchToNewManifestDuringOpen ``` With this PR, the above should always succeed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5323 Differential Revision: D15763878 Pulled By: riversand963 fbshipit-source-id: c7164fa7cb8d9001abc258b6a2dc93613e4f38ff --- db/db_impl/db_impl_secondary.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 2737df0ae8..5cd0beb1f0 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -60,6 +60,12 @@ Status DBImplSecondary::Recover( s = FindAndRecoverLogFiles(&cfds_changed, &job_context); } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } // TODO: update options_file_number_ needed? 
job_context.Clean(); @@ -475,6 +481,12 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { if (s.ok()) { s = FindAndRecoverLogFiles(&cfds_changed, &job_context); } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } if (s.ok()) { for (auto cfd : cfds_changed) { cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),