From 096fb9b67d19a9a180e7c906b4a0cdb2b2d0c1f6 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Thu, 14 Mar 2024 21:24:06 -0700 Subject: [PATCH] Fix data race in WalManager (#12439) Summary: Crash tests were failing due to data race in accessing `purge_wal_files_last_run_`. This PR changes it to atomic. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12439 Test Plan: - existing UT - not able to repro with `python3 tools/db_crashtest.py whitebox --simple --max_key=25000000 --WAL_ttl_seconds=1` and TSAN yet, will monitor internal crash tests Reviewed By: anand1976 Differential Revision: D54920817 Pulled By: cbi42 fbshipit-source-id: 80ee026b1785ad5dba11295ed35c88889df5f5a6 --- db/wal_manager.cc | 13 ++++++++----- db/wal_manager.h | 3 ++- .../bug_fixes/data_race_in_wal_manager.md | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 unreleased_history/bug_fixes/data_race_in_wal_manager.md diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 79055b5624..1f8190b93a 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -158,11 +158,14 @@ void WalManager::PurgeObsoleteWALFiles() { ? std::min(kDefaultIntervalToDeleteObsoleteWAL, std::max(uint64_t{1}, db_options_.WAL_ttl_seconds / 2)) : kDefaultIntervalToDeleteObsoleteWAL; - if (purge_wal_files_last_run_ + time_to_check > now_seconds) { - return; - } - - purge_wal_files_last_run_ = now_seconds; + uint64_t old_last_run_time = purge_wal_files_last_run_.LoadRelaxed(); + do { + if (old_last_run_time + time_to_check > now_seconds) { + // last run is recent enough, no need to purge + return; + } + } while (!purge_wal_files_last_run_.CasWeakRelaxed( + /*expected=*/old_last_run_time, /*desired=*/now_seconds)); std::string archival_dir = ArchivalDirectory(wal_dir_); std::vector files; diff --git a/db/wal_manager.h b/db/wal_manager.h index ab79bf0023..d8acba8afa 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -25,6 +25,7 @@ #include "rocksdb/status.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" +#include "util/atomic.h" namespace ROCKSDB_NAMESPACE { @@ -118,7 +119,7 @@ class WalManager { port::Mutex read_first_record_cache_mutex_; // last time when PurgeObsoleteWALFiles ran. - uint64_t purge_wal_files_last_run_; + RelaxedAtomic purge_wal_files_last_run_; bool seq_per_batch_; diff --git a/unreleased_history/bug_fixes/data_race_in_wal_manager.md b/unreleased_history/bug_fixes/data_race_in_wal_manager.md new file mode 100644 index 0000000000..e561072a67 --- /dev/null +++ b/unreleased_history/bug_fixes/data_race_in_wal_manager.md @@ -0,0 +1 @@ +* Fixed a data race in WalManager that may affect how frequent PurgeObsoleteWALFiles() runs. \ No newline at end of file