Fix race condition in db_stress checkpoint cleanup (#11389)

Summary:
The old cleanup code had a race condition:

1. Test thread: DestroyDB() marked a file as trash
2. DeleteScheduler thread: Got the file's size and decided to delete it in chunks
3. Test thread: DestroyDir() deleted that trash file
4. DeleteScheduler thread: Began deleting in chunks starting by calling ReopenWritableFile(). Unfortunately this recreates the deleted trash file
5. Test thread: DestroyDir() fails to remove the parent directory because it contains the file created in 4.
6. Test thread: Checkpoint::Create() fails due to the directory already existing

It could be repro'd with the following patch/command.

Patch:

```
 diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc
index 8a2d1615d..337d24a60 100644
 --- a/file/delete_scheduler.cc
+++ b/file/delete_scheduler.cc
@@ -317,6 +317,12 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
                                            &num_hard_links, nullptr);
       if (my_status.ok()) {
         if (num_hard_links == 1) {
+          // Give some time for DestroyDir() to delete file entries. Then, the
+          // below `ReopenWritableFile()` will recreate files, preventing the
+          // parent directory from being deleted.
+          if (rand() % 2 == 0) {
+            usleep(1000);
+          }
           std::unique_ptr<FSWritableFile> wf;
           my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf,
                                               nullptr);
 diff --git a/file/file_util.cc b/file/file_util.cc
index 43608fcdc..2cee1ad8e 100644
 --- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -263,6 +263,13 @@ Status DestroyDir(Env* env, const std::string& dir) {
     }
   }

+  // Give some time for the DeleteScheduler thread's ReopenWritableFile() to
+  // recreate deleted files
+  if (dir.find("checkpoint") != std::string::npos) {
+    fprintf(stderr, "waiting to destroy %s\n", dir.c_str());
+    usleep(10000);
+  }
+
   if (s.ok()) {
     s = env->DeleteDir(dir);
     // DeleteDir might or might not report NotFound
```

Command:

```
TEST_TMPDIR=/dev/shm python3 tools/db_crashtest.py blackbox --simple --write_buffer_size=131072 --target_file_size_base=131072 --max_bytes_for_level_base=524288 --checkpoint_one_in=100 --clear_column_family_one_in=0  --max_key=1000 --value_size_mult=33 --sst_file_manager_bytes_per_truncate=4096 --sst_file_manager_bytes_per_sec=1048576  --interval=3 --compression_type=none --sync_fault_injection=1
```

Obviously we don't want to use scheduled deletion here as we need the checkpoint directory deleted immediately. I suspect the DestroyDir() was an attempt to fixup incomplete DestroyDB()s. Now that we expect DestroyDB() to be complete I removed that code.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11389

Reviewed By: hx235

Differential Revision: D45137142

Pulled By: ajkr

fbshipit-source-id: 2af743d342c77cc414fd25fc4c9d7c9c6079ad24

This commit is contained in:

Andrew Kryczka

2023-04-20 12:48:53 -07:00

committed by

Facebook GitHub Bot

parent 43e9a60bb2

commit 6cac4c79d4

1 changed files with 2 additions and 10 deletions

									
										12

db_stress_tool/db_stress_test_base.cc

										View File
									
					@ -1835,19 +1835,11 @@ Status StressTest::TestCheckpoint(ThreadState* thread,

					  Options tmp_opts(options_);

					  Options tmp_opts(options_);

					  tmp_opts.listeners.clear();

					  tmp_opts.listeners.clear();

					  tmp_opts.env = db_stress_env;

					  tmp_opts.env = db_stress_env;

					  // Avoid delayed deletion so whole directory can be deleted

					  tmp_opts.sst_file_manager.reset();

					  DestroyDB(checkpoint_dir, tmp_opts);

					  DestroyDB(checkpoint_dir, tmp_opts);

					  if (db_stress_env->FileExists(checkpoint_dir).ok()) {

					    // If the directory might still exist, try to delete the files one by one.

					    // Likely a trash file is still there.

					    Status my_s = DestroyDir(db_stress_env, checkpoint_dir);

					    if (!my_s.ok()) {

					      fprintf(stderr, "Fail to destory directory before checkpoint: %s",

					              my_s.ToString().c_str());

					    }

					  }

					  Checkpoint* checkpoint = nullptr;

					  Checkpoint* checkpoint = nullptr;

					  Status s = Checkpoint::Create(db_, &checkpoint);

					  Status s = Checkpoint::Create(db_, &checkpoint);

					  if (s.ok()) {

					  if (s.ok()) {

Fix race condition in db_stress checkpoint cleanup (#11389)

12 db_stress_tool/db_stress_test_base.cc Unescape Escape View File

12

db_stress_tool/db_stress_test_base.cc

View File