Fix manual flush hanging on waiting for no stall for UDT in memtable … (#12771)

Summary:
This PR fixes a possible manual flush hanging scenario caused by its expectation — that others would clear out excessive memtables — not being met. The root cause is that the FlushRequest rescheduling logic uses a stricter criterion for what "a write stall is about to happen" means than `WaitUntilFlushWouldNotStallWrites` does. Currently, the former considers a write stall about to happen when the last memtable is half full, and it will reschedule the queued FlushRequest instead of actually proceeding with the flush; the latter, however, considers that if we have already started to use the last memtable, we should wait until some other background flush jobs clear out some memtables before proceeding with this manual flush.

If we make them use the same criterion, we can guarantee that at any time when `WaitUntilFlushWouldNotStallWrites` is waiting, it is not because the rescheduling logic is holding it back.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12771

Test Plan: Added unit test

Reviewed By: ajkr

Differential Revision: D58603746

Pulled By: jowlyzhang

fbshipit-source-id: 9fa1c87c0175d47a40f584dfb1b497baa576755b
This commit is contained in:
Yu Zhang 2024-06-14 13:37:37 -07:00 committed by Facebook GitHub Bot
parent 13c758f986
commit f5e44f3490
4 changed files with 33 additions and 22 deletions

View File

@ -572,6 +572,10 @@ class ColumnFamilyData {
// of its files (if missing)
void RecoverEpochNumbers();
// Returns the memtable count used for write-stall checks: the number of
// immutable memtables not yet flushed, plus one for the active memtable
// when it is non-empty (an empty active memtable will not produce an
// extra immutable memtable on switch, so it is excluded).
int GetUnflushedMemTableCountForWriteStallCheck() const {
  const int active_count = mem_->IsEmpty() ? 0 : 1;
  return active_count + imm_.NumNotFlushed();
}
private:
friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name,

View File

@ -3916,9 +3916,6 @@ TEST_F(ManualFlushSkipRetainUDTTest, BulkLoading) {
}
TEST_F(ManualFlushSkipRetainUDTTest, AutomaticFlushQueued) {
// TODO(yuzhangyu): this is where the write for no write stall happens.
// We need to sovle that and remove this.
column_family_options_.max_write_buffer_number = 3;
Open();
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));
@ -3926,10 +3923,13 @@ TEST_F(ManualFlushSkipRetainUDTTest, AutomaticFlushQueued) {
ASSERT_OK(dbfull()->TEST_SwitchWAL());
CheckEffectiveCutoffTime(0);
ASSERT_OK(Put(0, "foo", EncodeAsUint64(2), "v2"));
// Default `max_write_buffer_number=2` used, writing another memtable can get
// automatic flush to proceed because of memory pressure. Not doing that so
// we can test automatic flush gets to proceed because of an ongoing manual
// flush attempt.
ASSERT_OK(Flush(0));
CheckEffectiveCutoffTime(3);
CheckAutomaticFlushRetainUDT(4);
CheckEffectiveCutoffTime(2);
CheckAutomaticFlushRetainUDT(3);
Close();
}

View File

@ -101,15 +101,20 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
// alleviated if we continue with the flush instead of postponing it.
const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
// Taking the status of the active Memtable into consideration so that we are
// not just checking if DB is currently already in write stall mode.
int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >=
cfd->mem()->write_buffer_size() / 2
? 1
: 0;
// Use the same criteria as WaitUntilFlushWouldNotStallWrites does w.r.t
// defining what a write stall is about to happen means. If this uses a
// stricter criteria, for example, a write stall is about to happen if the
// last memtable is 10% full, there is a possibility that manual flush could
// be waiting in `WaitUntilFlushWouldNotStallWrites` with the incorrect
// expectation that others will clear up the excessive memtables and
// eventually let it proceed. The others in this case won't start clearing
// until the last memtable is 10% full. To avoid that scenario, the criteria
// this uses should be the same or less strict than
// `WaitUntilFlushWouldNotStallWrites` does.
WriteStallCondition write_stall =
ColumnFamilyData::GetWriteStallConditionAndCause(
cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0,
cfd->GetUnflushedMemTableCountForWriteStallCheck(),
/*num_l0_files=*/0,
/*num_compaction_needed_bytes=*/0, mutable_cf_options,
*cfd->ioptions())
.first;
@ -2677,13 +2682,13 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
// mode due to pending compaction bytes, but that's less common
// No extra immutable Memtable will be created if the current Memtable is
// empty.
int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1;
write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
cfd->imm()->NumNotFlushed() + mem_to_flush,
vstorage->l0_delay_trigger_count() + 1,
vstorage->estimated_compaction_needed_bytes(),
mutable_cf_options, *cfd->ioptions())
.first;
write_stall_condition =
ColumnFamilyData::GetWriteStallConditionAndCause(
cfd->GetUnflushedMemTableCountForWriteStallCheck(),
vstorage->l0_delay_trigger_count() + 1,
vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
*cfd->ioptions())
.first;
} while (write_stall_condition != WriteStallCondition::kNormal);
}
return Status::OK();

View File

@ -1037,8 +1037,10 @@ struct AdvancedColumnFamilyOptions {
// When setting this flag to `false`, users should also call
// `DB::IncreaseFullHistoryTsLow` to set a cutoff timestamp for flush. RocksDB
// refrains from flushing a memtable with data still above
// the cutoff timestamp with best effort. If this cutoff timestamp is not set,
// flushing continues normally.
// the cutoff timestamp with best effort. One limitation of this best effort
// is that when `max_write_buffer_number` is equal to or smaller than 2,
// RocksDB will not attempt to retain user-defined timestamps, all flush jobs
// continue normally.
//
// Users can do user-defined
// multi-versioned read above the cutoff timestamp. When users try to read