From f5e44f34904d78d1f649452dc19ef3b36b94ea9d Mon Sep 17 00:00:00 2001
From: Yu Zhang
Date: Fri, 14 Jun 2024 13:37:37 -0700
Subject: [PATCH] =?UTF-8?q?Fix=20manual=20flush=20hanging=20on=20waiting?=
 =?UTF-8?q?=20for=20no=20stall=20for=20UDT=20in=20memtable=20=E2=80=A6=20(?=
 =?UTF-8?q?#12771)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This PR fixes a scenario where a manual flush could hang because its
expectation that others would clear out the excessive memtables was never
met. The root cause is that the FlushRequest rescheduling logic uses a
stricter criterion than `WaitUntilFlushWouldNotStallWrites` for deciding that
a write stall is about to happen. Currently, the former only considers a
write stall imminent once the last memtable is half full, and otherwise it
reschedules the queued FlushRequest instead of actually proceeding with the
flush. The latter, by contrast, considers a stall imminent as soon as the
last memtable is in use at all, and makes the manual flush wait until some
other background flush job clears out memtables before it proceeds. If both
use the same criterion, we can guarantee that whenever
`WaitUntilFlushWouldNotStallWrites` is waiting, it is not because the
rescheduling logic is holding it back.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12771

Test Plan: Added unit test

Reviewed By: ajkr

Differential Revision: D58603746

Pulled By: jowlyzhang

fbshipit-source-id: 9fa1c87c0175d47a40f584dfb1b497baa576755b
---
 db/column_family.h                     |  4 ++++
 db/column_family_test.cc               | 12 +++++-----
 db/db_impl/db_impl_compaction_flush.cc | 33 +++++++++++++++-----------
 include/rocksdb/advanced_options.h     |  6 +++--
 4 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/db/column_family.h b/db/column_family.h
index 61c955f92c..18fc41e177 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -572,6 +572,10 @@ class ColumnFamilyData {
   // of its files (if missing)
   void RecoverEpochNumbers();
 
+  int GetUnflushedMemTableCountForWriteStallCheck() const {
+    return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed();
+  }
+
  private:
  friend class ColumnFamilySet;
  ColumnFamilyData(uint32_t id, const std::string& name,
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 17438c36a7..f845ad05e9 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -3916,9 +3916,6 @@ TEST_F(ManualFlushSkipRetainUDTTest, BulkLoading) {
 }
 
 TEST_F(ManualFlushSkipRetainUDTTest, AutomaticFlushQueued) {
-  // TODO(yuzhangyu): this is where the write for no write stall happens.
-  // We need to sovle that and remove this.
-  column_family_options_.max_write_buffer_number = 3;
   Open();
   ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));
 
@@ -3926,10 +3923,13 @@ TEST_F(ManualFlushSkipRetainUDTTest, AutomaticFlushQueued) {
   ASSERT_OK(dbfull()->TEST_SwitchWAL());
   CheckEffectiveCutoffTime(0);
 
-  ASSERT_OK(Put(0, "foo", EncodeAsUint64(2), "v2"));
+  // Default `max_write_buffer_number=2` used, writing another memtable can get
+  // automatic flush to proceed because of memory pressure. Not doing that so
+  // we can test automatic flush gets to proceed because of an ongoing manual
+  // flush attempt.
   ASSERT_OK(Flush(0));
-  CheckEffectiveCutoffTime(3);
-  CheckAutomaticFlushRetainUDT(4);
+  CheckEffectiveCutoffTime(2);
+  CheckAutomaticFlushRetainUDT(3);
 
   Close();
 }
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9643983786..ab28ab0f94 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -101,15 +101,20 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
   // alleviated if we continue with the flush instead of postponing it.
   const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
 
-  // Taking the status of the active Memtable into consideration so that we are
-  // not just checking if DB is currently already in write stall mode.
-  int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >=
-                             cfd->mem()->write_buffer_size() / 2
-                         ? 1
-                         : 0;
+  // Use the same criteria as WaitUntilFlushWouldNotStallWrites does w.r.t
+  // defining what a write stall is about to happen means. If this uses a
+  // stricter criteria, for example, a write stall is about to happen if the
+  // last memtable is 10% full, there is a possibility that manual flush could
+  // be waiting in `WaitUntilFlushWouldNotStallWrites` with the incorrect
+  // expectation that others will clear up the excessive memtables and
+  // eventually let it proceed. The others in this case won't start clearing
+  // until the last memtable is 10% full. To avoid that scenario, the criteria
+  // this uses should be the same or less strict than
+  // `WaitUntilFlushWouldNotStallWrites` does.
   WriteStallCondition write_stall =
       ColumnFamilyData::GetWriteStallConditionAndCause(
-          cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0,
+          cfd->GetUnflushedMemTableCountForWriteStallCheck(),
+          /*num_l0_files=*/0,
           /*num_compaction_needed_bytes=*/0, mutable_cf_options,
           *cfd->ioptions())
          .first;
@@ -2677,13 +2682,13 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
       // mode due to pending compaction bytes, but that's less common
       // No extra immutable Memtable will be created if the current Memtable is
      // empty.
-      int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1;
-      write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
-                                  cfd->imm()->NumNotFlushed() + mem_to_flush,
-                                  vstorage->l0_delay_trigger_count() + 1,
-                                  vstorage->estimated_compaction_needed_bytes(),
-                                  mutable_cf_options, *cfd->ioptions())
-                                  .first;
+      write_stall_condition =
+          ColumnFamilyData::GetWriteStallConditionAndCause(
+              cfd->GetUnflushedMemTableCountForWriteStallCheck(),
+              vstorage->l0_delay_trigger_count() + 1,
+              vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+              *cfd->ioptions())
+              .first;
     } while (write_stall_condition != WriteStallCondition::kNormal);
   }
   return Status::OK();
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 80933b1497..cbe1eb52fc 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1037,8 +1037,10 @@ struct AdvancedColumnFamilyOptions {
   // When setting this flag to `false`, users should also call
   // `DB::IncreaseFullHistoryTsLow` to set a cutoff timestamp for flush. RocksDB
   // refrains from flushing a memtable with data still above
-  // the cutoff timestamp with best effort. If this cutoff timestamp is not set,
-  // flushing continues normally.
+  // the cutoff timestamp with best effort. One limitation of this best effort
+  // is that when `max_write_buffer_number` is equal to or smaller than 2,
+  // RocksDB will not attempt to retain user-defined timestamps, all flush jobs
+  // continue normally.
   //
   // Users can do user-defined
   // multi-versioned read above the cutoff timestamp. When users try to read
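
Note on the fix's core invariant: the guarantee described in the summary boils
down to both call sites deriving the unflushed memtable count the same way.
Below is a minimal standalone sketch, not RocksDB code; all names in it
(MemTableStateSketch, UnflushedCountSketch, WriteStallImminentSketch) are
hypothetical, only the counting rule `(active memtable empty ? 0 : 1) +
immutable unflushed count` mirrors GetUnflushedMemTableCountForWriteStallCheck()
from the patch, and the threshold check is a simplified stand-in for the
memtable part of GetWriteStallConditionAndCause().

// Standalone sketch (not RocksDB code): why the FlushRequest rescheduling
// check and WaitUntilFlushWouldNotStallWrites must share one definition of
// "unflushed memtable count". All names here are illustrative only.
#include <cassert>

struct MemTableStateSketch {
  bool active_memtable_empty = true;  // state of the mutable memtable
  int num_immutable_unflushed = 0;    // memtables sealed but not yet flushed
};

// Mirrors the counting rule of GetUnflushedMemTableCountForWriteStallCheck():
// a non-empty active memtable counts as one more memtable needing a flush.
int UnflushedCountSketch(const MemTableStateSketch& s) {
  return (s.active_memtable_empty ? 0 : 1) + s.num_immutable_unflushed;
}

// Simplified stand-in for the memtable part of a write stall check: treat a
// stall as imminent once the count reaches max_write_buffer_number.
bool WriteStallImminentSketch(const MemTableStateSketch& s,
                              int max_write_buffer_number) {
  return UnflushedCountSketch(s) >= max_write_buffer_number;
}

int main() {
  MemTableStateSketch s;
  s.active_memtable_empty = false;  // writes have landed in the last memtable
  s.num_immutable_unflushed = 1;
  const int max_write_buffer_number = 2;

  // Both call sites evaluate the same predicate on the same state, so whenever
  // the manual-flush path would wait for the stall to clear, the rescheduling
  // path also sees the stall and lets a queued flush proceed instead of
  // postponing it. With two different predicates, the waiter could block on a
  // stall that the rescheduler never acknowledges.
  bool waiter_sees_stall = WriteStallImminentSketch(s, max_write_buffer_number);
  bool rescheduler_sees_stall =
      WriteStallImminentSketch(s, max_write_buffer_number);
  assert(waiter_sees_stall == rescheduler_sees_stall);
  return 0;
}

Because the postpone-to-retain-UDT decision and the manual-flush wait evaluate
one shared predicate, the rescheduler can never keep postponing a flush in a
state where the waiter already considers a stall imminent, which is why the
hang described in the summary goes away.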