From 8e29f243c9bf943e180928b037c4bf4061747891 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Wed, 21 Feb 2024 13:15:27 -0800
Subject: [PATCH] No filesystem reads during `Merge()` writes (#12365)

Summary:
This occasional filesystem read in the write path has caused user pain. It
doesn't seem very useful considering it only limits one component's merge
chain length, and only helps merge uncached (i.e., infrequently read) values.
This PR proposes allowing `max_successive_merges` to be exceeded when the
value cannot be read from in-memory components. I included a rollback flag
(`strict_max_successive_merges`) just in case.
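
In sketch form, the change is a restriction on the read that enforces the
limit (the real code is in the `db/write_batch.cc` hunk below; `moptions`
there is the memtable options struct):

```
// When a key's merge chain reaches `max_successive_merges`, the write path
// tries to read the key's existing value so the chain can be folded into a
// single value. With this change, that read is restricted to in-memory tiers
// by default:
ReadOptions read_options;
if (!moptions->strict_max_successive_merges) {
  // Under `kBlockCacheTier`, a read that would need filesystem I/O fails
  // fast with `Status::Incomplete()` instead of blocking the write; the
  // merge operand is then written as usual and the chain may exceed
  // `max_successive_merges`.
  read_options.read_tier = kBlockCacheTier;
}
```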
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12365

Test Plan:
"rocksdb.block.cache.data.add" is the number of data blocks read from the
filesystem. Since the benchmark is write-only, compaction is disabled, and
flush doesn't read data blocks, any nonzero value means the user write issued
the read.

```
$ for s in false true; do echo -n "strict_max_successive_merges=$s: " && ./db_bench -value_size=64 -write_buffer_size=131072 -writes=128 -num=1 -benchmarks=mergerandom,flush,mergerandom -merge_operator=stringappend -disable_auto_compactions=true -compression_type=none -strict_max_successive_merges=$s -max_successive_merges=100 -statistics=true |& grep 'block.cache.data.add COUNT' ; done
strict_max_successive_merges=false: rocksdb.block.cache.data.add COUNT : 0
strict_max_successive_merges=true: rocksdb.block.cache.data.add COUNT : 1
```

Reviewed By: hx235

Differential Revision: D53982520

Pulled By: ajkr

fbshipit-source-id: e40f761a60bd601f232417ac0058e4a33ee9c0f4
---
 db/memtable.cc                        |  2 ++
 db/memtable.h                         |  1 +
 db/write_batch.cc                     |  5 +++++
 db_stress_tool/db_stress_test_base.cc |  1 +
 include/rocksdb/advanced_options.h    | 19 +++++++++++++++----
 options/cf_options.cc                 |  6 ++++++
 options/cf_options.h                  |  3 +++
 options/options.cc                    |  4 ++++
 options/options_helper.cc             |  1 +
 options/options_settable_test.cc      |  1 +
 options/options_test.cc               |  4 ++++
 test_util/testutil.cc                 |  1 +
 tools/db_bench_tool.cc                |  5 +++++
 .../max_successive_merges_io.md       |  1 +
 14 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/max_successive_merges_io.md

diff --git a/db/memtable.cc b/db/memtable.cc
index 1a854423f2..872c176b9e 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -61,6 +61,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
       inplace_callback(ioptions.inplace_callback),
       max_successive_merges(mutable_cf_options.max_successive_merges),
+      strict_max_successive_merges(
+          mutable_cf_options.strict_max_successive_merges),
       statistics(ioptions.stats),
       merge_operator(ioptions.merge_operator.get()),
       info_log(ioptions.logger),
diff --git a/db/memtable.h b/db/memtable.h
index b3206bf2ac..7f3db0d05b 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -54,6 +54,7 @@ struct ImmutableMemTableOptions {
                                    Slice delta_value, std::string* merged_value);
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   Statistics* statistics;
   MergeOperator* merge_operator;
   Logger* info_log;
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 09fa2c371e..26f9b7d85d 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -2513,6 +2513,11 @@ class MemTableInserter : public WriteBatch::Handler {
         // TODO: plumb Env::IOActivity, Env::IOPriority
         ReadOptions read_options;
+        if (!moptions->strict_max_successive_merges) {
+          // Blocking the write path with read I/O is typically unacceptable,
+          // so only do this merge when the operands are all found in memory.
+          read_options.read_tier = kBlockCacheTier;
+        }
         read_options.snapshot = &read_from_snapshot;
 
         auto cf_handle = cf_mems_->GetColumnFamilyHandle();
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 993a07ee0a..63001cad4d 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -247,6 +247,7 @@ bool StressTest::BuildOptionsTable() {
         }},
         {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}},
         {"max_successive_merges", {"0", "2", "4"}},
+        {"strict_max_successive_merges", {"false", "true"}},
         {"inplace_update_num_locks", {"100", "200", "300"}},
         // TODO: re-enable once internal task T124324915 is fixed.
         // {"experimental_mempurge_threshold", {"0.0", "1.0"}},
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 0c1f52f676..f96f9b1d0c 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -649,18 +649,29 @@ struct AdvancedColumnFamilyOptions {
   TablePropertiesCollectorFactories table_properties_collector_factories;
 
   // Maximum number of successive merge operations on a key in the memtable.
+  // It may be violated when filesystem reads would be needed to stay under the
+  // limit, unless `strict_max_successive_merges` is explicitly set.
   //
   // When a merge operation is added to the memtable and the maximum number of
-  // successive merges is reached, the value of the key will be calculated and
-  // inserted into the memtable instead of the merge operation. This will
-  // ensure that there are never more than max_successive_merges merge
-  // operations in the memtable.
+  // successive merges is reached, RocksDB will attempt to read the value. Upon
+  // success, the value will be inserted into the memtable instead of the merge
+  // operation.
   //
   // Default: 0 (disabled)
   //
   // Dynamically changeable through SetOptions() API
   size_t max_successive_merges = 0;
 
+  // Whether to allow filesystem reads to stay under the `max_successive_merges`
+  // limit. When true, this can lead to merge writes blocking the write path
+  // waiting on filesystem reads.
+  //
+  // This option is temporary in case the recent change to disallow filesystem
+  // reads during merge writes has a problem and users need to undo it quickly.
+  //
+  // Default: false
+  bool strict_max_successive_merges = false;
+
   // This flag specifies that the implementation should optimize the filters
   // mainly for cases where keys are found rather than also optimize for keys
   // missed. This would be used in cases where the application knows that
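
For illustration, a minimal sketch of an application configuring the options
documented above; the `AppendOperator`, the DB path, and the chosen limit are
hypothetical, not part of this patch:

```
#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/options.h"

// Toy associative operator that appends operands with a ',' delimiter.
class AppendOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/, const rocksdb::Slice* existing,
             const rocksdb::Slice& value, std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    new_value->clear();
    if (existing != nullptr) {
      new_value->assign(existing->data(), existing->size());
      new_value->push_back(',');
    }
    new_value->append(value.data(), value.size());
    return true;
  }

  const char* Name() const override { return "AppendOperator"; }
};

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.merge_operator = std::make_shared<AppendOperator>();
  // Fold merge chains once they reach 100 operands, but only when the
  // existing value can be read from memtables or block cache (the new
  // default). strict_max_successive_merges = true restores filesystem
  // reads for enforcing the limit.
  options.max_successive_merges = 100;
  options.strict_max_successive_merges = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/merge_demo", &db);
  assert(s.ok());
  s = db->Merge(rocksdb::WriteOptions(), "key", "operand");
  assert(s.ok());
  delete db;
  return 0;
}
```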
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 374522c21a..b4b28ea8b5 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -339,6 +339,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct MutableCFOptions, max_successive_merges),
          OptionType::kSizeT, OptionVerificationType::kNormal,
          OptionTypeFlags::kMutable}},
+       {"strict_max_successive_merges",
+        {offsetof(struct MutableCFOptions, strict_max_successive_merges),
+         OptionType::kBoolean, OptionVerificationType::kNormal,
+         OptionTypeFlags::kMutable}},
        {"memtable_huge_page_size",
         {offsetof(struct MutableCFOptions, memtable_huge_page_size),
          OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -1053,6 +1057,8 @@ void MutableCFOptions::Dump(Logger* log) const {
   ROCKS_LOG_INFO(log,
                  "                 max_successive_merges: %" ROCKSDB_PRIszt,
                  max_successive_merges);
+  ROCKS_LOG_INFO(log, "          strict_max_successive_merges: %d",
+                 strict_max_successive_merges);
   ROCKS_LOG_INFO(log,
                  "              inplace_update_num_locks: %" ROCKSDB_PRIszt,
                  inplace_update_num_locks);
diff --git a/options/cf_options.h b/options/cf_options.h
index c27c81a456..955397bcb0 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -118,6 +118,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(options.memtable_whole_key_filtering),
         memtable_huge_page_size(options.memtable_huge_page_size),
         max_successive_merges(options.max_successive_merges),
+        strict_max_successive_merges(options.strict_max_successive_merges),
         inplace_update_num_locks(options.inplace_update_num_locks),
         prefix_extractor(options.prefix_extractor),
         experimental_mempurge_threshold(
@@ -186,6 +187,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(false),
         memtable_huge_page_size(0),
         max_successive_merges(0),
+        strict_max_successive_merges(false),
         inplace_update_num_locks(0),
         prefix_extractor(nullptr),
         experimental_mempurge_threshold(0.0),
@@ -251,6 +253,7 @@ struct MutableCFOptions {
   bool memtable_whole_key_filtering;
   size_t memtable_huge_page_size;
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   size_t inplace_update_num_locks;
   std::shared_ptr<const SliceTransform> prefix_extractor;
   // [experimental]
diff --git a/options/options.cc b/options/options.cc
index e20f6b0fb2..b39ab6445d 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -85,6 +85,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       table_properties_collector_factories(
           options.table_properties_collector_factories),
       max_successive_merges(options.max_successive_merges),
+      strict_max_successive_merges(options.strict_max_successive_merges),
       optimize_filters_for_hits(options.optimize_filters_for_hits),
       paranoid_file_checks(options.paranoid_file_checks),
       force_consistency_checks(options.force_consistency_checks),
@@ -395,6 +396,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "             Options.max_successive_merges: %" ROCKSDB_PRIszt,
       max_successive_merges);
+  ROCKS_LOG_HEADER(log,
+                   "      Options.strict_max_successive_merges: %d",
+                   strict_max_successive_merges);
   ROCKS_LOG_HEADER(log, "       Options.optimize_filters_for_hits: %d",
                    optimize_filters_for_hits);
diff --git a/options/options_helper.cc b/options/options_helper.cc
index ef679861d8..738d9b8380 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -204,6 +204,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->memtable_whole_key_filtering =
       moptions.memtable_whole_key_filtering;
   cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size;
   cf_opts->max_successive_merges = moptions.max_successive_merges;
+  cf_opts->strict_max_successive_merges = moptions.strict_max_successive_merges;
   cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks;
   cf_opts->prefix_extractor = moptions.prefix_extractor;
   cf_opts->experimental_mempurge_threshold =
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 37a6eb17b8..646f8487c6 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -492,6 +492,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "target_file_size_base=4294976376;"
       "memtable_huge_page_size=2557;"
       "max_successive_merges=5497;"
+      "strict_max_successive_merges=true;"
       "max_sequential_skip_in_iterations=4294971408;"
       "arena_block_size=1893;"
       "target_file_size_multiplier=35;"
diff --git a/options/options_test.cc b/options/options_test.cc
index 62a24c7d93..addb2d5e64 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -115,6 +115,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -270,6 +271,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
@@ -2333,6 +2335,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -2484,6 +2487,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index 968f567404..f90688a945 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -370,6 +370,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
   cf_opt->enable_blob_files = rnd->Uniform(2);
   cf_opt->enable_blob_garbage_collection = rnd->Uniform(2);
+  cf_opt->strict_max_successive_merges = rnd->Uniform(2);
 
   // double options
   cf_opt->memtable_prefix_bloom_size_ratio =
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 9aab37c703..8dff2d99d9 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1638,6 +1638,10 @@ DEFINE_int32(max_successive_merges, 0,
             "Maximum number of successive merge operations on a key in the "
             "memtable");
 
+DEFINE_bool(strict_max_successive_merges, false,
+            "Whether to issue filesystem reads to keep within the "
+            "`max_successive_merges` limit");
+
 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
   if (value < 0 || value >= 2000000000) {
     fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
@@ -4626,6 +4630,7 @@ class Benchmark {
       }
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
+    options.strict_max_successive_merges = FLAGS_strict_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;
 
     // set universal style compaction configurations, if applicable
diff --git a/unreleased_history/behavior_changes/max_successive_merges_io.md b/unreleased_history/behavior_changes/max_successive_merges_io.md
new file mode 100644
index 0000000000..c331b40497
--- /dev/null
+++ b/unreleased_history/behavior_changes/max_successive_merges_io.md
@@ -0,0 +1 @@
+* Merge writes will only keep merge operand count within `ColumnFamilyOptions::max_successive_merges` when the key's merge operands are all found in memory, unless `strict_max_successive_merges` is explicitly set.