diff --git a/db/memtable.cc b/db/memtable.cc index 1a854423f2..872c176b9e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -61,6 +61,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), + strict_max_successive_merges( + mutable_cf_options.strict_max_successive_merges), statistics(ioptions.stats), merge_operator(ioptions.merge_operator.get()), info_log(ioptions.logger), diff --git a/db/memtable.h b/db/memtable.h index b3206bf2ac..7f3db0d05b 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -54,6 +54,7 @@ struct ImmutableMemTableOptions { Slice delta_value, std::string* merged_value); size_t max_successive_merges; + bool strict_max_successive_merges; Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; diff --git a/db/write_batch.cc b/db/write_batch.cc index 09fa2c371e..26f9b7d85d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2513,6 +2513,11 @@ class MemTableInserter : public WriteBatch::Handler { // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; + if (!moptions->strict_max_successive_merges) { + // Blocking the write path with read I/O is typically unacceptable, so + // only do this merge when the operands are all found in memory. + read_options.read_tier = kBlockCacheTier; + } read_options.snapshot = &read_from_snapshot; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 993a07ee0a..63001cad4d 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -247,6 +247,7 @@ bool StressTest::BuildOptionsTable() { }}, {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}}, {"max_successive_merges", {"0", "2", "4"}}, + {"strict_max_successive_merges", {"false", "true"}}, {"inplace_update_num_locks", {"100", "200", "300"}}, // TODO: re-enable once internal task T124324915 is fixed. // {"experimental_mempurge_threshold", {"0.0", "1.0"}}, diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 0c1f52f676..f96f9b1d0c 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -649,18 +649,29 @@ struct AdvancedColumnFamilyOptions { TablePropertiesCollectorFactories table_properties_collector_factories; // Maximum number of successive merge operations on a key in the memtable. + // It may be violated when filesystem reads would be needed to stay under the + // limit, unless `strict_max_successive_merges` is explicitly set. // // When a merge operation is added to the memtable and the maximum number of - // successive merges is reached, the value of the key will be calculated and - // inserted into the memtable instead of the merge operation. This will - // ensure that there are never more than max_successive_merges merge - // operations in the memtable. + // successive merges is reached, RocksDB will attempt to read the value. Upon + // success, the value will be inserted into the memtable instead of the merge + // operation. // // Default: 0 (disabled) // // Dynamically changeable through SetOptions() API size_t max_successive_merges = 0; + // Whether to allow filesystem reads to stay under the `max_successive_merges` + // limit. When true, this can lead to merge writes blocking the write path + // waiting on filesystem reads. + // + // This option is temporary in case the recent change to disallow filesystem + // reads during merge writes has a problem and users need to undo it quickly. + // + // Default: false + bool strict_max_successive_merges = false; + // This flag specifies that the implementation should optimize the filters // mainly for cases where keys are found rather than also optimize for keys // missed. This would be used in cases where the application knows that diff --git a/options/cf_options.cc b/options/cf_options.cc index 374522c21a..b4b28ea8b5 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -339,6 +339,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, max_successive_merges), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strict_max_successive_merges", + {offsetof(struct MutableCFOptions, strict_max_successive_merges), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"memtable_huge_page_size", {offsetof(struct MutableCFOptions, memtable_huge_page_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -1053,6 +1057,8 @@ void MutableCFOptions::Dump(Logger* log) const { ROCKS_LOG_INFO(log, " max_successive_merges: %" ROCKSDB_PRIszt, max_successive_merges); + ROCKS_LOG_INFO(log, " strict_max_successive_merges: %d", + strict_max_successive_merges); ROCKS_LOG_INFO(log, " inplace_update_num_locks: %" ROCKSDB_PRIszt, inplace_update_num_locks); diff --git a/options/cf_options.h b/options/cf_options.h index c27c81a456..955397bcb0 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -118,6 +118,7 @@ struct MutableCFOptions { memtable_whole_key_filtering(options.memtable_whole_key_filtering), memtable_huge_page_size(options.memtable_huge_page_size), max_successive_merges(options.max_successive_merges), + strict_max_successive_merges(options.strict_max_successive_merges), inplace_update_num_locks(options.inplace_update_num_locks), prefix_extractor(options.prefix_extractor), experimental_mempurge_threshold( @@ -186,6 +187,7 @@ struct MutableCFOptions { memtable_whole_key_filtering(false), memtable_huge_page_size(0), max_successive_merges(0), + strict_max_successive_merges(false), inplace_update_num_locks(0), prefix_extractor(nullptr), experimental_mempurge_threshold(0.0), @@ -251,6 +253,7 @@ struct MutableCFOptions { bool memtable_whole_key_filtering; size_t memtable_huge_page_size; size_t max_successive_merges; + bool strict_max_successive_merges; size_t inplace_update_num_locks; std::shared_ptr prefix_extractor; // [experimental] diff --git a/options/options.cc b/options/options.cc index e20f6b0fb2..b39ab6445d 100644 --- a/options/options.cc +++ b/options/options.cc @@ -85,6 +85,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) table_properties_collector_factories( options.table_properties_collector_factories), max_successive_merges(options.max_successive_merges), + strict_max_successive_merges(options.strict_max_successive_merges), optimize_filters_for_hits(options.optimize_filters_for_hits), paranoid_file_checks(options.paranoid_file_checks), force_consistency_checks(options.force_consistency_checks), @@ -395,6 +396,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { log, " Options.max_successive_merges: %" ROCKSDB_PRIszt, max_successive_merges); + ROCKS_LOG_HEADER(log, + " Options.strict_max_successive_merges: %d", + strict_max_successive_merges); ROCKS_LOG_HEADER(log, " Options.optimize_filters_for_hits: %d", optimize_filters_for_hits); diff --git a/options/options_helper.cc b/options/options_helper.cc index ef679861d8..738d9b8380 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -204,6 +204,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->strict_max_successive_merges = moptions.strict_max_successive_merges; cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; cf_opts->prefix_extractor = moptions.prefix_extractor; cf_opts->experimental_mempurge_threshold = diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 37a6eb17b8..646f8487c6 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -492,6 +492,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" "max_successive_merges=5497;" + "strict_max_successive_merges=true;" "max_sequential_skip_in_iterations=4294971408;" "arena_block_size=1893;" "target_file_size_multiplier=35;" diff --git a/options/options_test.cc b/options/options_test.cc index 62a24c7d93..addb2d5e64 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -115,6 +115,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"memtable_huge_page_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, + {"strict_max_successive_merges", "true"}, {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"experimental_mempurge_threshold", "0.003"}, @@ -270,6 +271,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); @@ -2333,6 +2335,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"memtable_huge_page_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, + {"strict_max_successive_merges", "true"}, {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"experimental_mempurge_threshold", "0.003"}, @@ -2484,6 +2487,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 968f567404..f90688a945 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -370,6 +370,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); cf_opt->enable_blob_files = rnd->Uniform(2); cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); + cf_opt->strict_max_successive_merges = rnd->Uniform(2); // double options cf_opt->memtable_prefix_bloom_size_ratio = diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 9aab37c703..8dff2d99d9 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1638,6 +1638,10 @@ DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge operations on a key in the " "memtable"); +DEFINE_bool(strict_max_successive_merges, false, + "Whether to issue filesystem reads to keep within " + "`max_successive_merges` limit"); + static bool ValidatePrefixSize(const char* flagname, int32_t value) { if (value < 0 || value >= 2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", @@ -4626,6 +4630,7 @@ class Benchmark { } } options.max_successive_merges = FLAGS_max_successive_merges; + options.strict_max_successive_merges = FLAGS_strict_max_successive_merges; options.report_bg_io_stats = FLAGS_report_bg_io_stats; // set universal style compaction configurations, if applicable diff --git a/unreleased_history/behavior_changes/max_successive_merges_io.md b/unreleased_history/behavior_changes/max_successive_merges_io.md new file mode 100644 index 0000000000..c331b40497 --- /dev/null +++ b/unreleased_history/behavior_changes/max_successive_merges_io.md @@ -0,0 +1 @@ +* Merge writes will only keep merge operand count within `ColumnFamilyOptions::max_successive_merges` when the key's merge operands are all found in memory, unless `strict_max_successive_merges` is explicitly set.