No filesystem reads during `Merge()` writes (#12365)

Summary:
The occasional filesystem read in the write path has caused user pain, and it doesn't seem very useful: it only limits the merge chain length within a single component, and it only helps merge values that are uncached (i.e., infrequently read). This PR proposes allowing `max_successive_merges` to be exceeded when the value cannot be read from in-memory components. I included a rollback flag (`strict_max_successive_merges`) just in case.
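
As a rough sketch (not part of this PR's diff), configuring the new option and rolling it back at runtime could look like the following; the DB path is a placeholder and the merge operator setup is omitted:

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // A merge operator is required before calling Merge(); omitted for brevity.
  options.max_successive_merges = 100;
  // Post-PR default: the limit may be exceeded rather than issuing a
  // filesystem read from the write path.
  options.strict_max_successive_merges = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);
  if (!s.ok()) return 1;

  // Rollback flag: restore the old strict behavior without a restart. Both
  // options are dynamically changeable through SetOptions().
  s = db->SetOptions({{"strict_max_successive_merges", "true"}});

  delete db;
  return s.ok() ? 0 : 1;
}
```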

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12365

Test Plan:
`rocksdb.block.cache.data.add` counts data blocks added to the block cache, which here can only come from filesystem reads. Since the benchmark is write-only, compaction is disabled, and flush doesn't read data blocks, any nonzero value means a user write issued the read.

```
$ for s in false true; do echo -n "strict_max_successive_merges=$s: " && ./db_bench -value_size=64 -write_buffer_size=131072 -writes=128 -num=1 -benchmarks=mergerandom,flush,mergerandom -merge_operator=stringappend -disable_auto_compactions=true -compression_type=none -strict_max_successive_merges=$s -max_successive_merges=100 -statistics=true |& grep 'block.cache.data.add COUNT' ; done
strict_max_successive_merges=false: rocksdb.block.cache.data.add COUNT : 0
strict_max_successive_merges=true: rocksdb.block.cache.data.add COUNT : 1
```
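
The same check could be done programmatically instead of grepping db_bench output; a minimal sketch, assuming `options.statistics` was set with `rocksdb::CreateDBStatistics()` before opening the DB:

```cpp
#include <memory>

#include "rocksdb/statistics.h"

// In a write-only workload with auto compactions disabled, any data block
// added to the block cache must have been read from the filesystem by a
// user write (flush doesn't read data blocks).
bool WritePathIssuedRead(const std::shared_ptr<rocksdb::Statistics>& stats) {
  return stats->getTickerCount(rocksdb::BLOCK_CACHE_DATA_ADD) > 0;
}
```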

Reviewed By: hx235

Differential Revision: D53982520

Pulled By: ajkr

fbshipit-source-id: e40f761a60bd601f232417ac0058e4a33ee9c0f4
Andrew Kryczka 2024-02-21 13:15:27 -08:00 committed by Facebook GitHub Bot
parent 5950907a82
commit 8e29f243c9
14 changed files with 50 additions and 4 deletions

@@ -61,6 +61,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
       inplace_callback(ioptions.inplace_callback),
       max_successive_merges(mutable_cf_options.max_successive_merges),
+      strict_max_successive_merges(
+          mutable_cf_options.strict_max_successive_merges),
       statistics(ioptions.stats),
       merge_operator(ioptions.merge_operator.get()),
       info_log(ioptions.logger),

@@ -54,6 +54,7 @@ struct ImmutableMemTableOptions {
                              Slice delta_value,
                              std::string* merged_value);
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   Statistics* statistics;
   MergeOperator* merge_operator;
   Logger* info_log;

@@ -2513,6 +2513,11 @@ class MemTableInserter : public WriteBatch::Handler {
       // TODO: plumb Env::IOActivity, Env::IOPriority
       ReadOptions read_options;
+      if (!moptions->strict_max_successive_merges) {
+        // Blocking the write path with read I/O is typically unacceptable, so
+        // only do this merge when the operands are all found in memory.
+        read_options.read_tier = kBlockCacheTier;
+      }
       read_options.snapshot = &read_from_snapshot;
       auto cf_handle = cf_mems_->GetColumnFamilyHandle();

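For context, the `kBlockCacheTier` read tier used above makes lookups fail with `Status::Incomplete()` rather than touch the filesystem. A standalone sketch of that contract (the helper name is made up for illustration):

```cpp
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// With kBlockCacheTier, data is served only from memtables and the block
// cache; a lookup that would need file I/O returns Status::Incomplete()
// instead of blocking on a read.
rocksdb::Status InMemoryOnlyGet(rocksdb::DB* db, const rocksdb::Slice& key,
                                std::string* value) {
  rocksdb::ReadOptions ro;
  ro.read_tier = rocksdb::kBlockCacheTier;
  rocksdb::Status s = db->Get(ro, key, value);
  // s.IsIncomplete() means the value couldn't be assembled without filesystem
  // reads; in the write path above, the merge operand is then kept as-is and
  // `max_successive_merges` may be exceeded.
  return s;
}
```
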
@@ -247,6 +247,7 @@ bool StressTest::BuildOptionsTable() {
        }},
       {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}},
       {"max_successive_merges", {"0", "2", "4"}},
+      {"strict_max_successive_merges", {"false", "true"}},
       {"inplace_update_num_locks", {"100", "200", "300"}},
       // TODO: re-enable once internal task T124324915 is fixed.
       // {"experimental_mempurge_threshold", {"0.0", "1.0"}},

@@ -649,18 +649,29 @@ struct AdvancedColumnFamilyOptions {
   TablePropertiesCollectorFactories table_properties_collector_factories;
 
   // Maximum number of successive merge operations on a key in the memtable.
+  // It may be violated when filesystem reads would be needed to stay under the
+  // limit, unless `strict_max_successive_merges` is explicitly set.
   //
   // When a merge operation is added to the memtable and the maximum number of
-  // successive merges is reached, the value of the key will be calculated and
-  // inserted into the memtable instead of the merge operation. This will
-  // ensure that there are never more than max_successive_merges merge
-  // operations in the memtable.
+  // successive merges is reached, RocksDB will attempt to read the value. Upon
+  // success, the value will be inserted into the memtable instead of the merge
+  // operation.
   //
   // Default: 0 (disabled)
   //
   // Dynamically changeable through SetOptions() API
   size_t max_successive_merges = 0;
+
+  // Whether to allow filesystem reads to stay under the `max_successive_merges`
+  // limit. When true, this can lead to merge writes blocking the write path
+  // waiting on filesystem reads.
+  //
+  // This option is temporary in case the recent change to disallow filesystem
+  // reads during merge writes has a problem and users need to undo it quickly.
+  //
+  // Default: false
+  bool strict_max_successive_merges = false;
+
   // This flag specifies that the implementation should optimize the filters
   // mainly for cases where keys are found rather than also optimize for keys
   // missed. This would be used in cases where the application knows that

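To make the documented semantics concrete, here is a hypothetical walkthrough assuming `max_successive_merges = 2` and a string-append merge operator with ',' delimiter (DB and options setup omitted):

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

void MergeLimitWalkthrough(rocksdb::DB* db) {
  rocksdb::WriteOptions wo;
  db->Merge(wo, "k", "a");  // memtable holds 1 merge operand for "k"
  db->Merge(wo, "k", "b");  // 2 operands: the limit is reached
  // For the next merge, RocksDB attempts to read "k" and collapse the chain.
  // If all operands are found in memory, the merged value "a,b,c" is inserted
  // as a full value instead of a 3rd operand. If a filesystem read would be
  // needed, the limit is exceeded (post-PR default) unless
  // strict_max_successive_merges is set to true.
  db->Merge(wo, "k", "c");
}
```
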
@@ -339,6 +339,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, max_successive_merges),
           OptionType::kSizeT, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"strict_max_successive_merges",
+         {offsetof(struct MutableCFOptions, strict_max_successive_merges),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"memtable_huge_page_size",
          {offsetof(struct MutableCFOptions, memtable_huge_page_size),
           OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -1053,6 +1057,8 @@ void MutableCFOptions::Dump(Logger* log) const {
   ROCKS_LOG_INFO(log,
                  " max_successive_merges: %" ROCKSDB_PRIszt,
                  max_successive_merges);
+  ROCKS_LOG_INFO(log, " strict_max_successive_merges: %d",
+                 strict_max_successive_merges);
   ROCKS_LOG_INFO(log,
                  " inplace_update_num_locks: %" ROCKSDB_PRIszt,
                  inplace_update_num_locks);

@@ -118,6 +118,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(options.memtable_whole_key_filtering),
         memtable_huge_page_size(options.memtable_huge_page_size),
         max_successive_merges(options.max_successive_merges),
+        strict_max_successive_merges(options.strict_max_successive_merges),
         inplace_update_num_locks(options.inplace_update_num_locks),
         prefix_extractor(options.prefix_extractor),
         experimental_mempurge_threshold(
@@ -186,6 +187,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(false),
         memtable_huge_page_size(0),
         max_successive_merges(0),
+        strict_max_successive_merges(false),
         inplace_update_num_locks(0),
         prefix_extractor(nullptr),
         experimental_mempurge_threshold(0.0),
@@ -251,6 +253,7 @@ struct MutableCFOptions {
   bool memtable_whole_key_filtering;
   size_t memtable_huge_page_size;
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   size_t inplace_update_num_locks;
   std::shared_ptr<const SliceTransform> prefix_extractor;
   // [experimental]

@@ -85,6 +85,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       table_properties_collector_factories(
           options.table_properties_collector_factories),
       max_successive_merges(options.max_successive_merges),
+      strict_max_successive_merges(options.strict_max_successive_merges),
       optimize_filters_for_hits(options.optimize_filters_for_hits),
       paranoid_file_checks(options.paranoid_file_checks),
       force_consistency_checks(options.force_consistency_checks),
@@ -395,6 +396,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log,
       " Options.max_successive_merges: %" ROCKSDB_PRIszt,
       max_successive_merges);
+  ROCKS_LOG_HEADER(log,
+                   " Options.strict_max_successive_merges: %d",
+                   strict_max_successive_merges);
   ROCKS_LOG_HEADER(log,
                    " Options.optimize_filters_for_hits: %d",
                    optimize_filters_for_hits);

@@ -204,6 +204,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering;
   cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size;
   cf_opts->max_successive_merges = moptions.max_successive_merges;
+  cf_opts->strict_max_successive_merges = moptions.strict_max_successive_merges;
   cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks;
   cf_opts->prefix_extractor = moptions.prefix_extractor;
   cf_opts->experimental_mempurge_threshold =

@@ -492,6 +492,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "target_file_size_base=4294976376;"
       "memtable_huge_page_size=2557;"
       "max_successive_merges=5497;"
+      "strict_max_successive_merges=true;"
       "max_sequential_skip_in_iterations=4294971408;"
       "arena_block_size=1893;"
       "target_file_size_multiplier=35;"

@@ -115,6 +115,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -270,6 +271,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
@@ -2333,6 +2335,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -2484,6 +2487,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");

@@ -370,6 +370,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
   cf_opt->enable_blob_files = rnd->Uniform(2);
   cf_opt->enable_blob_garbage_collection = rnd->Uniform(2);
+  cf_opt->strict_max_successive_merges = rnd->Uniform(2);
   // double options
   cf_opt->memtable_prefix_bloom_size_ratio =

@@ -1638,6 +1638,10 @@ DEFINE_int32(max_successive_merges, 0,
              "Maximum number of successive merge operations on a key in the "
              "memtable");
 
+DEFINE_bool(strict_max_successive_merges, false,
+            "Whether to issue filesystem reads to keep within "
+            "`max_successive_merges` limit");
+
 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
   if (value < 0 || value >= 2000000000) {
     fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
@@ -4626,6 +4630,7 @@ class Benchmark {
       }
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
+    options.strict_max_successive_merges = FLAGS_strict_max_successive_merges;
     options.report_bg_io_stats = FLAGS_report_bg_io_stats;
     // set universal style compaction configurations, if applicable

@@ -0,0 +1 @@
+* Merge writes will only keep merge operand count within `ColumnFamilyOptions::max_successive_merges` when the key's merge operands are all found in memory, unless `strict_max_successive_merges` is explicitly set.