mirror of https://github.com/facebook/rocksdb.git
No filesystem reads during `Merge()` writes (#12365)
Summary: This occasional filesystem read in the write path has caused user pain. It doesn't seem very useful considering it only limits one component's merge chain length, and only helps merge uncached (i.e., infrequently read) values. This PR proposes allowing `max_successive_merges` to be exceeded when the value cannot be read from in-memory components. I included a rollback flag (`strict_max_successive_merges`) just in case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12365

Test Plan: "rocksdb.block.cache.data.add" is the number of data blocks read from the filesystem. Since the benchmark is write-only, compaction is disabled, and flush doesn't read data blocks, any nonzero value means a user write issued the read.

```
$ for s in false true; do echo -n "strict_max_successive_merges=$s: " && ./db_bench -value_size=64 -write_buffer_size=131072 -writes=128 -num=1 -benchmarks=mergerandom,flush,mergerandom -merge_operator=stringappend -disable_auto_compactions=true -compression_type=none -strict_max_successive_merges=$s -max_successive_merges=100 -statistics=true |& grep 'block.cache.data.add COUNT' ; done
strict_max_successive_merges=false: rocksdb.block.cache.data.add COUNT : 0
strict_max_successive_merges=true: rocksdb.block.cache.data.add COUNT : 1
```

Reviewed By: hx235

Differential Revision: D53982520

Pulled By: ajkr

fbshipit-source-id: e40f761a60bd601f232417ac0058e4a33ee9c0f4
parent 5950907a82
commit 8e29f243c9
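For orientation before the diff: a minimal, self-contained sketch (not part of this commit; `AppendOperator` and the `/tmp/merge_demo` path are made up for illustration) of how an application would configure the two options discussed above:

```cpp
#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/options.h"

// Toy associative operator that appends operands with a ',' separator.
class AppendOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/,
             const rocksdb::Slice* existing_value,
             const rocksdb::Slice& value, std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    new_value->clear();
    if (existing_value != nullptr) {
      new_value->assign(existing_value->data(), existing_value->size());
      new_value->append(",");
    }
    new_value->append(value.data(), value.size());
    return true;
  }
  const char* Name() const override { return "AppendOperator"; }
};

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.merge_operator = std::make_shared<AppendOperator>();
  // Fold a key's merge chain once it reaches 100 operands...
  options.max_successive_merges = 100;
  // ...but only when the operands are readable without filesystem I/O.
  // Setting this to true restores the old strict, I/O-issuing behavior.
  options.strict_max_successive_merges = false;  // the new default

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/merge_demo", &db);
  assert(s.ok());
  s = db->Merge(rocksdb::WriteOptions(), "key", "operand");
  assert(s.ok());
  delete db;
  return 0;
}
```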
@@ -61,6 +61,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
       inplace_callback(ioptions.inplace_callback),
       max_successive_merges(mutable_cf_options.max_successive_merges),
+      strict_max_successive_merges(
+          mutable_cf_options.strict_max_successive_merges),
       statistics(ioptions.stats),
       merge_operator(ioptions.merge_operator.get()),
       info_log(ioptions.logger),
@@ -54,6 +54,7 @@ struct ImmutableMemTableOptions {
                                    Slice delta_value,
                                    std::string* merged_value);
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   Statistics* statistics;
   MergeOperator* merge_operator;
   Logger* info_log;
@@ -2513,6 +2513,11 @@ class MemTableInserter : public WriteBatch::Handler {

       // TODO: plumb Env::IOActivity, Env::IOPriority
       ReadOptions read_options;
+      if (!moptions->strict_max_successive_merges) {
+        // Blocking the write path with read I/O is typically unacceptable, so
+        // only do this merge when the operands are all found in memory.
+        read_options.read_tier = kBlockCacheTier;
+      }
       read_options.snapshot = &read_from_snapshot;

       auto cf_handle = cf_mems_->GetColumnFamilyHandle();
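The key mechanism here is `ReadOptions::read_tier = kBlockCacheTier`, which forbids file reads: if the existing value or operands are not in the memtables or block cache, the internal lookup fails with `Status::Incomplete()` and the new merge operand is inserted as-is instead of being folded. A sketch of the same pattern against the public API (`ValueInMemory` is a hypothetical helper, not PR code):

```cpp
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Hypothetical helper: returns true iff `key` is readable without file I/O.
bool ValueInMemory(rocksdb::DB* db, const std::string& key,
                   std::string* value) {
  rocksdb::ReadOptions ro;
  ro.read_tier = rocksdb::kBlockCacheTier;  // memtables + block cache only
  rocksdb::Status s = db->Get(ro, key, value);
  // Status::Incomplete() means satisfying the read would have required
  // filesystem I/O, which kBlockCacheTier disallows.
  return s.ok();
}
```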
@@ -247,6 +247,7 @@ bool StressTest::BuildOptionsTable() {
          }},
      {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}},
      {"max_successive_merges", {"0", "2", "4"}},
+     {"strict_max_successive_merges", {"false", "true"}},
      {"inplace_update_num_locks", {"100", "200", "300"}},
      // TODO: re-enable once internal task T124324915 is fixed.
      // {"experimental_mempurge_threshold", {"0.0", "1.0"}},
@@ -649,18 +649,29 @@ struct AdvancedColumnFamilyOptions {
   TablePropertiesCollectorFactories table_properties_collector_factories;

   // Maximum number of successive merge operations on a key in the memtable.
+  // It may be violated when filesystem reads would be needed to stay under the
+  // limit, unless `strict_max_successive_merges` is explicitly set.
   //
   // When a merge operation is added to the memtable and the maximum number of
-  // successive merges is reached, the value of the key will be calculated and
-  // inserted into the memtable instead of the merge operation. This will
-  // ensure that there are never more than max_successive_merges merge
-  // operations in the memtable.
+  // successive merges is reached, RocksDB will attempt to read the value. Upon
+  // success, the value will be inserted into the memtable instead of the merge
+  // operation.
   //
   // Default: 0 (disabled)
   //
   // Dynamically changeable through SetOptions() API
   size_t max_successive_merges = 0;

+  // Whether to allow filesystem reads to stay under the `max_successive_merges`
+  // limit. When true, this can lead to merge writes blocking the write path
+  // waiting on filesystem reads.
+  //
+  // This option is temporary in case the recent change to disallow filesystem
+  // reads during merge writes has a problem and users need to undo it quickly.
+  //
+  // Default: false
+  bool strict_max_successive_merges = false;
+
   // This flag specifies that the implementation should optimize the filters
   // mainly for cases where keys are found rather than also optimize for keys
   // missed. This would be used in cases where the application knows that
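As the doc comment notes, both options are dynamically changeable. A minimal sketch of adjusting them at runtime (assumes `db` is an open `rocksdb::DB*`; `TightenMergeLimit` is a hypothetical name):

```cpp
#include <cassert>

#include "rocksdb/db.h"

// Both options carry the mutable flag, so SetOptions() can change them
// without reopening the DB.
void TightenMergeLimit(rocksdb::DB* db) {
  rocksdb::Status s =
      db->SetOptions(db->DefaultColumnFamily(),
                     {{"max_successive_merges", "100"},
                      {"strict_max_successive_merges", "false"}});
  assert(s.ok());
}
```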
@@ -339,6 +339,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct MutableCFOptions, max_successive_merges),
          OptionType::kSizeT, OptionVerificationType::kNormal,
          OptionTypeFlags::kMutable}},
+       {"strict_max_successive_merges",
+        {offsetof(struct MutableCFOptions, strict_max_successive_merges),
+         OptionType::kBoolean, OptionVerificationType::kNormal,
+         OptionTypeFlags::kMutable}},
        {"memtable_huge_page_size",
         {offsetof(struct MutableCFOptions, memtable_huge_page_size),
          OptionType::kSizeT, OptionVerificationType::kNormal,
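Registering the option in this `OptionTypeInfo` map is what makes it reachable from options strings and option files. A short sketch of parsing it through the public string API (names per `rocksdb/convenience.h`):

```cpp
#include <cassert>
#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::ColumnFamilyOptions base, parsed;
  rocksdb::ConfigOptions config;
  // The option name in the string matches the map key registered above.
  rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
      config, base,
      "max_successive_merges=100;strict_max_successive_merges=true", &parsed);
  assert(s.ok());
  assert(parsed.strict_max_successive_merges == true);
  return 0;
}
```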
@@ -1053,6 +1057,8 @@ void MutableCFOptions::Dump(Logger* log) const {
   ROCKS_LOG_INFO(log,
                  " max_successive_merges: %" ROCKSDB_PRIszt,
                  max_successive_merges);
+  ROCKS_LOG_INFO(log, " strict_max_successive_merges: %d",
+                 strict_max_successive_merges);
   ROCKS_LOG_INFO(log,
                  " inplace_update_num_locks: %" ROCKSDB_PRIszt,
                  inplace_update_num_locks);
@@ -118,6 +118,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(options.memtable_whole_key_filtering),
         memtable_huge_page_size(options.memtable_huge_page_size),
         max_successive_merges(options.max_successive_merges),
+        strict_max_successive_merges(options.strict_max_successive_merges),
         inplace_update_num_locks(options.inplace_update_num_locks),
         prefix_extractor(options.prefix_extractor),
         experimental_mempurge_threshold(
@@ -186,6 +187,7 @@ struct MutableCFOptions {
         memtable_whole_key_filtering(false),
         memtable_huge_page_size(0),
         max_successive_merges(0),
+        strict_max_successive_merges(false),
         inplace_update_num_locks(0),
         prefix_extractor(nullptr),
         experimental_mempurge_threshold(0.0),
@@ -251,6 +253,7 @@ struct MutableCFOptions {
   bool memtable_whole_key_filtering;
   size_t memtable_huge_page_size;
   size_t max_successive_merges;
+  bool strict_max_successive_merges;
   size_t inplace_update_num_locks;
   std::shared_ptr<const SliceTransform> prefix_extractor;
   // [experimental]
@@ -85,6 +85,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       table_properties_collector_factories(
           options.table_properties_collector_factories),
       max_successive_merges(options.max_successive_merges),
+      strict_max_successive_merges(options.strict_max_successive_merges),
       optimize_filters_for_hits(options.optimize_filters_for_hits),
       paranoid_file_checks(options.paranoid_file_checks),
       force_consistency_checks(options.force_consistency_checks),
@@ -395,6 +396,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
       log,
       " Options.max_successive_merges: %" ROCKSDB_PRIszt,
       max_successive_merges);
+  ROCKS_LOG_HEADER(log,
+                   " Options.strict_max_successive_merges: %d",
+                   strict_max_successive_merges);
   ROCKS_LOG_HEADER(log,
                    " Options.optimize_filters_for_hits: %d",
                    optimize_filters_for_hits);
@@ -204,6 +204,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering;
   cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size;
   cf_opts->max_successive_merges = moptions.max_successive_merges;
+  cf_opts->strict_max_successive_merges = moptions.strict_max_successive_merges;
   cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks;
   cf_opts->prefix_extractor = moptions.prefix_extractor;
   cf_opts->experimental_mempurge_threshold =
@@ -492,6 +492,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "target_file_size_base=4294976376;"
       "memtable_huge_page_size=2557;"
       "max_successive_merges=5497;"
+      "strict_max_successive_merges=true;"
       "max_sequential_skip_in_iterations=4294971408;"
       "arena_block_size=1893;"
       "target_file_size_multiplier=35;"
@@ -115,6 +115,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -270,6 +271,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
@@ -2333,6 +2335,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"memtable_huge_page_size", "28"},
       {"bloom_locality", "29"},
       {"max_successive_merges", "30"},
+      {"strict_max_successive_merges", "true"},
       {"min_partial_merge_operands", "31"},
       {"prefix_extractor", "fixed:31"},
       {"experimental_mempurge_threshold", "0.003"},
@@ -2484,6 +2487,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
   ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
   ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true);
   ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
@@ -370,6 +370,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
   cf_opt->enable_blob_files = rnd->Uniform(2);
   cf_opt->enable_blob_garbage_collection = rnd->Uniform(2);
+  cf_opt->strict_max_successive_merges = rnd->Uniform(2);

   // double options
   cf_opt->memtable_prefix_bloom_size_ratio =
@@ -1638,6 +1638,10 @@ DEFINE_int32(max_successive_merges, 0,
              "Maximum number of successive merge operations on a key in the "
              "memtable");

+DEFINE_bool(strict_max_successive_merges, false,
+            "Whether to issue filesystem reads to keep within "
+            "`max_successive_merges` limit");
+
 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
   if (value < 0 || value >= 2000000000) {
     fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
@@ -4626,6 +4630,7 @@ class Benchmark {
       }
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
+    options.strict_max_successive_merges = FLAGS_strict_max_successive_merges;
     options.report_bg_io_stats = FLAGS_report_bg_io_stats;

     // set universal style compaction configurations, if applicable
@@ -0,0 +1 @@
+* Merge writes will only keep merge operand count within `ColumnFamilyOptions::max_successive_merges` when the key's merge operands are all found in memory, unless `strict_max_successive_merges` is explicitly set.