mirror of https://github.com/facebook/rocksdb.git
Add a utility function to guess optimized options based on constraints
Summary: Add a function GetOptions() which, based on four parameters given by the user (read amplification threshold, write amplification threshold, memory budget for mem tables and target DB size), picks a compaction style and the parameters for it. Background threads are not touched yet. One limit of this algorithm: since compression rate and key/value size are hard to predict, it's hard to predict level 0 file size from write buffer size. Simply assume a 1:1 ratio here. Sample results: https://reviews.facebook.net/P477 Test Plan: Will add a unit test where some sample scenarios are given and check that the picked results make sense Reviewers: yhchiang, dhruba, haobo, igor, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D18741
This commit is contained in:
parent
250f035782
commit
e6de02103a
11
Makefile
11
Makefile
|
@ -114,9 +114,10 @@ TESTS = \
|
|||
deletefile_test \
|
||||
table_test \
|
||||
thread_local_test \
|
||||
geodb_test \
|
||||
rate_limiter_test \
|
||||
cuckoo_table_builder_test
|
||||
geodb_test \
|
||||
rate_limiter_test \
|
||||
cuckoo_table_builder_test \
|
||||
options_test
|
||||
|
||||
TOOLS = \
|
||||
sst_dump \
|
||||
|
@ -124,6 +125,7 @@ TOOLS = \
|
|||
db_stress \
|
||||
ldb \
|
||||
db_repl_stress \
|
||||
options_test \
|
||||
blob_store_bench
|
||||
|
||||
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
|
||||
|
@ -414,6 +416,9 @@ geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
|||
cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(MEMENVOBJECTS)
|
||||
|
|
|
@ -1027,7 +1027,6 @@ struct FlushOptions {
|
|||
FlushOptions() : wait(true) {}
|
||||
};
|
||||
|
||||
|
||||
// Create a RateLimiter object, which can be shared among RocksDB instances to
|
||||
// control write rate of flush and compaction.
|
||||
// @rate_bytes_per_sec: this is the only parameter you want to set most of the
|
||||
|
@ -1051,7 +1050,16 @@ extern RateLimiter* NewRateLimiter(
|
|||
int64_t refill_period_us = 100 * 1000,
|
||||
int32_t fairness = 10);
|
||||
|
||||
|
||||
// Get options based on some guidelines. Now only tune parameter based on
|
||||
// flush/compaction and fill default parameters for other parameters.
|
||||
// total_write_buffer_limit: budget for memory spent for mem tables
|
||||
// read_amplification_threshold: comfortable value of read amplification
|
||||
// write_amplification_threshold: comfortable value of write amplification.
|
||||
// target_db_size: estimated total DB size.
|
||||
extern Options GetOptions(size_t total_write_buffer_limit,
|
||||
int read_amplification_threshold = 8,
|
||||
int write_amplification_threshold = 32,
|
||||
uint64_t target_db_size = 68719476736 /* 64GB */);
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
|
||||
|
|
|
@ -0,0 +1,196 @@
|
|||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
// For now, always use 10 as the level bytes multiplier.
|
||||
const int kBytesForLevelMultiplier = 10;
|
||||
const size_t kBytesForOneMb = 1024 * 1024;
|
||||
|
||||
// Pick compaction style
|
||||
CompactionStyle PickCompactionStyle(size_t write_buffer_size,
|
||||
int read_amp_threshold,
|
||||
int write_amp_threshold,
|
||||
uint64_t target_db_size) {
|
||||
// Estimate read amplification and write amplification of two compaction
|
||||
// styles. If there is hard limit to force a choice, make the choice.
|
||||
// Otherwise, calculate a score based on threshold and expected value of
|
||||
// two styles, weighing reads 4X important than writes.
|
||||
int expected_levels = static_cast<int>(ceil(
|
||||
log(target_db_size / write_buffer_size) / log(kBytesForLevelMultiplier)));
|
||||
|
||||
int expected_max_files_universal =
|
||||
static_cast<int>(ceil(log2(target_db_size / write_buffer_size)));
|
||||
|
||||
const int kEstimatedLevel0FilesInLevelStyle = 2;
|
||||
// Estimate write amplification:
|
||||
// (1) 1 for every L0 file
|
||||
// (2) 2 for L1
|
||||
// (3) kBytesForLevelMultiplier for the last level. It's really hard to
|
||||
// predict.
|
||||
// (3) kBytesForLevelMultiplier for other levels.
|
||||
int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2
|
||||
+ (expected_levels - 2) * kBytesForLevelMultiplier
|
||||
+ kBytesForLevelMultiplier;
|
||||
int expected_read_amp_level =
|
||||
kEstimatedLevel0FilesInLevelStyle + expected_levels;
|
||||
|
||||
int max_read_amp_uni = expected_max_files_universal;
|
||||
if (read_amp_threshold <= max_read_amp_uni) {
|
||||
return kCompactionStyleLevel;
|
||||
} else if (write_amp_threshold <= expected_write_amp_level) {
|
||||
return kCompactionStyleUniversal;
|
||||
}
|
||||
|
||||
const double kReadWriteWeight = 4;
|
||||
|
||||
double level_ratio =
|
||||
static_cast<double>(read_amp_threshold) / expected_read_amp_level *
|
||||
kReadWriteWeight +
|
||||
static_cast<double>(write_amp_threshold) / expected_write_amp_level;
|
||||
|
||||
int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
|
||||
int expected_read_amp_uni = expected_max_files_universal / 2 + 1;
|
||||
|
||||
double uni_ratio =
|
||||
static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
|
||||
kReadWriteWeight +
|
||||
static_cast<double>(write_amp_threshold) / expected_write_amp_uni;
|
||||
|
||||
if (level_ratio > uni_ratio) {
|
||||
return kCompactionStyleLevel;
|
||||
} else {
|
||||
return kCompactionStyleUniversal;
|
||||
}
|
||||
}
|
||||
|
||||
// Pick mem table size
|
||||
void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
|
||||
const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
|
||||
const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;
|
||||
|
||||
// Try to pick up a buffer size between 4MB and 128MB.
|
||||
// And try to pick 4 as the total number of write buffers.
|
||||
size_t write_buffer_size = total_write_buffer_limit / 4;
|
||||
if (write_buffer_size > kMaxWriteBufferSize) {
|
||||
write_buffer_size = kMaxWriteBufferSize;
|
||||
} else if (write_buffer_size < kMinWriteBufferSize) {
|
||||
write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize),
|
||||
total_write_buffer_limit / 2);
|
||||
}
|
||||
|
||||
// Truncate to multiple of 1MB.
|
||||
if (write_buffer_size % kBytesForOneMb != 0) {
|
||||
write_buffer_size =
|
||||
(write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
|
||||
}
|
||||
|
||||
options->write_buffer_size = write_buffer_size;
|
||||
options->max_write_buffer_number =
|
||||
total_write_buffer_limit / write_buffer_size;
|
||||
options->min_write_buffer_number_to_merge = 1;
|
||||
}
|
||||
|
||||
// Fills in the level 0 triggers and the open file limit used when universal
// style compaction has been picked. All values are fixed; none depends on
// the caller's guidelines.
void OptimizeForUniversal(Options* options) {
  const int kCompactionTrigger = 2;
  const int kSlowdownWritesTrigger = 30;
  const int kStopWritesTrigger = 40;
  const int kMaxOpenFiles = -1;

  options->max_open_files = kMaxOpenFiles;
  options->level0_stop_writes_trigger = kStopWritesTrigger;
  options->level0_slowdown_writes_trigger = kSlowdownWritesTrigger;
  options->level0_file_num_compaction_trigger = kCompactionTrigger;
}
|
||||
|
||||
// Optimize parameters for level-based compaction
|
||||
void OptimizeForLevel(int read_amplification_threshold,
|
||||
int write_amplification_threshold,
|
||||
uint64_t target_db_size, Options* options) {
|
||||
int expected_levels_one_level0_file =
|
||||
static_cast<int>(ceil(log(target_db_size / options->write_buffer_size) /
|
||||
log(kBytesForLevelMultiplier)));
|
||||
|
||||
int level0_stop_writes_trigger =
|
||||
read_amplification_threshold - expected_levels_one_level0_file;
|
||||
|
||||
const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
|
||||
const int kMaxFileNumCompactionTrigger = 4;
|
||||
const int kMinLevel0StopTrigger = 3;
|
||||
|
||||
int file_num_buffer =
|
||||
kInitialLevel0TotalSize / options->write_buffer_size + 1;
|
||||
|
||||
if (level0_stop_writes_trigger > file_num_buffer) {
|
||||
// Have sufficient room for multiple level 0 files
|
||||
// Try enlarge the buffer up to 1GB
|
||||
|
||||
// Try to enlarge the buffer up to 1GB, if still have sufficient headroom.
|
||||
file_num_buffer *=
|
||||
std::pow(2, std::max(0, std::min(3, level0_stop_writes_trigger -
|
||||
file_num_buffer - 2)));
|
||||
|
||||
options->level0_stop_writes_trigger = level0_stop_writes_trigger;
|
||||
options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2;
|
||||
options->level0_file_num_compaction_trigger =
|
||||
std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2);
|
||||
} else {
|
||||
options->level0_stop_writes_trigger =
|
||||
std::max(kMinLevel0StopTrigger, file_num_buffer);
|
||||
options->level0_slowdown_writes_trigger =
|
||||
options->level0_stop_writes_trigger - 1;
|
||||
options->level0_file_num_compaction_trigger = 1;
|
||||
}
|
||||
|
||||
// This doesn't consider compaction and overheads of mem tables. But usually
|
||||
// it is in the same order of magnitude.
|
||||
int expected_level0_compaction_size =
|
||||
options->level0_file_num_compaction_trigger * options->write_buffer_size;
|
||||
// Enlarge level1 target file size if level0 compaction size is larger.
|
||||
int max_bytes_for_level_base = 10 * kBytesForOneMb;
|
||||
if (expected_level0_compaction_size > max_bytes_for_level_base) {
|
||||
max_bytes_for_level_base = expected_level0_compaction_size;
|
||||
}
|
||||
options->max_bytes_for_level_base = max_bytes_for_level_base;
|
||||
// Now always set level multiplier to be 10
|
||||
options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;
|
||||
|
||||
const int kMinFileSize = 2 * kBytesForOneMb;
|
||||
// Allow at least 3-way parallelism for compaction between level 1 and 2.
|
||||
int max_file_size = max_bytes_for_level_base / 3;
|
||||
if (max_file_size < kMinFileSize) {
|
||||
options->target_file_size_base = kMinFileSize;
|
||||
} else {
|
||||
if (max_file_size % kBytesForOneMb != 0) {
|
||||
max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb;
|
||||
}
|
||||
options->target_file_size_base = max_file_size;
|
||||
}
|
||||
|
||||
// TODO: consider to tune num_levels too.
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Builds an Options object from the given guidelines: first the mem table
// size and count are picked from the memory budget, then a compaction style
// is chosen, and finally the parameters specific to that style are filled in.
Options GetOptions(size_t total_write_buffer_limit,
                   int read_amplification_threshold,
                   int write_amplification_threshold, uint64_t target_db_size) {
  Options tuned;
  PickWriteBufferSize(total_write_buffer_limit, &tuned);

  tuned.compaction_style = PickCompactionStyle(
      tuned.write_buffer_size, read_amplification_threshold,
      write_amplification_threshold, target_db_size);

  if (tuned.compaction_style != kCompactionStyleUniversal) {
    OptimizeForLevel(read_amplification_threshold,
                     write_amplification_threshold, target_db_size, &tuned);
  } else {
    OptimizeForUniversal(&tuned);
  }
  return tuned;
}
|
||||
|
||||
} // namespace rocksdb
|
|
@ -0,0 +1,80 @@
|
|||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <gflags/gflags.h>
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
using GFLAGS::ParseCommandLineFlags;
|
||||
DEFINE_bool(enable_print, false, "Print options generated to console.");
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class OptionsTest {};
|
||||
|
||||
class StderrLogger : public Logger {
|
||||
public:
|
||||
virtual void Logv(const char* format, va_list ap) override {
|
||||
vprintf(format, ap);
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
// Calls GetOptions() with the given guidelines and returns its result. When
// the enable_print flag is set, the inputs are printed before the call and
// the resulting options are dumped to the console after it.
Options PrintAndGetOptions(size_t total_write_buffer_limit,
                           int read_amplification_threshold,
                           int write_amplification_threshold,
                           uint64_t target_db_size = 68719476736) {
  const bool verbose = FLAGS_enable_print;

  if (verbose) {
    printf(
        "---- total_write_buffer_limit: %zu "
        "read_amplification_threshold: %d write_amplification_threshold: %d "
        "target_db_size %" PRIu64 " ----\n",
        total_write_buffer_limit, read_amplification_threshold,
        write_amplification_threshold, target_db_size);
  }

  Options tuned =
      GetOptions(total_write_buffer_limit, read_amplification_threshold,
                 write_amplification_threshold, target_db_size);
  if (!verbose) {
    return tuned;
  }

  StderrLogger console;
  tuned.Dump(&console);
  printf("-------------------------------------\n\n\n");
  return tuned;
}
|
||||
|
||||
// Runs GetOptions() through a few sample scenarios. The compaction style is
// only asserted for the two scenarios where exactly one amplification
// threshold is tight; the other scenarios just have to run.
TEST(OptionsTest, LooseCondition) {
  const size_t kMb = 1024 * 1024;

  // Loose thresholds, 10GB mem table memory budget
  PrintAndGetOptions(static_cast<size_t>(10) * 1024 * kMb, 100, 100);

  // Less mem table memory budget
  PrintAndGetOptions(32 * kMb, 100, 100);

  // Tight read amplification
  Options tight_read = PrintAndGetOptions(128 * kMb, 8, 100);
  ASSERT_EQ(tight_read.compaction_style, kCompactionStyleLevel);

  // Tight write amplification
  Options tight_write = PrintAndGetOptions(128 * kMb, 64, 10);
  ASSERT_EQ(tight_write.compaction_style, kCompactionStyleUniversal);

  // Both tight amplifications
  PrintAndGetOptions(128 * kMb, 4, 8);
}
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
ParseCommandLineFlags(&argc, &argv, true);
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
Loading…
Reference in New Issue