Hint temperature of bottommost level files to FileSystem (#8222)

Summary:
As the first part of the effort of having placing different files on different storage types, this change introduces several things:
(1) An experimental interface in FileSystem that specify temperature to a new file created.
(2) A test FileSystemWrapper,  SimulatedHybridFileSystem, that simulates HDD for a file of "warm" temperature.
(3) A simple experimental feature ColumnFamilyOptions.bottommost_temperature. RocksDB would pass this value to FileSystem when creating any bottommost file.
(4) A db_bench parameter that applies the (2) and (3) to db_bench.

The motivation of the change is to introduce minimal changes that allow us to evolve tiered storage development.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8222

Test Plan:
./db_bench --benchmarks=fillrandom --write_buffer_size=2000000 -max_bytes_for_level_base=20000000  -level_compaction_dynamic_level_bytes --reads=100 -compaction_readahead_size=20000000 --reads=100000 -num=10000000

followed by

./db_bench --benchmarks=readrandom,stats --write_buffer_size=2000000 -max_bytes_for_level_base=20000000 -simulate_hybrid_fs_file=/tmp/warm_file_list -level_compaction_dynamic_level_bytes -compaction_readahead_size=20000000 --reads=500 --threads=16 -use_existing_db --num=10000000

and see results as expected.

Reviewed By: ajkr

Differential Revision: D28003028

fbshipit-source-id: 4724896d5205730227ba2f17c3fecb11261744ce
This commit is contained in:
sdong 2021-05-03 13:30:00 -07:00 committed by Facebook GitHub Bot
parent d2ca04e3ed
commit c3ff14e2c1
10 changed files with 287 additions and 2 deletions

View file

@ -780,6 +780,7 @@ cpp_library(
"test_util/testutil.cc", "test_util/testutil.cc",
"tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
"tools/db_bench_tool.cc", "tools/db_bench_tool.cc",
"tools/simulated_hybrid_file_system.cc",
"tools/trace_analyzer_tool.cc", "tools/trace_analyzer_tool.cc",
], ],
auto_headers = AutoHeaders.RECURSIVE_GLOB, auto_headers = AutoHeaders.RECURSIVE_GLOB,

View file

@ -1685,9 +1685,16 @@ Status CompactionJob::OpenCompactionOutputFile(
TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
&syncpoint_arg); &syncpoint_arg);
#endif #endif
// Pass temperature of botommost files to FileSystem.
FileOptions fo_copy = file_options_;
if (bottommost_level_) {
fo_copy.temperature =
sub_compact->compaction->mutable_cf_options()->bottommost_temperature;
}
Status s; Status s;
IOStatus io_s = IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
NewWritableFile(fs_.get(), fname, &writable_file, file_options_);
s = io_s; s = io_s;
if (sub_compact->io_status.ok()) { if (sub_compact->io_status.ok()) {
sub_compact->io_status = io_s; sub_compact->io_status = io_s;

View file

@ -186,6 +186,16 @@ struct CompressionOptions {
max_dict_buffer_bytes(_max_dict_buffer_bytes) {} max_dict_buffer_bytes(_max_dict_buffer_bytes) {}
}; };
// Temperature of a file. Used to pass to FileSystem for a different
// placement and/or coding.
enum class Temperature : uint8_t {
kHot,
kWarm,
kCold,
kTotal,
kUnknown = kTotal,
};
enum UpdateStatus { // Return status For inplace update callback enum UpdateStatus { // Return status For inplace update callback
UPDATE_FAILED = 0, // Nothing to update UPDATE_FAILED = 0, // Nothing to update
UPDATED_INPLACE = 1, // Value updated inplace UPDATED_INPLACE = 1, // Value updated inplace
@ -758,6 +768,13 @@ struct AdvancedColumnFamilyOptions {
// data is left uncompressed (unless compression is also requested). // data is left uncompressed (unless compression is also requested).
uint64_t sample_for_compression = 0; uint64_t sample_for_compression = 0;
// EXPERIMENTAL
// The feature is still in development and is incomplete.
// If this option is set, when creating bottommost files, pass this
// temperature to FileSystem used. Should be no-op for default FileSystem
// and users need to plug in their own FileSystem to take advantage of it.
Temperature bottommost_temperature = Temperature::kUnknown;
// When set, large values (blobs) are written to separate blob files, and // When set, large values (blobs) are written to separate blob files, and
// only pointers to them are stored in SST files. This can reduce write // only pointers to them are stored in SST files. This can reduce write
// amplification for large-value use cases at the cost of introducing a level // amplification for large-value use cases at the cost of introducing a level

View file

@ -100,6 +100,13 @@ struct FileOptions : EnvOptions {
// to be issued for the file open/creation // to be issued for the file open/creation
IOOptions io_options; IOOptions io_options;
// EXPERIMENTAL
// The feature is in development and is subject to change.
// When creating a new file, set the temperature of the file so that
// underlying file systems can put it with appropriate storage media and/or
// coding.
Temperature temperature = Temperature::kUnknown;
// The checksum type that is used to calculate the checksum value for // The checksum type that is used to calculate the checksum value for
// handoff during file writes. // handoff during file writes.
ChecksumType handoff_checksum_type; ChecksumType handoff_checksum_type;
@ -115,6 +122,7 @@ struct FileOptions : EnvOptions {
FileOptions(const FileOptions& opts) FileOptions(const FileOptions& opts)
: EnvOptions(opts), : EnvOptions(opts),
io_options(opts.io_options), io_options(opts.io_options),
temperature(opts.temperature),
handoff_checksum_type(opts.handoff_checksum_type) {} handoff_checksum_type(opts.handoff_checksum_type) {}
FileOptions& operator=(const FileOptions& opts) = default; FileOptions& operator=(const FileOptions& opts) = default;

View file

@ -186,6 +186,7 @@ struct MutableCFOptions {
bottommost_compression(options.bottommost_compression), bottommost_compression(options.bottommost_compression),
compression_opts(options.compression_opts), compression_opts(options.compression_opts),
bottommost_compression_opts(options.bottommost_compression_opts), bottommost_compression_opts(options.bottommost_compression_opts),
bottommost_temperature(options.bottommost_temperature),
sample_for_compression( sample_for_compression(
options.sample_for_compression) { // TODO: is 0 fine here? options.sample_for_compression) { // TODO: is 0 fine here?
RefreshDerivedOptions(options.num_levels, options.compaction_style); RefreshDerivedOptions(options.num_levels, options.compaction_style);
@ -227,6 +228,7 @@ struct MutableCFOptions {
report_bg_io_stats(false), report_bg_io_stats(false),
compression(Snappy_Supported() ? kSnappyCompression : kNoCompression), compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
bottommost_compression(kDisableCompressionOption), bottommost_compression(kDisableCompressionOption),
bottommost_temperature(Temperature::kUnknown),
sample_for_compression(0) {} sample_for_compression(0) {}
explicit MutableCFOptions(const Options& options); explicit MutableCFOptions(const Options& options);
@ -294,6 +296,9 @@ struct MutableCFOptions {
CompressionType bottommost_compression; CompressionType bottommost_compression;
CompressionOptions compression_opts; CompressionOptions compression_opts;
CompressionOptions bottommost_compression_opts; CompressionOptions bottommost_compression_opts;
// TODO this experimental option isn't made configurable
// through strings yet.
Temperature bottommost_temperature;
uint64_t sample_for_compression; uint64_t sample_for_compression;

View file

@ -441,6 +441,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
options->max_mem_compaction_level = 0; options->max_mem_compaction_level = 0;
options->compaction_filter = nullptr; options->compaction_filter = nullptr;
options->sst_partitioner_factory = nullptr; options->sst_partitioner_factory = nullptr;
options->bottommost_temperature = Temperature::kUnknown;
char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)];
ColumnFamilyOptions* new_options = ColumnFamilyOptions* new_options =

1
src.mk
View file

@ -318,6 +318,7 @@ MOCK_LIB_SOURCES = \
BENCH_LIB_SOURCES = \ BENCH_LIB_SOURCES = \
tools/db_bench_tool.cc \ tools/db_bench_tool.cc \
tools/simulated_hybrid_file_system.cc \
STRESS_LIB_SOURCES = \ STRESS_LIB_SOURCES = \
db_stress_tool/batched_ops_stress.cc \ db_stress_tool/batched_ops_stress.cc \

View file

@ -65,6 +65,7 @@
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "test_util/testutil.h" #include "test_util/testutil.h"
#include "test_util/transaction_test_util.h" #include "test_util/transaction_test_util.h"
#include "tools/simulated_hybrid_file_system.h"
#include "util/cast_util.h" #include "util/cast_util.h"
#include "util/compression.h" #include "util/compression.h"
#include "util/crc32c.h" #include "util/crc32c.h"
@ -1032,6 +1033,10 @@ DEFINE_string(fs_uri, "",
DEFINE_string(hdfs, "", DEFINE_string(hdfs, "",
"Name of hdfs environment. Mutually exclusive with" "Name of hdfs environment. Mutually exclusive with"
" --env_uri and --fs_uri"); " --env_uri and --fs_uri");
DEFINE_string(simulate_hybrid_fs_file, "",
"File for Store Metadata for Simulate hybrid FS. Empty means "
"disable the feature. Now, if it is set, "
"bottommost_temperature is set to kWarm.");
static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard; static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
@ -4050,6 +4055,9 @@ class Benchmark {
options.level0_slowdown_writes_trigger = options.level0_slowdown_writes_trigger =
FLAGS_level0_slowdown_writes_trigger; FLAGS_level0_slowdown_writes_trigger;
options.compression = FLAGS_compression_type_e; options.compression = FLAGS_compression_type_e;
if (FLAGS_simulate_hybrid_fs_file != "") {
options.bottommost_temperature = Temperature::kWarm;
}
options.sample_for_compression = FLAGS_sample_for_compression; options.sample_for_compression = FLAGS_sample_for_compression;
options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
@ -7672,6 +7680,9 @@ int db_bench_tool(int argc, char** argv) {
exit(1); exit(1);
} }
FLAGS_env = GetCompositeEnv(fs); FLAGS_env = GetCompositeEnv(fs);
} else if (FLAGS_simulate_hybrid_fs_file != "") {
FLAGS_env = GetCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
FileSystem::Default(), FLAGS_simulate_hybrid_fs_file));
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) { if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {

View file

@ -0,0 +1,145 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "tools/simulated_hybrid_file_system.h"
#include <sstream>
#include <string>
#include "rocksdb/rate_limiter.h"
namespace ROCKSDB_NAMESPACE {
const int kLatencyAddedPerRequestUs = 15000;
const int64_t kRequestPerSec = 100;
const int64_t kDummyBytesPerRequest = 1024 * 1024;
// The metadata file format: each line is a full filename of a file which is
// warm
SimulatedHybridFileSystem::SimulatedHybridFileSystem(
const std::shared_ptr<FileSystem>& base,
const std::string& metadata_file_name)
: FileSystemWrapper(base),
// Limit to 100 requests per second.
rate_limiter_(NewGenericRateLimiter(
kDummyBytesPerRequest * kRequestPerSec /* rate_bytes_per_sec */,
1000 /* refill_period_us */)),
metadata_file_name_(metadata_file_name),
name_("SimulatedHybridFileSystem: " + std::string(target()->Name())) {
IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr);
if (s.IsNotFound()) {
return;
}
std::string metadata;
s = ReadFileToString(base.get(), metadata_file_name, &metadata);
if (!s.ok()) {
fprintf(stderr, "Error reading from file %s: %s",
metadata_file_name.c_str(), s.ToString().c_str());
// Exit rather than assert as this file system is built to run with
// benchmarks, which usually run on release mode.
std::exit(1);
}
std::istringstream input;
input.str(metadata);
std::string line;
while (std::getline(input, line)) {
fprintf(stderr, "Warm file %s\n", line.c_str());
warm_file_set_.insert(line);
}
}
// Need to write out the metadata file to file. See comment of
// SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the
// file.
SimulatedHybridFileSystem::~SimulatedHybridFileSystem() {
std::string metadata;
for (const auto& f : warm_file_set_) {
metadata += f;
metadata += "\n";
}
IOStatus s = WriteStringToFile(target(), metadata, metadata_file_name_, true);
if (!s.ok()) {
fprintf(stderr, "Error writing to file %s: %s", metadata_file_name_.c_str(),
s.ToString().c_str());
}
}
IOStatus SimulatedHybridFileSystem::NewRandomAccessFile(
const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
Temperature temperature = Temperature::kUnknown;
{
const std::lock_guard<std::mutex> lock(mutex_);
if (warm_file_set_.find(fname) != warm_file_set_.end()) {
temperature = Temperature::kWarm;
}
}
IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
result->reset(
new SimulatedHybridRaf(result->release(), rate_limiter_, temperature));
return s;
}
IOStatus SimulatedHybridFileSystem::NewWritableFile(
const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
if (file_opts.temperature == Temperature::kWarm) {
const std::lock_guard<std::mutex> lock(mutex_);
fprintf(stderr, "warm file %s\n", fname.c_str());
warm_file_set_.insert(fname);
}
return target()->NewWritableFile(fname, file_opts, result, dbg);
}
IOStatus SimulatedHybridFileSystem::DeleteFile(const std::string& fname,
const IOOptions& options,
IODebugContext* dbg) {
{
const std::lock_guard<std::mutex> lock(mutex_);
warm_file_set_.erase(fname);
}
return target()->DeleteFile(fname, options, dbg);
}
IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n,
const IOOptions& options, Slice* result,
char* scratch, IODebugContext* dbg) const {
if (temperature_ == Temperature::kWarm) {
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs);
rate_limiter_->Request(kDummyBytesPerRequest, Env::IOPriority::IO_LOW,
nullptr);
}
return target()->Read(offset, n, options, result, scratch, dbg);
}
IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs,
const IOOptions& options,
IODebugContext* dbg) {
if (temperature_ == Temperature::kWarm) {
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs *
static_cast<int>(num_reqs));
rate_limiter_->Request(
static_cast<int64_t>(num_reqs) * kDummyBytesPerRequest,
Env::IOPriority::IO_LOW, nullptr);
}
return target()->MultiRead(reqs, num_reqs, options, dbg);
}
IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n,
const IOOptions& options,
IODebugContext* dbg) {
if (temperature_ == Temperature::kWarm) {
rate_limiter_->Request(kDummyBytesPerRequest, Env::IOPriority::IO_LOW,
nullptr);
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs);
}
return target()->Prefetch(offset, n, options, dbg);
}
} // namespace ROCKSDB_NAMESPACE
#endif // ROCKSDB_LITE

View file

@ -0,0 +1,89 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#ifndef ROCKSDB_LITE
#include <utility>
#include "rocksdb/file_system.h"
namespace ROCKSDB_NAMESPACE {
// A FileSystem simulates hybrid file system by ingesting latency and limit
// IOPs.
// This class is only used for development purpose and should not be used
// in production.
// Right now we ingest 15ms latency and allow 100 requests per second when
// the file is for warm temperature.
// When the object is destroyed, the list of warm files are written to a
// file, which can be used to reopen a FileSystem and still recover the
// list. This is to allow the information to preserve between db_bench
// runs.
class SimulatedHybridFileSystem : public FileSystemWrapper {
public:
// metadata_file_name stores metadata of the files, so that it can be
// loaded after process restarts. If the file doesn't exist, create
// one. The file is written when the class is destroyed.
explicit SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
const std::string& metadata_file_name);
~SimulatedHybridFileSystem() override;
public:
IOStatus NewRandomAccessFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSRandomAccessFile>* result,
IODebugContext* dbg) override;
IOStatus NewWritableFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg) override;
IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
IODebugContext* dbg) override;
const char* Name() const override { return name_.c_str(); }
private:
// Limit 100 requests per second. Rate limiter is designed to byte but
// we use it as fixed bytes is one request.
std::shared_ptr<RateLimiter> rate_limiter_;
std::mutex mutex_;
std::unordered_set<std::string> warm_file_set_;
std::string metadata_file_name_;
std::string name_;
};
// Simulated random access file that can control IOPs and latency to simulate
// specific storage media
class SimulatedHybridRaf : public FSRandomAccessFileWrapper {
public:
SimulatedHybridRaf(FSRandomAccessFile* t,
std::shared_ptr<RateLimiter> rate_limiter,
Temperature temperature)
: FSRandomAccessFileWrapper(t),
rate_limiter_(rate_limiter),
temperature_(temperature) {}
~SimulatedHybridRaf() override {}
IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
Slice* result, char* scratch,
IODebugContext* dbg) const override;
IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
const IOOptions& options, IODebugContext* dbg) override;
IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
IODebugContext* dbg) override;
private:
std::shared_ptr<RateLimiter> rate_limiter_;
Temperature temperature_;
};
} // namespace ROCKSDB_NAMESPACE
#endif // ROCKSDB_LITE