Summary:
Add a simple policy for NVMe write time life hint
Closes https://github.com/facebook/rocksdb/pull/3095

Differential Revision: D6298030

Pulled By: shligit

fbshipit-source-id: 9a72a42e32e92193af11599eb71f0cf77448e24d
This commit is contained in:
Shaohua Li 2017-11-10 09:25:26 -08:00 committed by Facebook Github Bot
parent f1c5eaba56
commit eefd75a228
14 changed files with 79 additions and 7 deletions

View file

@ -75,7 +75,7 @@ Status BuildTable(
InternalStats* internal_stats, TableFileCreationReason reason, InternalStats* internal_stats, TableFileCreationReason reason,
EventLogger* event_logger, int job_id, const Env::IOPriority io_priority, EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
TableProperties* table_properties, int level, const uint64_t creation_time, TableProperties* table_properties, int level, const uint64_t creation_time,
const uint64_t oldest_key_time) { const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint) {
assert((column_family_id == assert((column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
column_family_name.empty()); column_family_name.empty());
@ -117,6 +117,7 @@ Status BuildTable(
return s; return s;
} }
file->SetIOPriority(io_priority); file->SetIOPriority(io_priority);
file->SetWriteLifeTimeHint(write_hint);
file_writer.reset(new WritableFileWriter(std::move(file), env_options, file_writer.reset(new WritableFileWriter(std::move(file), env_options,
ioptions.statistics)); ioptions.statistics));

View file

@ -79,6 +79,7 @@ extern Status BuildTable(
EventLogger* event_logger = nullptr, int job_id = 0, EventLogger* event_logger = nullptr, int job_id = 0,
const Env::IOPriority io_priority = Env::IO_HIGH, const Env::IOPriority io_priority = Env::IO_HIGH,
TableProperties* table_properties = nullptr, int level = -1, TableProperties* table_properties = nullptr, int level = -1,
const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0); const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET);
} // namespace rocksdb } // namespace rocksdb

View file

@ -1015,6 +1015,24 @@ Status ColumnFamilyData::SetOptions(
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
return Env::WLTH_NOT_SET;
}
if (level == 0) {
return Env::WLTH_MEDIUM;
}
int base_level = current_->storage_info()->base_level();
// L1: medium, L2: long, ...
if (level - base_level >= 2) {
return Env::WLTH_EXTREME;
}
return static_cast<Env::WriteLifeTimeHint>(level - base_level +
static_cast<int>(Env::WLTH_MEDIUM));
}
ColumnFamilySet::ColumnFamilySet(const std::string& dbname, ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options, const ImmutableDBOptions* db_options,
const EnvOptions& env_options, const EnvOptions& env_options,

View file

@ -339,6 +339,8 @@ class ColumnFamilyData {
bool initialized() const { return initialized_.load(); } bool initialized() const { return initialized_.load(); }
Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
private: private:
friend class ColumnFamilySet; friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name, ColumnFamilyData(uint32_t id, const std::string& name,

View file

@ -297,7 +297,8 @@ CompactionJob::CompactionJob(
table_cache_(std::move(table_cache)), table_cache_(std::move(table_cache)),
event_logger_(event_logger), event_logger_(event_logger),
paranoid_file_checks_(paranoid_file_checks), paranoid_file_checks_(paranoid_file_checks),
measure_io_stats_(measure_io_stats) { measure_io_stats_(measure_io_stats),
write_hint_(Env::WLTH_NOT_SET) {
assert(log_buffer_ != nullptr); assert(log_buffer_ != nullptr);
const auto* cfd = compact_->compaction->column_family_data(); const auto* cfd = compact_->compaction->column_family_data();
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
@ -368,6 +369,8 @@ void CompactionJob::Prepare() {
assert(c->column_family_data()->current()->storage_info() assert(c->column_family_data()->current()->storage_info()
->NumLevelFiles(compact_->compaction->level()) > 0); ->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = c->column_family_data()->CalculateSSTWriteHint(
c->output_level());
// Is this compaction producing files at the bottommost level? // Is this compaction producing files at the bottommost level?
bottommost_level_ = c->bottommost_level(); bottommost_level_ = c->bottommost_level();
@ -1305,6 +1308,7 @@ Status CompactionJob::OpenCompactionOutputFile(
sub_compact->outputs.push_back(out); sub_compact->outputs.push_back(out);
writable_file->SetIOPriority(Env::IO_LOW); writable_file->SetIOPriority(Env::IO_LOW);
writable_file->SetWriteLifeTimeHint(write_hint_);
writable_file->SetPreallocationBlockSize(static_cast<size_t>( writable_file->SetPreallocationBlockSize(static_cast<size_t>(
sub_compact->compaction->OutputFilePreallocationSize())); sub_compact->compaction->OutputFilePreallocationSize()));
sub_compact->outfile.reset(new WritableFileWriter( sub_compact->outfile.reset(new WritableFileWriter(

View file

@ -167,6 +167,7 @@ class CompactionJob {
std::vector<Slice> boundaries_; std::vector<Slice> boundaries_;
// Stores the approx size of keys covered in the range of each subcompaction // Stores the approx size of keys covered in the range of each subcompaction
std::vector<uint64_t> sizes_; std::vector<uint64_t> sizes_;
Env::WriteLifeTimeHint write_hint_;
}; };
} // namespace rocksdb } // namespace rocksdb

View file

@ -1316,6 +1316,9 @@ class DBImpl : public DB {
bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
Env::WriteLifeTimeHint CalculateWALWriteHint() {
return Env::WLTH_SHORT;
}
// When set, we use a seprate queue for writes that dont write to memtable. In // When set, we use a seprate queue for writes that dont write to memtable. In
// 2PC these are the writes at Prepare phase. // 2PC these are the writes at Prepare phase.

View file

@ -893,6 +893,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
const uint64_t current_time = static_cast<uint64_t>(_current_time); const uint64_t current_time = static_cast<uint64_t>(_current_time);
{ {
auto write_hint = cfd->CalculateSSTWriteHint(0);
mutex_.Unlock(); mutex_.Unlock();
SequenceNumber earliest_write_conflict_snapshot; SequenceNumber earliest_write_conflict_snapshot;
@ -913,7 +914,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
cfd->ioptions()->compression_opts, paranoid_file_checks, cfd->ioptions()->compression_opts, paranoid_file_checks,
cfd->internal_stats(), TableFileCreationReason::kRecovery, cfd->internal_stats(), TableFileCreationReason::kRecovery,
&event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */,
-1 /* level */, current_time); -1 /* level */, current_time, write_hint);
LogFlush(immutable_db_options_.info_log); LogFlush(immutable_db_options_.info_log);
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]" "[%s] [WriteLevel0TableForRecovery]"
@ -1007,6 +1008,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
return s; return s;
} }
impl->mutex_.Lock(); impl->mutex_.Lock();
auto write_hint = impl->CalculateWALWriteHint();
// Handles create_if_missing, error_if_exists // Handles create_if_missing, error_if_exists
s = impl->Recover(column_families); s = impl->Recover(column_families);
if (s.ok()) { if (s.ok()) {
@ -1022,6 +1024,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), LogFileName(impl->immutable_db_options_.wal_dir, new_log_number),
&lfile, opt_env_options); &lfile, opt_env_options);
if (s.ok()) { if (s.ok()) {
lfile->SetWriteLifeTimeHint(write_hint);
lfile->SetPreallocationBlockSize( lfile->SetPreallocationBlockSize(
impl->GetWalPreallocateBlockSize(max_write_buffer_size)); impl->GetWalPreallocateBlockSize(max_write_buffer_size));
{ {

View file

@ -1168,6 +1168,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
BuildDBOptions(immutable_db_options_, mutable_db_options_); BuildDBOptions(immutable_db_options_, mutable_db_options_);
const auto preallocate_block_size = const auto preallocate_block_size =
GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
auto write_hint = CalculateWALWriteHint();
mutex_.Unlock(); mutex_.Unlock();
{ {
if (creating_new_log) { if (creating_new_log) {
@ -1193,6 +1194,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// use preallocate_block_size instead // use preallocate_block_size instead
// of calling GetWalPreallocateBlockSize() // of calling GetWalPreallocateBlockSize()
lfile->SetPreallocationBlockSize(preallocate_block_size); lfile->SetPreallocationBlockSize(preallocate_block_size);
lfile->SetWriteLifeTimeHint(write_hint);
unique_ptr<WritableFileWriter> file_writer( unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(lfile), opt_env_opt)); new WritableFileWriter(std::move(lfile), opt_env_opt));
new_log = new log::Writer( new_log = new log::Writer(

View file

@ -246,6 +246,7 @@ Status FlushJob::WriteLevel0Table() {
const uint64_t start_micros = db_options_.env->NowMicros(); const uint64_t start_micros = db_options_.env->NowMicros();
Status s; Status s;
{ {
auto write_hint = cfd_->CalculateSSTWriteHint(0);
db_mutex_->Unlock(); db_mutex_->Unlock();
if (log_buffer_) { if (log_buffer_) {
log_buffer_->FlushBufferToLog(); log_buffer_->FlushBufferToLog();
@ -315,7 +316,7 @@ Status FlushJob::WriteLevel0Table() {
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
Env::IO_HIGH, &table_properties_, 0 /* level */, current_time, Env::IO_HIGH, &table_properties_, 0 /* level */, current_time,
oldest_key_time); oldest_key_time, write_hint);
LogFlush(db_options_.info_log); LogFlush(db_options_.info_log);
} }
ROCKS_LOG_INFO(db_options_.info_log, ROCKS_LOG_INFO(db_options_.info_log,

View file

@ -401,6 +401,8 @@ class Repairer {
status = env_->GetCurrentTime(&_current_time); // ignore error status = env_->GetCurrentTime(&_current_time); // ignore error
const uint64_t current_time = static_cast<uint64_t>(_current_time); const uint64_t current_time = static_cast<uint64_t>(_current_time);
SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
auto write_hint = cfd->CalculateSSTWriteHint(0);
status = BuildTable( status = BuildTable(
dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
env_options_, table_cache_, iter.get(), env_options_, table_cache_, iter.get(),
@ -411,7 +413,7 @@ class Repairer {
CompressionOptions(), false, nullptr /* internal_stats */, CompressionOptions(), false, nullptr /* internal_stats */,
TableFileCreationReason::kRecovery, nullptr /* event_logger */, TableFileCreationReason::kRecovery, nullptr /* event_logger */,
0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
-1 /* level */, current_time); -1 /* level */, current_time, write_hint);
ROCKS_LOG_INFO(db_options_.info_log, ROCKS_LOG_INFO(db_options_.info_log,
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
log, counter, meta.fd.GetNumber(), log, counter, meta.fd.GetNumber(),

16
env/io_posix.cc vendored
View file

@ -35,6 +35,11 @@
#include "util/string_util.h" #include "util/string_util.h"
#include "util/sync_point.h" #include "util/sync_point.h"
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif
namespace rocksdb { namespace rocksdb {
// A wrapper for fadvise, if the platform doesn't support fadvise, // A wrapper for fadvise, if the platform doesn't support fadvise,
@ -858,6 +863,17 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
uint64_t PosixWritableFile::GetFileSize() { return filesize_; } uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
if (hint == write_hint_) {
return;
}
if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
write_hint_ = hint;
}
#endif
}
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
if (use_direct_io()) { if (use_direct_io()) {
return Status::OK(); return Status::OK();

1
env/io_posix.h vendored
View file

@ -132,6 +132,7 @@ class PosixWritableFile : public WritableFile {
virtual Status Fsync() override; virtual Status Fsync() override;
virtual bool IsSyncThreadSafe() const override; virtual bool IsSyncThreadSafe() const override;
virtual bool use_direct_io() const override { return use_direct_io_; } virtual bool use_direct_io() const override { return use_direct_io_; }
virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
virtual uint64_t GetFileSize() override; virtual uint64_t GetFileSize() override;
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
virtual size_t GetRequiredBufferAlignment() const override { virtual size_t GetRequiredBufferAlignment() const override {

View file

@ -152,6 +152,16 @@ class Env {
unique_ptr<RandomAccessFile>* result, unique_ptr<RandomAccessFile>* result,
const EnvOptions& options) const EnvOptions& options)
= 0; = 0;
// These values match Linux definition
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
enum WriteLifeTimeHint {
WLTH_NOT_SET = 0, // No hint information set
WLTH_NONE, // No hints about write life time
WLTH_SHORT, // Data written has a short life time
WLTH_MEDIUM, // Data written has a medium life time
WLTH_LONG, // Data written has a long life time
WLTH_EXTREME, // Data written has an extremely long life time
};
// Create an object that writes to a new file with the specified // Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a // name. Deletes any existing file with the same name and creates a
@ -573,7 +583,8 @@ class WritableFile {
WritableFile() WritableFile()
: last_preallocated_block_(0), : last_preallocated_block_(0),
preallocation_block_size_(0), preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL) { io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET) {
} }
virtual ~WritableFile(); virtual ~WritableFile();
@ -650,6 +661,11 @@ class WritableFile {
virtual Env::IOPriority GetIOPriority() { return io_priority_; } virtual Env::IOPriority GetIOPriority() { return io_priority_; }
virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
write_hint_ = hint;
}
virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
/* /*
* Get the size of valid data in the file. * Get the size of valid data in the file.
*/ */
@ -738,6 +754,7 @@ class WritableFile {
friend class WritableFileMirror; friend class WritableFileMirror;
Env::IOPriority io_priority_; Env::IOPriority io_priority_;
Env::WriteLifeTimeHint write_hint_;
}; };
// A file abstraction for random reading and writing. // A file abstraction for random reading and writing.