Summary:
Add a simple policy for NVMe write time life hint
Closes https://github.com/facebook/rocksdb/pull/3095

Differential Revision: D6298030

Pulled By: shligit

fbshipit-source-id: 9a72a42e32e92193af11599eb71f0cf77448e24d
This commit is contained in:
Shaohua Li 2017-11-10 09:25:26 -08:00 committed by Facebook Github Bot
parent f1c5eaba56
commit eefd75a228
14 changed files with 79 additions and 7 deletions

View File

@ -75,7 +75,7 @@ Status BuildTable(
InternalStats* internal_stats, TableFileCreationReason reason,
EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
TableProperties* table_properties, int level, const uint64_t creation_time,
const uint64_t oldest_key_time) {
const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint) {
assert((column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
column_family_name.empty());
@ -117,6 +117,7 @@ Status BuildTable(
return s;
}
file->SetIOPriority(io_priority);
file->SetWriteLifeTimeHint(write_hint);
file_writer.reset(new WritableFileWriter(std::move(file), env_options,
ioptions.statistics));

View File

@ -79,6 +79,7 @@ extern Status BuildTable(
EventLogger* event_logger = nullptr, int job_id = 0,
const Env::IOPriority io_priority = Env::IO_HIGH,
TableProperties* table_properties = nullptr, int level = -1,
const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0);
const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET);
} // namespace rocksdb

View File

@ -1015,6 +1015,24 @@ Status ColumnFamilyData::SetOptions(
}
#endif // ROCKSDB_LITE
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
return Env::WLTH_NOT_SET;
}
if (level == 0) {
return Env::WLTH_MEDIUM;
}
int base_level = current_->storage_info()->base_level();
// L1: medium, L2: long, ...
if (level - base_level >= 2) {
return Env::WLTH_EXTREME;
}
return static_cast<Env::WriteLifeTimeHint>(level - base_level +
static_cast<int>(Env::WLTH_MEDIUM));
}
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options,
const EnvOptions& env_options,

View File

@ -339,6 +339,8 @@ class ColumnFamilyData {
bool initialized() const { return initialized_.load(); }
Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
private:
friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name,

View File

@ -297,7 +297,8 @@ CompactionJob::CompactionJob(
table_cache_(std::move(table_cache)),
event_logger_(event_logger),
paranoid_file_checks_(paranoid_file_checks),
measure_io_stats_(measure_io_stats) {
measure_io_stats_(measure_io_stats),
write_hint_(Env::WLTH_NOT_SET) {
assert(log_buffer_ != nullptr);
const auto* cfd = compact_->compaction->column_family_data();
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
@ -368,6 +369,8 @@ void CompactionJob::Prepare() {
assert(c->column_family_data()->current()->storage_info()
->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = c->column_family_data()->CalculateSSTWriteHint(
c->output_level());
// Is this compaction producing files at the bottommost level?
bottommost_level_ = c->bottommost_level();
@ -1305,6 +1308,7 @@ Status CompactionJob::OpenCompactionOutputFile(
sub_compact->outputs.push_back(out);
writable_file->SetIOPriority(Env::IO_LOW);
writable_file->SetWriteLifeTimeHint(write_hint_);
writable_file->SetPreallocationBlockSize(static_cast<size_t>(
sub_compact->compaction->OutputFilePreallocationSize()));
sub_compact->outfile.reset(new WritableFileWriter(

View File

@ -167,6 +167,7 @@ class CompactionJob {
std::vector<Slice> boundaries_;
// Stores the approx size of keys covered in the range of each subcompaction
std::vector<uint64_t> sizes_;
Env::WriteLifeTimeHint write_hint_;
};
} // namespace rocksdb

View File

@ -1316,6 +1316,9 @@ class DBImpl : public DB {
bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
Env::WriteLifeTimeHint CalculateWALWriteHint() {
return Env::WLTH_SHORT;
}
// When set, we use a seprate queue for writes that dont write to memtable. In
// 2PC these are the writes at Prepare phase.

View File

@ -893,6 +893,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
const uint64_t current_time = static_cast<uint64_t>(_current_time);
{
auto write_hint = cfd->CalculateSSTWriteHint(0);
mutex_.Unlock();
SequenceNumber earliest_write_conflict_snapshot;
@ -913,7 +914,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
cfd->ioptions()->compression_opts, paranoid_file_checks,
cfd->internal_stats(), TableFileCreationReason::kRecovery,
&event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */,
-1 /* level */, current_time);
-1 /* level */, current_time, write_hint);
LogFlush(immutable_db_options_.info_log);
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
@ -1007,6 +1008,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
return s;
}
impl->mutex_.Lock();
auto write_hint = impl->CalculateWALWriteHint();
// Handles create_if_missing, error_if_exists
s = impl->Recover(column_families);
if (s.ok()) {
@ -1022,6 +1024,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
LogFileName(impl->immutable_db_options_.wal_dir, new_log_number),
&lfile, opt_env_options);
if (s.ok()) {
lfile->SetWriteLifeTimeHint(write_hint);
lfile->SetPreallocationBlockSize(
impl->GetWalPreallocateBlockSize(max_write_buffer_size));
{

View File

@ -1168,6 +1168,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
BuildDBOptions(immutable_db_options_, mutable_db_options_);
const auto preallocate_block_size =
GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
auto write_hint = CalculateWALWriteHint();
mutex_.Unlock();
{
if (creating_new_log) {
@ -1193,6 +1194,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// use preallocate_block_size instead
// of calling GetWalPreallocateBlockSize()
lfile->SetPreallocationBlockSize(preallocate_block_size);
lfile->SetWriteLifeTimeHint(write_hint);
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(lfile), opt_env_opt));
new_log = new log::Writer(

View File

@ -246,6 +246,7 @@ Status FlushJob::WriteLevel0Table() {
const uint64_t start_micros = db_options_.env->NowMicros();
Status s;
{
auto write_hint = cfd_->CalculateSSTWriteHint(0);
db_mutex_->Unlock();
if (log_buffer_) {
log_buffer_->FlushBufferToLog();
@ -315,7 +316,7 @@ Status FlushJob::WriteLevel0Table() {
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
Env::IO_HIGH, &table_properties_, 0 /* level */, current_time,
oldest_key_time);
oldest_key_time, write_hint);
LogFlush(db_options_.info_log);
}
ROCKS_LOG_INFO(db_options_.info_log,

View File

@ -401,6 +401,8 @@ class Repairer {
status = env_->GetCurrentTime(&_current_time); // ignore error
const uint64_t current_time = static_cast<uint64_t>(_current_time);
SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
auto write_hint = cfd->CalculateSSTWriteHint(0);
status = BuildTable(
dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
env_options_, table_cache_, iter.get(),
@ -411,7 +413,7 @@ class Repairer {
CompressionOptions(), false, nullptr /* internal_stats */,
TableFileCreationReason::kRecovery, nullptr /* event_logger */,
0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
-1 /* level */, current_time);
-1 /* level */, current_time, write_hint);
ROCKS_LOG_INFO(db_options_.info_log,
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
log, counter, meta.fd.GetNumber(),

16
env/io_posix.cc vendored
View File

@ -35,6 +35,11 @@
#include "util/string_util.h"
#include "util/sync_point.h"
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif
namespace rocksdb {
// A wrapper for fadvise, if the platform doesn't support fadvise,
@ -858,6 +863,17 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
if (hint == write_hint_) {
return;
}
if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
write_hint_ = hint;
}
#endif
}
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
if (use_direct_io()) {
return Status::OK();

1
env/io_posix.h vendored
View File

@ -132,6 +132,7 @@ class PosixWritableFile : public WritableFile {
virtual Status Fsync() override;
virtual bool IsSyncThreadSafe() const override;
virtual bool use_direct_io() const override { return use_direct_io_; }
virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
virtual uint64_t GetFileSize() override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
virtual size_t GetRequiredBufferAlignment() const override {

View File

@ -152,6 +152,16 @@ class Env {
unique_ptr<RandomAccessFile>* result,
const EnvOptions& options)
= 0;
// These values match Linux definition
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
enum WriteLifeTimeHint {
WLTH_NOT_SET = 0, // No hint information set
WLTH_NONE, // No hints about write life time
WLTH_SHORT, // Data written has a short life time
WLTH_MEDIUM, // Data written has a medium life time
WLTH_LONG, // Data written has a long life time
WLTH_EXTREME, // Data written has an extremely long life time
};
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
@ -573,7 +583,8 @@ class WritableFile {
WritableFile()
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL) {
io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET) {
}
virtual ~WritableFile();
@ -650,6 +661,11 @@ class WritableFile {
virtual Env::IOPriority GetIOPriority() { return io_priority_; }
virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
write_hint_ = hint;
}
virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
/*
* Get the size of valid data in the file.
*/
@ -738,6 +754,7 @@ class WritableFile {
friend class WritableFileMirror;
Env::IOPriority io_priority_;
Env::WriteLifeTimeHint write_hint_;
};
// A file abstraction for random reading and writing.