diff --git a/db/builder.cc b/db/builder.cc index 11b8fc783e..b91cbe6c76 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -75,7 +75,7 @@ Status BuildTable( InternalStats* internal_stats, TableFileCreationReason reason, EventLogger* event_logger, int job_id, const Env::IOPriority io_priority, TableProperties* table_properties, int level, const uint64_t creation_time, - const uint64_t oldest_key_time) { + const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); @@ -117,6 +117,7 @@ Status BuildTable( return s; } file->SetIOPriority(io_priority); + file->SetWriteLifeTimeHint(write_hint); file_writer.reset(new WritableFileWriter(std::move(file), env_options, ioptions.statistics)); diff --git a/db/builder.h b/db/builder.h index fa96e12d2b..d83644499b 100644 --- a/db/builder.h +++ b/db/builder.h @@ -79,6 +79,7 @@ extern Status BuildTable( EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0); + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET); } // namespace rocksdb diff --git a/db/column_family.cc b/db/column_family.cc index 3095203021..81b63262ec 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1015,6 +1015,24 @@ Status ColumnFamilyData::SetOptions( } #endif // ROCKSDB_LITE +// REQUIRES: DB mutex held +Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { + if (initial_cf_options_.compaction_style != kCompactionStyleLevel) { + return Env::WLTH_NOT_SET; + } + if (level == 0) { + return Env::WLTH_MEDIUM; + } + int base_level = current_->storage_info()->base_level(); + + // L1: medium, L2: long, ... + if (level - base_level >= 2) { + return Env::WLTH_EXTREME; + } + return static_cast(level - base_level + + static_cast(Env::WLTH_MEDIUM)); +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, diff --git a/db/column_family.h b/db/column_family.h index d7e6c24ae5..2092deed4d 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -339,6 +339,8 @@ class ColumnFamilyData { bool initialized() const { return initialized_.load(); } + Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 381ebf227c..ba8ec1be45 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -297,7 +297,8 @@ CompactionJob::CompactionJob( table_cache_(std::move(table_cache)), event_logger_(event_logger), paranoid_file_checks_(paranoid_file_checks), - measure_io_stats_(measure_io_stats) { + measure_io_stats_(measure_io_stats), + write_hint_(Env::WLTH_NOT_SET) { assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, @@ -368,6 +369,8 @@ void CompactionJob::Prepare() { assert(c->column_family_data()->current()->storage_info() ->NumLevelFiles(compact_->compaction->level()) > 0); + write_hint_ = c->column_family_data()->CalculateSSTWriteHint( + c->output_level()); // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); @@ -1305,6 +1308,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->outputs.push_back(out); writable_file->SetIOPriority(Env::IO_LOW); + writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); sub_compact->outfile.reset(new WritableFileWriter( diff --git a/db/compaction_job.h b/db/compaction_job.h index 498655a083..e92991d497 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -167,6 +167,7 @@ class CompactionJob { std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; + Env::WriteLifeTimeHint write_hint_; }; } // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl.h index bef2e5c256..f26e543447 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1316,6 +1316,9 @@ class DBImpl : public DB { bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { + return Env::WLTH_SHORT; + } // When set, we use a seprate queue for writes that dont write to memtable. In // 2PC these are the writes at Prepare phase. diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 64854b4413..5186c90abf 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -893,6 +893,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, const uint64_t current_time = static_cast(_current_time); { + auto write_hint = cfd->CalculateSSTWriteHint(0); mutex_.Unlock(); SequenceNumber earliest_write_conflict_snapshot; @@ -913,7 +914,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, cfd->ioptions()->compression_opts, paranoid_file_checks, cfd->internal_stats(), TableFileCreationReason::kRecovery, &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time); + -1 /* level */, current_time, write_hint); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1007,6 +1008,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, return s; } impl->mutex_.Lock(); + auto write_hint = impl->CalculateWALWriteHint(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); if (s.ok()) { @@ -1022,6 +1024,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), &lfile, opt_env_options); if (s.ok()) { + lfile->SetWriteLifeTimeHint(write_hint); lfile->SetPreallocationBlockSize( impl->GetWalPreallocateBlockSize(max_write_buffer_size)); { diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 67ebc08fa6..decbf215f8 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -1168,6 +1168,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { BuildDBOptions(immutable_db_options_, mutable_db_options_); const auto preallocate_block_size = GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + auto write_hint = CalculateWALWriteHint(); mutex_.Unlock(); { if (creating_new_log) { @@ -1193,6 +1194,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // use preallocate_block_size instead // of calling GetWalPreallocateBlockSize() lfile->SetPreallocationBlockSize(preallocate_block_size); + lfile->SetWriteLifeTimeHint(write_hint); unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_opt)); new_log = new log::Writer( diff --git a/db/flush_job.cc b/db/flush_job.cc index cd0af167a4..41a97f5870 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -246,6 +246,7 @@ Status FlushJob::WriteLevel0Table() { const uint64_t start_micros = db_options_.env->NowMicros(); Status s; { + auto write_hint = cfd_->CalculateSSTWriteHint(0); db_mutex_->Unlock(); if (log_buffer_) { log_buffer_->FlushBufferToLog(); @@ -315,7 +316,7 @@ Status FlushJob::WriteLevel0Table() { mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, Env::IO_HIGH, &table_properties_, 0 /* level */, current_time, - oldest_key_time); + oldest_key_time, write_hint); LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, diff --git a/db/repair.cc b/db/repair.cc index f88720878c..069c5adc9a 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -401,6 +401,8 @@ class Repairer { status = env_->GetCurrentTime(&_current_time); // ignore error const uint64_t current_time = static_cast(_current_time); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); + + auto write_hint = cfd->CalculateSSTWriteHint(0); status = BuildTable( dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_, iter.get(), @@ -411,7 +413,7 @@ class Repairer { CompressionOptions(), false, nullptr /* internal_stats */, TableFileCreationReason::kRecovery, nullptr /* event_logger */, 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time); + -1 /* level */, current_time, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), diff --git a/env/io_posix.cc b/env/io_posix.cc index c5b14d3eff..2bbe80fe17 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -35,6 +35,11 @@ #include "util/string_util.h" #include "util/sync_point.h" +#if defined(OS_LINUX) && !defined(F_SET_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#endif + namespace rocksdb { // A wrapper for fadvise, if the platform doesn't support fadvise, @@ -858,6 +863,17 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; } uint64_t PosixWritableFile::GetFileSize() { return filesize_; } +void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { +#ifdef OS_LINUX + if (hint == write_hint_) { + return; + } + if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) { + write_hint_ = hint; + } +#endif +} + Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { if (use_direct_io()) { return Status::OK(); diff --git a/env/io_posix.h b/env/io_posix.h index 69c98438f2..804864cd1f 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -132,6 +132,7 @@ class PosixWritableFile : public WritableFile { virtual Status Fsync() override; virtual bool IsSyncThreadSafe() const override; virtual bool use_direct_io() const override { return use_direct_io_; } + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 0e70ef372a..81dc60a4d0 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -152,6 +152,16 @@ class Env { unique_ptr* result, const EnvOptions& options) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + WLTH_NOT_SET = 0, // No hint information set + WLTH_NONE, // No hints about write life time + WLTH_SHORT, // Data written has a short life time + WLTH_MEDIUM, // Data written has a medium life time + WLTH_LONG, // Data written has a long life time + WLTH_EXTREME, // Data written has an extremely long life time + }; // Create an object that writes to a new file with the specified // name. Deletes any existing file with the same name and creates a @@ -573,7 +583,8 @@ class WritableFile { WritableFile() : last_preallocated_block_(0), preallocation_block_size_(0), - io_priority_(Env::IO_TOTAL) { + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET) { } virtual ~WritableFile(); @@ -650,6 +661,11 @@ class WritableFile { virtual Env::IOPriority GetIOPriority() { return io_priority_; } + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } /* * Get the size of valid data in the file. */ @@ -738,6 +754,7 @@ class WritableFile { friend class WritableFileMirror; Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; }; // A file abstraction for random reading and writing.