diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 35269fdb50..dceb90cee5 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -34,9 +34,9 @@ BlobFileBuilder::BlobFileBuilder( VersionSet* versions, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, - std::string db_id, std::string db_session_id, int job_id, - uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const WriteOptions* write_options, std::string db_id, + std::string db_session_id, int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, BlobFileCreationReason creation_reason, @@ -44,18 +44,18 @@ BlobFileBuilder::BlobFileBuilder( std::vector* blob_file_additions) : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, immutable_options, mutable_cf_options, file_options, - db_id, db_session_id, job_id, column_family_id, - column_family_name, io_priority, write_hint, io_tracer, - blob_callback, creation_reason, blob_file_paths, - blob_file_additions) {} + write_options, db_id, db_session_id, job_id, + column_family_id, column_family_name, write_hint, + io_tracer, blob_callback, creation_reason, + blob_file_paths, blob_file_additions) {} BlobFileBuilder::BlobFileBuilder( std::function file_number_generator, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, - std::string db_id, std::string db_session_id, int job_id, - uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const WriteOptions* write_options, std::string db_id, + std::string db_session_id, int job_id, uint32_t 
column_family_id, + const std::string& column_family_name, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, BlobFileCreationReason creation_reason, @@ -69,12 +69,12 @@ BlobFileBuilder::BlobFileBuilder( blob_compression_type_(mutable_cf_options->blob_compression_type), prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache), file_options_(file_options), + write_options_(write_options), db_id_(std::move(db_id)), db_session_id_(std::move(db_session_id)), job_id_(job_id), column_family_id_(column_family_id), column_family_name_(column_family_name), - io_priority_(io_priority), write_hint_(write_hint), io_tracer_(io_tracer), blob_callback_(blob_callback), @@ -87,6 +87,7 @@ BlobFileBuilder::BlobFileBuilder( assert(fs_); assert(immutable_options_); assert(file_options_); + assert(write_options_); assert(blob_file_paths_); assert(blob_file_paths_->empty()); assert(blob_file_additions_); @@ -207,14 +208,14 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { blob_file_paths_->emplace_back(std::move(blob_file_path)); assert(file); - file->SetIOPriority(io_priority_); + file->SetIOPriority(write_options_->rate_limiter_priority); file->SetWriteLifeTimeHint(write_hint_); FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; Statistics* const statistics = immutable_options_->stats; std::unique_ptr file_writer(new WritableFileWriter( std::move(file), blob_file_paths_->back(), *file_options_, immutable_options_->clock, io_tracer_, statistics, - immutable_options_->listeners, + Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS, immutable_options_->listeners, immutable_options_->file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kBlobFile), false)); @@ -231,7 +232,7 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { expiration_range); { - Status s = blob_log_writer->WriteHeader(header); + Status s = blob_log_writer->WriteHeader(*write_options_, header); TEST_SYNC_POINT_CALLBACK( 
"BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); @@ -296,7 +297,8 @@ Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, uint64_t key_offset = 0; - Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + Status s = + writer_->AddRecord(*write_options_, key, blob, &key_offset, blob_offset); TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); @@ -321,7 +323,8 @@ Status BlobFileBuilder::CloseBlobFile() { std::string checksum_method; std::string checksum_value; - Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + Status s = writer_->AppendFooter(*write_options_, footer, &checksum_method, + &checksum_value); TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h index 8e7aab502d..6ba7181aa0 100644 --- a/db/blob/blob_file_builder.h +++ b/db/blob/blob_file_builder.h @@ -13,6 +13,7 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/compression_type.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/types.h" @@ -36,11 +37,11 @@ class BlobFileBuilder { BlobFileBuilder(VersionSet* versions, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, - const FileOptions* file_options, std::string db_id, + const FileOptions* file_options, + const WriteOptions* write_options, std::string db_id, std::string db_session_id, int job_id, uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, @@ -51,11 +52,11 @@ class BlobFileBuilder { BlobFileBuilder(std::function file_number_generator, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, - const FileOptions* file_options, 
std::string db_id, + const FileOptions* file_options, + const WriteOptions* write_options, std::string db_id, std::string db_session_id, int job_id, uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, @@ -92,12 +93,12 @@ class BlobFileBuilder { CompressionType blob_compression_type_; PrepopulateBlobCache prepopulate_blob_cache_; const FileOptions* file_options_; + const WriteOptions* write_options_; const std::string db_id_; const std::string db_session_id_; int job_id_; uint32_t column_family_id_; std::string column_family_name_; - Env::IOPriority io_priority_; Env::WriteLifeTimeHint write_hint_; std::shared_ptr io_tracer_; BlobFileCompletionCallback* blob_callback_; diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 5882e219fe..8a2ecff13a 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -43,6 +43,7 @@ class BlobFileBuilderTest : public testing::Test { mock_env_.reset(MockEnv::Create(Env::Default())); fs_ = mock_env_->GetFileSystem().get(); clock_ = mock_env_->GetSystemClock().get(); + write_options_.rate_limiter_priority = Env::IO_HIGH; } void VerifyBlobFile(uint64_t blob_file_number, @@ -113,6 +114,7 @@ class BlobFileBuilderTest : public testing::Test { FileSystem* fs_; SystemClock* clock_; FileOptions file_options_; + WriteOptions write_options_; }; TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { @@ -136,7 +138,6 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -144,8 +145,8 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { BlobFileBuilder builder( 
TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -221,7 +222,6 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -229,8 +229,8 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -309,7 +309,6 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -317,8 +316,8 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, 
+ &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -364,7 +363,6 @@ TEST_F(BlobFileBuilderTest, Compression) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -372,8 +370,8 @@ TEST_F(BlobFileBuilderTest, Compression) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -448,7 +446,6 @@ TEST_F(BlobFileBuilderTest, CompressionError) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -456,8 +453,8 @@ TEST_F(BlobFileBuilderTest, CompressionError) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, 
BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -528,7 +525,6 @@ TEST_F(BlobFileBuilderTest, Checksum) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -536,8 +532,8 @@ TEST_F(BlobFileBuilderTest, Checksum) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -589,11 +585,13 @@ class BlobFileBuilderIOErrorTest BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) { mock_env_.reset(MockEnv::Create(Env::Default())); fs_ = mock_env_->GetFileSystem().get(); + write_options_.rate_limiter_priority = Env::IO_HIGH; } std::unique_ptr mock_env_; FileSystem* fs_; FileOptions file_options_; + WriteOptions write_options_; std::string sync_point_; }; @@ -626,7 +624,6 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -634,8 +631,8 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" 
/*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc index 8c3c56de9b..edfeb7e810 100644 --- a/db/blob/blob_file_cache_test.cc +++ b/db/blob/blob_file_cache_test.cc @@ -57,7 +57,7 @@ void WriteBlobFile(uint32_t column_family_id, BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); constexpr char key[] = "key"; constexpr char blob[] = "blob"; @@ -67,7 +67,8 @@ void WriteBlobFile(uint32_t column_family_id, uint64_t key_offset = 0; uint64_t blob_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset, + &blob_offset)); BlobLogFooter footer; footer.blob_count = 1; @@ -76,8 +77,8 @@ void WriteBlobFile(uint32_t column_family_id, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } } // anonymous namespace diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index b6049d1ef5..b42b866859 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -63,7 +63,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, BlobLogHeader header(column_family_id, compression, has_ttl, expiration_range_header); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); std::vector compressed_blobs(num); std::vector blobs_to_write(num); @@ -91,7 +91,8 @@ void 
WriteBlobFile(const ImmutableOptions& immutable_options, for (size_t i = 0; i < num; ++i) { uint64_t key_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), keys[i], + blobs_to_write[i], &key_offset, &blob_offsets[i])); } @@ -101,8 +102,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } // Creates a test blob file with a single blob in it. Note: this method @@ -473,7 +474,7 @@ TEST_F(BlobFileReaderTest, Malformed) { BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); } constexpr HistogramImpl* blob_file_read_hist = nullptr; diff --git a/db/blob/blob_log_writer.cc b/db/blob/blob_log_writer.cc index bf5ef27c1d..d1768f9020 100644 --- a/db/blob/blob_log_writer.cc +++ b/db/blob/blob_log_writer.cc @@ -33,35 +33,49 @@ BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, BlobLogWriter::~BlobLogWriter() = default; -Status BlobLogWriter::Sync() { +Status BlobLogWriter::Sync(const WriteOptions& write_options) { TEST_SYNC_POINT("BlobLogWriter::Sync"); StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); - Status s = dest_->Sync(use_fsync_); - RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Sync(opts, use_fsync_); + } + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + } return s; } -Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { +Status BlobLogWriter::WriteHeader(const WriteOptions& write_options, + 
BlobLogHeader& header) { assert(block_offset_ == 0); assert(last_elem_type_ == kEtNone); std::string str; header.EncodeTo(&str); - Status s = dest_->Append(Slice(str)); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Append(opts, Slice(str)); + } if (s.ok()) { block_offset_ += str.size(); if (do_flush_) { - s = dest_->Flush(); + s = dest_->Flush(opts); } } last_elem_type_ = kEtFileHdr; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogHeader::kSize); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + } return s; } -Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, +Status BlobLogWriter::AppendFooter(const WriteOptions& write_options, + BlobLogFooter& footer, std::string* checksum_method, std::string* checksum_value) { assert(block_offset_ != 0); @@ -75,14 +89,17 @@ Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, s.PermitUncheckedError(); return Status::IOError("Seen Error. 
Skip closing."); } else { - s = dest_->Append(Slice(str)); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Append(opts, Slice(str)); + } if (s.ok()) { block_offset_ += str.size(); - - s = Sync(); + s = Sync(write_options); if (s.ok()) { - s = dest_->Close(); + s = dest_->Close(opts); if (s.ok()) { assert(!!checksum_method == !!checksum_value); @@ -111,12 +128,15 @@ Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, } last_elem_type_ = kEtFileFooter; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogFooter::kSize); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + } return s; } -Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, +Status BlobLogWriter::AddRecord(const WriteOptions& write_options, + const Slice& key, const Slice& val, uint64_t expiration, uint64_t* key_offset, uint64_t* blob_offset) { assert(block_offset_ != 0); @@ -125,11 +145,13 @@ Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, std::string buf; ConstructBlobHeader(&buf, key, val, expiration); - Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + Status s = + EmitPhysicalRecord(write_options, buf, key, val, key_offset, blob_offset); return s; } -Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, +Status BlobLogWriter::AddRecord(const WriteOptions& write_options, + const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { assert(block_offset_ != 0); assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); @@ -137,7 +159,8 @@ Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, std::string buf; ConstructBlobHeader(&buf, key, val, 0); - Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + Status s = + EmitPhysicalRecord(write_options, buf, key, val, key_offset, blob_offset); return s; } @@ -150,28 +173,34 @@ void 
BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, record.EncodeHeaderTo(buf); } -Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, +Status BlobLogWriter::EmitPhysicalRecord(const WriteOptions& write_options, + const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { - StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); - Status s = dest_->Append(Slice(headerbuf)); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); if (s.ok()) { - s = dest_->Append(key); + s = dest_->Append(opts, Slice(headerbuf)); } if (s.ok()) { - s = dest_->Append(val); + s = dest_->Append(opts, key); + } + if (s.ok()) { + s = dest_->Append(opts, val); } if (do_flush_ && s.ok()) { - s = dest_->Flush(); + s = dest_->Flush(opts); } *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; *blob_offset = *key_offset + key.size(); block_offset_ = *blob_offset + val.size(); last_elem_type_ = kEtRecord; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogRecord::kHeaderSize + key.size() + val.size()); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + } return s; } diff --git a/db/blob/blob_log_writer.h b/db/blob/blob_log_writer.h index c1f9f31ad0..0ba4f9c2a2 100644 --- a/db/blob/blob_log_writer.h +++ b/db/blob/blob_log_writer.h @@ -43,20 +43,24 @@ class BlobLogWriter { static void ConstructBlobHeader(std::string* buf, const Slice& key, const Slice& val, uint64_t expiration); - Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, + Status AddRecord(const WriteOptions& write_options, const Slice& key, + const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, - uint64_t* key_offset, uint64_t* blob_offset); + Status AddRecord(const WriteOptions& 
write_options, const Slice& key, + const Slice& val, uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset); - Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, + Status EmitPhysicalRecord(const WriteOptions& write_options, + const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method, + Status AppendFooter(const WriteOptions& write_options, BlobLogFooter& footer, + std::string* checksum_method, std::string* checksum_value); - Status WriteHeader(BlobLogHeader& header); + Status WriteHeader(const WriteOptions& write_options, BlobLogHeader& header); WritableFileWriter* file() { return dest_.get(); } @@ -64,7 +68,7 @@ class BlobLogWriter { uint64_t get_log_number() const { return log_number_; } - Status Sync(); + Status Sync(const WriteOptions& write_options); private: std::unique_ptr dest_; diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index 258d2da5e1..9fc1931c1e 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -65,7 +65,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, BlobLogHeader header(column_family_id, compression, has_ttl, expiration_range_header); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); std::vector compressed_blobs(num); std::vector blobs_to_write(num); @@ -93,7 +93,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, for (size_t i = 0; i < num; ++i) { uint64_t key_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), keys[i], + blobs_to_write[i], &key_offset, &blob_offsets[i])); } @@ -103,8 +104,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - 
blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } } // anonymous namespace diff --git a/db/builder.cc b/db/builder.cc index d3040ee9e2..f9cc2a5eac 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -32,6 +32,7 @@ #include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" @@ -57,8 +58,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, const ReadOptions& read_options, - TableCache* table_cache, InternalIterator* iter, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -69,9 +70,8 @@ Status BuildTable( IOStatus* io_status, const std::shared_ptr& io_tracer, BlobFileCreationReason blob_creation_reason, const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger, - int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, - const std::string* full_history_ts_low, + int job_id, TableProperties* table_properties, + Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low, BlobFileCompletionCallback* blob_callback, Version* version, uint64_t* num_input_entries, uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { @@ -164,11 +164,11 @@ Status BuildTable( table_file_created = true; FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; - file->SetIOPriority(io_priority); + file->SetIOPriority(tboptions.write_options.rate_limiter_priority); 
file->SetWriteLifeTimeHint(write_hint); file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, ioptions.clock, io_tracer, - ioptions.stats, ioptions.listeners, + ioptions.stats, Histograms::SST_WRITE_MICROS, ioptions.listeners, ioptions.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); @@ -188,10 +188,11 @@ Status BuildTable( blob_file_additions) ? new BlobFileBuilder( versions, fs, &ioptions, &mutable_cf_options, &file_options, - tboptions.db_id, tboptions.db_session_id, job_id, - tboptions.column_family_id, tboptions.column_family_name, - io_priority, write_hint, io_tracer, blob_callback, - blob_creation_reason, &blob_file_paths, blob_file_additions) + &(tboptions.write_options), tboptions.db_id, + tboptions.db_session_id, job_id, tboptions.column_family_id, + tboptions.column_family_name, write_hint, io_tracer, + blob_callback, blob_creation_reason, &blob_file_paths, + blob_file_additions) : nullptr); const std::atomic kManualCompactionCanceledFalse{false}; @@ -244,7 +245,11 @@ Status BuildTable( } // TODO(noetzli): Update stats after flush, too. 
- if (io_priority == Env::IO_HIGH && + // TODO(hx235): Replace `rate_limiter_priority` with `io_activity` for + // flush IO in repair when we have an `Env::IOActivity` enum for it + if ((tboptions.write_options.io_activity == Env::IOActivity::kFlush || + tboptions.write_options.io_activity == Env::IOActivity::kDBOpen || + tboptions.write_options.rate_limiter_priority == Env::IO_HIGH) && IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) { ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); @@ -275,7 +280,7 @@ Status BuildTable( SizeApproximationOptions approx_opts; approx_opts.files_size_error_margin = 0.1; meta->compensated_range_deletion_size += versions->ApproximateSize( - approx_opts, read_options, version, kv.first.Encode(), + approx_opts, tboptions.read_options, version, kv.first.Encode(), tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, TableReaderCaller::kFlush); } @@ -346,13 +351,16 @@ Status BuildTable( // Finish and check for file errors TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); - if (s.ok() && !empty) { + IOOptions opts; + *io_status = + WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); + if (s.ok() && io_status->ok() && !empty) { StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); - *io_status = file_writer->Sync(ioptions.use_fsync); + *io_status = file_writer->Sync(opts, ioptions.use_fsync); } TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); if (s.ok() && io_status->ok() && !empty) { - *io_status = file_writer->Close(); + *io_status = file_writer->Close(opts); } if (s.ok() && io_status->ok() && !empty) { // Add the checksum information to file metadata. @@ -396,9 +404,9 @@ Status BuildTable( // No matter whether use_direct_io_for_flush_and_compaction is true, // the goal is to cache it here for further user reads. 
std::unique_ptr it(table_cache->NewIterator( - read_options, file_options, tboptions.internal_comparator, *meta, - nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, - nullptr, + tboptions.read_options, file_options, tboptions.internal_comparator, + *meta, nullptr /* range_del_agg */, + mutable_cf_options.prefix_extractor, nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, @@ -436,8 +444,13 @@ Status BuildTable( constexpr IODebugContext* dbg = nullptr; if (table_file_created) { - Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); - ignored.PermitUncheckedError(); + IOOptions opts; + Status prepare = + WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); + if (prepare.ok()) { + Status ignored = fs->DeleteFile(fname, opts, dbg); + ignored.PermitUncheckedError(); + } } assert(blob_file_additions || blob_file_paths.empty()); diff --git a/db/builder.h b/db/builder.h index 6a6a1866a1..96d87677bb 100644 --- a/db/builder.h +++ b/db/builder.h @@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, extern Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, const ReadOptions& read_options, - TableCache* table_cache, InternalIterator* iter, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -66,7 +66,6 @@ extern Status BuildTable( BlobFileCreationReason blob_creation_reason, const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger = nullptr, int job_id = 0, - const Env::IOPriority io_priority = Env::IO_HIGH, TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, const std::string* full_history_ts_low = nullptr, diff --git 
a/db/column_family.cc b/db/column_family.cc index 1e61dfab27..bad6ec889c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1168,7 +1168,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 99b099759d..9d1a45f5b7 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1130,6 +1130,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. read_options.total_order_seek = true; + const WriteOptions write_options(Env::IOPriority::IO_LOW, + Env::IOActivity::kCompaction); + // Remove the timestamps from boundaries because boundaries created in // GenSubcompactionBoundaries doesn't strip away the timestamp. size_t ts_sz = cfd->user_comparator()->timestamp_size(); @@ -1264,8 +1267,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { ? 
new BlobFileBuilder( versions_, fs_.get(), sub_compact->compaction->immutable_options(), - mutable_cf_options, &file_options_, db_id_, db_session_id_, - job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW, + mutable_cf_options, &file_options_, &write_options, db_id_, + db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_, io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction, &blob_file_paths, sub_compact->Current().GetBlobFileAdditionsPtr()) @@ -1710,6 +1713,8 @@ Status CompactionJob::InstallCompactionResults( db_mutex_->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); + auto* compaction = compact_->compaction; assert(compaction); @@ -1792,8 +1797,9 @@ Status CompactionJob::InstallCompactionResults( }; return versions_->LogAndApply( - compaction->column_family_data(), mutable_cf_options, read_options, edit, - db_mutex_, db_directory_, /*new_descriptor_log=*/false, + compaction->column_family_data(), mutable_cf_options, read_options, + write_options, edit, db_mutex_, db_directory_, + /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, manifest_wcb); } @@ -1943,13 +1949,17 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, sub_compact->compaction->immutable_options()->listeners; outputs.AssignFileWriter(new WritableFileWriter( std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_, - db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), + db_options_.stats, Histograms::SST_WRITE_MICROS, listeners, + db_options_.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); // TODO(hx235): pass in the correct `oldest_key_time` instead of `0` + const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); TableBuilderOptions tboptions( *cfd->ioptions(), 
*(sub_compact->compaction->mutable_cf_options()), - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + read_options, write_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), sub_compact->compaction->output_compression(), sub_compact->compaction->output_compression_opts(), cfd->GetID(), cfd->GetName(), sub_compact->compaction->output_level(), diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 886bcb6e18..bd805358eb 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -295,9 +295,12 @@ class CompactionJobTestBase : public testing::Test { Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(), &file_writer, nullptr); ASSERT_OK(s); + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder( cf_options_.table_factory->NewTableBuilder( TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_, + read_options, write_options, cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(), CompressionType::kNoCompression, @@ -394,7 +397,7 @@ class CompactionJobTestBase : public testing::Test { mutex_.Lock(); EXPECT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr)); + read_options_, write_options_, &edit, &mutex_, nullptr)); mutex_.Unlock(); } @@ -549,7 +552,7 @@ class CompactionJobTestBase : public testing::Test { /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", /*error_handler=*/nullptr, /*read_only=*/false)); compaction_job_stats_.Reset(); - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -568,11 +571,11 @@ class CompactionJobTestBase : public testing::Test { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = 
log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); @@ -736,6 +739,7 @@ class CompactionJobTestBase : public testing::Test { MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; const ReadOptions read_options_; + const WriteOptions write_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index eb76cd849a..9ad2b3a0d5 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -62,12 +62,15 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, Statistics* statistics, bool use_fsync) { IOStatus io_s; - if (input_status.ok()) { + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions( + WriteOptions(Env::IOActivity::kCompaction), opts); + if (input_status.ok() && io_s.ok()) { StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); - io_s = file_writer_->Sync(use_fsync); + io_s = file_writer_->Sync(opts, use_fsync); } if (input_status.ok() && io_s.ok()) { - io_s = file_writer_->Close(); + io_s = file_writer_->Close(opts); } if (input_status.ok() && io_s.ok()) { diff --git a/db/convenience.cc b/db/convenience.cc index 08bddc8e8f..9e78adc74e 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -34,7 +34,7 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; return VerifySstFileChecksum(options, env_options, read_options, file_path); } diff --git 
a/db/db_basic_test.cc b/db/db_basic_test.cc index 0c8ae60339..5f7b2a0b0e 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -3126,7 +3126,8 @@ TEST_F(DBBasicTest, LastSstFileNotInManifest) { // Manually add a sst file. constexpr uint64_t kSstFileNumber = 100; const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); - ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + ASSERT_OK(WriteStringToFile(env_, + /* data = */ "bad sst file content", /* fname = */ kSstFile, /* should_sync = */ true)); ASSERT_OK(env_->FileExists(kSstFile)); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 8084f02cf1..36b18f9a2b 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -333,8 +333,10 @@ Status DBImpl::Resume() { Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + WaitForBackgroundWork(); Status s; @@ -373,8 +375,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { assert(cfh); ColumnFamilyData* cfd = cfh->cfd(); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, cf_opts, read_options, write_options, + &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { io_s = versions_->io_status(); if (!io_s.ok()) { @@ -716,23 +718,26 @@ Status DBImpl::CloseHelper() { Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + // TODO: remove this. 
init_logger_creation_s_.PermitUncheckedError(); InstrumentedMutexLock closing_lock_guard(&closing_mutex_); - if (closed_) { - return; + if (!closed_) { + closed_ = true; + + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + s.PermitUncheckedError(); + } + + closing_status_ = CloseImpl(); + closing_status_.PermitUncheckedError(); } - - closed_ = true; - - { - const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); - s.PermitUncheckedError(); - } - - closing_status_ = CloseImpl(); - closing_status_.PermitUncheckedError(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void DBImpl::MaybeIgnoreError(Status* s) const { @@ -807,7 +812,9 @@ Status DBImpl::StartPeriodicTaskScheduler() { return s; } -Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { +Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, + const WriteOptions& write_options, + bool is_new_db) { options_mutex_.AssertHeld(); uint64_t min_preserve_seconds = std::numeric_limits::max(); @@ -890,7 +897,8 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { VersionEdit edit; edit.SetLastSequence(kMax); s = versions_->LogAndApplyToDefaultColumnFamily( - {}, &edit, &mutex_, directories_.GetDbDir()); + read_options, write_options, &edit, &mutex_, + directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -1000,6 +1008,7 @@ void DBImpl::PersistStats() { stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); if (s.ok()) { + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; wo.low_pri = true; wo.no_slowdown = true; @@ -1214,8 +1223,10 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority 
const ReadOptions read_options; + const WriteOptions write_options; + auto* cfd = static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { @@ -1238,14 +1249,15 @@ Status DBImpl::SetOptions( new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. VersionEdit dummy_edit; - s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, read_options, write_options, + &dummy_edit, &mutex_, directories_.GetDbDir()); // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); - persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); + persist_options_status = + WriteOptionsFile(write_options, true /*db_mutex_already_held*/); bg_cv_.SignalAll(); } } @@ -1424,7 +1436,8 @@ Status DBImpl::SetDBOptions( } write_thread_.ExitUnbatched(&w); } - persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); + persist_options_status = + WriteOptionsFile(WriteOptions(), true /*db_mutex_already_held*/); } else { // To get here, we must have had invalid options and will not attempt to // persist the options, which means the status is "OK/Uninitialized. 
@@ -1476,14 +1489,14 @@ int DBImpl::FindMinimumEmptyLevelFitting( return minimum_level; } -Status DBImpl::FlushWAL(bool sync) { +Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) { if (manual_wal_flush_) { IOStatus io_s; { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; - io_s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(write_options); } if (!io_s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", @@ -1556,11 +1569,22 @@ Status DBImpl::SyncWAL() { RecordTick(stats_, WAL_FILE_SYNCED); Status status; IOStatus io_s; - for (log::Writer* log : logs_to_sync) { - io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); - if (!io_s.ok()) { - status = io_s; - break; + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_options; + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!io_s.ok()) { + status = io_s; + } + if (io_s.ok()) { + for (log::Writer* log : logs_to_sync) { + io_s = + log->file()->SyncWithoutFlush(opts, immutable_db_options_.use_fsync); + if (!io_s.ok()) { + status = io_s; + break; + } } } if (!io_s.ok()) { @@ -1589,9 +1613,7 @@ Status DBImpl::SyncWAL() { } if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity - const ReadOptions read_options; - status = ApplyWALToManifest(read_options, &synced_wals); + status = ApplyWALToManifest(read_options, write_options, &synced_wals); } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1600,12 +1622,14 @@ Status DBImpl::SyncWAL() { } Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + const WriteOptions& write_options, VersionEdit* synced_wals) { // not empty, write to MANIFEST. 
mutex_.AssertHeld(); Status status = versions_->LogAndApplyToDefaultColumnFamily( - read_options, synced_wals, &mutex_, directories_.GetDbDir()); + read_options, write_options, synced_wals, &mutex_, + directories_.GetDbDir()); if (!status.ok() && versions_->io_status().IsIOError()) { status = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -3486,6 +3510,7 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys, } Status DBImpl::WrapUpCreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& cf_options) { // NOTE: this function is skipped for create_missing_column_families and // DB::Open, so new functionality here might need to go into Open also. @@ -3498,26 +3523,32 @@ Status DBImpl::WrapUpCreateColumnFamilies( } } // Attempt both follow-up actions even if one fails - Status s = WriteOptionsFile(false /*db_mutex_already_held*/); + Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/); if (register_worker) { - s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false)); + s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options, + /* is_new_db */ false)); } return s; } -Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, +Status DBImpl::CreateColumnFamily(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& column_family, ColumnFamilyHandle** handle) { assert(handle != nullptr); InstrumentedMutexLock ol(&options_mutex_); - Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); + Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options, + column_family, handle); if (s.ok()) { - s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); } return s; } Status DBImpl::CreateColumnFamilies( + 
const ReadOptions& read_options, const WriteOptions& write_options, const ColumnFamilyOptions& cf_options, const std::vector& column_family_names, std::vector* handles) { @@ -3529,7 +3560,8 @@ Status DBImpl::CreateColumnFamilies( bool success_once = false; for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; - s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle); + s = CreateColumnFamilyImpl(read_options, write_options, cf_options, + column_family_names[i], &handle); if (!s.ok()) { break; } @@ -3537,12 +3569,14 @@ Status DBImpl::CreateColumnFamilies( success_once = true; } if (success_once) { - s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); } return s; } Status DBImpl::CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& column_families, std::vector* handles) { assert(handles != nullptr); @@ -3555,7 +3589,8 @@ Status DBImpl::CreateColumnFamilies( cf_opts.reserve(num_cf); for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; - s = CreateColumnFamilyImpl(column_families[i].options, + s = CreateColumnFamilyImpl(read_options, write_options, + column_families[i].options, column_families[i].name, &handle); if (!s.ok()) { break; @@ -3565,17 +3600,18 @@ Status DBImpl::CreateColumnFamilies( cf_opts.push_back(&column_families[i].options); } if (success_once) { - s.UpdateIfOk(WrapUpCreateColumnFamilies(cf_opts)); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, cf_opts)); } return s; } -Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, +Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { options_mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity - const 
ReadOptions read_options; Status s; *handle = nullptr; @@ -3619,7 +3655,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir(), false, &cf_options); write_thread_.ExitUnbatched(&w); } @@ -3668,7 +3704,8 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { InstrumentedMutexLock ol(&options_mutex_); Status s = DropColumnFamilyImpl(column_family); if (s.ok()) { - s = WriteOptionsFile(false /*db_mutex_already_held*/); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = WriteOptionsFile(WriteOptions(), false /*db_mutex_already_held*/); } return s; } @@ -3686,8 +3723,9 @@ Status DBImpl::DropColumnFamilies( success_once = true; } if (success_once) { + // TODO: plumb Env::IOActivity, Env::IOPriority Status persist_options_status = - WriteOptionsFile(false /*db_mutex_already_held*/); + WriteOptionsFile(WriteOptions(), false /*db_mutex_already_held*/); if (s.ok() && !persist_options_status.ok()) { s = persist_options_status; } @@ -3696,8 +3734,10 @@ Status DBImpl::DropColumnFamilies( } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { @@ -3721,7 +3761,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); write_thread_.ExitUnbatched(&w); } @@ 
-3748,7 +3788,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { if (cfd->ioptions()->preserve_internal_time_seconds > 0 || cfd->ioptions()->preclude_last_level_data_seconds > 0) { - s = RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false); + s = RegisterRecordSeqnoTimeWorker(read_options, write_options, + /* is_new_db */ false); } if (s.ok()) { @@ -3779,7 +3820,7 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, // falsify later if key-may-exist but can't fetch value *value_found = true; } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; @@ -4298,7 +4339,7 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = version->GetPropertiesOfAllTables(read_options, props); @@ -4322,7 +4363,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = version->GetPropertiesOfTablesInRange(read_options, range, n, props); @@ -4664,7 +4705,7 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; for (int i = 0; i < n; i++) { // Add timestamp if needed @@ -4728,8 +4769,10 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFile(std::string name) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + uint64_t number; 
FileType type; WalFileType log_type; @@ -4809,7 +4852,7 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, @@ -4832,8 +4875,10 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); @@ -4901,7 +4946,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, @@ -5315,7 +5360,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, return result; } -Status DBImpl::WriteOptionsFile(bool db_mutex_already_held) { +Status DBImpl::WriteOptionsFile(const WriteOptions& write_options, + bool db_mutex_already_held) { options_mutex_.AssertHeld(); if (db_mutex_already_held) { @@ -5349,8 +5395,8 @@ Status DBImpl::WriteOptionsFile(bool db_mutex_already_held) { std::string file_name = TempOptionsFileName(GetName(), versions_->NewFileNumber()); - Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name, - fs_.get()); + Status s = PersistRocksDBOptions(write_options, db_options, cf_names, cf_opts, + file_name, fs_.get()); if (s.ok()) { s = RenameTempFileToOptionsFile(file_name); @@ -5543,7 +5589,7 @@ 
Status DBImpl::GetLatestSequenceForKey( MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -5699,8 +5745,10 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector& args) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + if (args.empty()) { return Status::InvalidArgument("ingestion arg list is empty"); } @@ -5918,9 +5966,10 @@ Status DBImpl::IngestExternalFiles( } assert(0 == num_entries); } - status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, - read_options, edit_lists, &mutex_, - directories_.GetDbDir()); + status = versions_->LogAndApply( + cfds_to_commit, mutable_cf_options_list, read_options, write_options, + + edit_lists, &mutex_, directories_.GetDbDir()); // It is safe to update VersionSet last seqno here after LogAndApply since // LogAndApply persists last sequence number from VersionEdits, // which are from file's largest seqno and not from VersionSet. @@ -6022,8 +6071,10 @@ Status DBImpl::CreateColumnFamilyWithImport( ColumnFamilyHandle** handle) { assert(handle != nullptr); assert(*handle == nullptr); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + std::string cf_comparator_name = options.comparator->Name(); size_t total_file_num = 0; @@ -6039,7 +6090,8 @@ Status DBImpl::CreateColumnFamilyWithImport( } // Create column family. 
- auto status = CreateColumnFamily(options, column_family_name, handle); + auto status = CreateColumnFamily(read_options, write_options, options, + column_family_name, handle); if (!status.ok()) { return status; } @@ -6075,8 +6127,8 @@ Status DBImpl::CreateColumnFamilyWithImport( next_file_number = versions_->FetchAddFileNumber(total_file_num); auto cf_options = cfd->GetLatestMutableCFOptions(); status = - versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + versions_->LogAndApply(cfd, *cf_options, read_options, write_options, + &dummy_edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -6113,8 +6165,8 @@ Status DBImpl::CreateColumnFamilyWithImport( if (status.ok()) { auto cf_options = cfd->GetLatestMutableCFOptions(); status = versions_->LogAndApply(cfd, *cf_options, read_options, - import_job.edit(), &mutex_, - directories_.GetDbDir()); + write_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); } @@ -6198,6 +6250,7 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, empty_after_delete = true; } else { const Comparator* const ucmp = column_family->GetComparator(); + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; // Delete [smallest_user_key, clip_begin_key) if (ucmp->Compare(smallest_user_key, begin_key) < 0) { @@ -6518,8 +6571,10 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); assert(nullptr != next_file_number); @@ -6537,8 +6592,8 @@ Status 
DBImpl::ReserveFileNumbersBeforeIngestion( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. - s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cf_options, read_options, write_options, + &dummy_edit, &mutex_, directories_.GetDbDir()); if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 34a5f33989..8853033ae1 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -321,14 +321,41 @@ class DBImpl : public DB { virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, - ColumnFamilyHandle** handle) override; + ColumnFamilyHandle** handle) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamily(ReadOptions(), WriteOptions(), cf_options, + column_family, handle); + } + virtual Status CreateColumnFamily(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, + const std::string& column_family, + ColumnFamilyHandle** handle); virtual Status CreateColumnFamilies( const ColumnFamilyOptions& cf_options, const std::vector& column_family_names, - std::vector* handles) override; + std::vector* handles) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamilies(ReadOptions(), WriteOptions(), cf_options, + column_family_names, handles); + } + virtual Status CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, + const std::vector& column_family_names, + std::vector* handles); + virtual Status CreateColumnFamilies( const std::vector& column_families, - std::vector* handles) 
override; + std::vector* handles) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamilies(ReadOptions(), WriteOptions(), column_families, + handles); + } + virtual Status CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, + const std::vector& column_families, + std::vector* handles); virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; virtual Status DropColumnFamilies( const std::vector& column_families) override; @@ -440,7 +467,12 @@ class DBImpl : public DB { virtual Status Flush( const FlushOptions& options, const std::vector& column_families) override; - virtual Status FlushWAL(bool sync) override; + virtual Status FlushWAL(bool sync) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return FlushWAL(WriteOptions(), sync); + } + + virtual Status FlushWAL(const WriteOptions& write_options, bool sync); bool WALBufferIsEmpty(); virtual Status SyncWAL() override; virtual Status LockWAL() override; @@ -1406,7 +1438,8 @@ class DBImpl : public DB { // Persist options to options file. Must be holding options_mutex_. // Will lock DB mutex if !db_mutex_already_held. - Status WriteOptionsFile(bool db_mutex_already_held); + Status WriteOptionsFile(const WriteOptions& write_options, + bool db_mutex_already_held); Status CompactRangeInternal(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, @@ -1532,7 +1565,8 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } // Setup DB identity file, and write DB ID to manifest if necessary. - Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx); + Status SetupDBId(const WriteOptions& write_options, bool read_only, + RecoveryContext* recovery_ctx); // Assign db_id_ and write DB ID to manifest if necessary. 
void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx); @@ -1659,7 +1693,8 @@ class DBImpl : public DB { return w; } Status ClearWriter() { - Status s = writer->WriteBuffer(); + // TODO: plumb Env::IOActivity, Env::IOPriority + Status s = writer->WriteBuffer(WriteOptions()); delete writer; writer = nullptr; return s; @@ -1835,12 +1870,15 @@ class DBImpl : public DB { const Status CreateArchivalDirectory(); // Create a column family, without some of the follow-up work yet - Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, + Status CreateColumnFamilyImpl(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& cf_name, ColumnFamilyHandle** handle); // Follow-up work to user creating a column family or (families) Status WrapUpCreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& cf_options); Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); @@ -1872,7 +1910,8 @@ class DBImpl : public DB { void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals, + IOStatus SyncClosedLogs(const WriteOptions& write_options, + JobContext* job_context, VersionEdit* synced_wals, bool error_recovery_in_prog); // Flush the in-memory write buffer to storage. 
Switches to a new @@ -2058,12 +2097,10 @@ class DBImpl : public DB { WriteBatch* tmp_batch, WriteBatch** merged_batch, size_t* write_with_wal, WriteBatch** to_be_cached_state); - // rate_limiter_priority is used to charge `DBOptions::rate_limiter` - // for automatic WAL flush (`Options::manual_wal_flush` == false) - // associated with this WriteToWAL - IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, - uint64_t* log_used, uint64_t* log_size, - Env::IOPriority rate_limiter_priority, + IOStatus WriteToWAL(const WriteBatch& merged_batch, + const WriteOptions& write_options, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size, LogFileNumberSize& log_file_number_size); IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group, @@ -2175,7 +2212,9 @@ class DBImpl : public DB { // Cancel scheduled periodic tasks Status CancelPeriodicTaskScheduler(); - Status RegisterRecordSeqnoTimeWorker(bool is_new_db); + Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, + const WriteOptions& write_options, + bool is_new_db); void PrintStatistics(); @@ -2203,7 +2242,9 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); - Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, + const WriteOptions& write_options, + VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. 
void MarkLogsNotSynced(uint64_t up_to); @@ -2275,8 +2316,9 @@ class DBImpl : public DB { size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); + IOStatus CreateWAL(const WriteOptions& write_options, uint64_t log_file_num, + uint64_t recycle_log_number, size_t preallocate_block_size, + log::Writer** new_log); // Validate self-consistency of DB options static Status ValidateOptions(const DBOptions& db_options); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 71c23de95a..a47f095b94 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -19,6 +19,10 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" @@ -112,7 +116,8 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT( return true; } -IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, +IOStatus DBImpl::SyncClosedLogs(const WriteOptions& write_options, + JobContext* job_context, VersionEdit* synced_wals, bool error_recovery_in_prog) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); @@ -143,7 +148,13 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, if (error_recovery_in_prog) { log->file()->reset_seen_error(); } - io_s = log->file()->Sync(immutable_db_options_.use_fsync); + + IOOptions io_options; + io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (!io_s.ok()) { + break; + } + io_s = log->file()->Sync(io_options, immutable_db_options_.use_fsync); if (!io_s.ok()) { break; } @@ 
-152,16 +163,21 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, if (error_recovery_in_prog) { log->file()->reset_seen_error(); } - io_s = log->Close(); + // TODO: plumb Env::IOActivity, Env::IOPriority + io_s = log->Close(WriteOptions()); if (!io_s.ok()) { break; } } } if (io_s.ok()) { - io_s = directories_.GetWalDir()->FsyncWithDirOptions( - IOOptions(), nullptr, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + IOOptions io_options; + io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (io_s.ok()) { + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + io_options, nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } } TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", @@ -199,6 +215,8 @@ Status DBImpl::FlushMemTableToOutputFile( assert(cfd->imm()->IsFlushPending()); assert(versions_); assert(versions_->GetColumnFamilySet()); + const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); // If there are more than one column families, we need to make sure that // all the log files except the most recent one are synced. 
Otherwise if // the host crashes after flushing and before WAL is persistent, the @@ -265,13 +283,12 @@ Status DBImpl::FlushMemTableToOutputFile( VersionEdit synced_wals; bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = - SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); + log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals, + error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - const ReadOptions read_options(Env::IOActivity::kFlush); - log_io_s = - status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); + log_io_s = status_to_io_status( + ApplyWALToManifest(read_options, write_options, &synced_wals)); TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", nullptr); } @@ -465,6 +482,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( const autovector& bg_flush_args, bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) { mutex_.AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); autovector cfds; for (const auto& arg : bg_flush_args) { @@ -552,13 +571,12 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( VersionEdit synced_wals; bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = - SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); + log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals, + error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - const ReadOptions read_options(Env::IOActivity::kFlush); - log_io_s = - status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); + log_io_s = status_to_io_status( + ApplyWALToManifest(read_options, write_options, &synced_wals)); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && @@ -653,9 +671,14 @@ Status 
DBImpl::AtomicFlushMemTablesToOutputFiles( // Sync on all distinct output directories. for (auto dir : distinct_output_dirs) { if (dir != nullptr) { - Status error_status = dir->FsyncWithDirOptions( - IOOptions(), nullptr, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + IOOptions io_options; + Status error_status = + WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (error_status.ok()) { + error_status = dir->FsyncWithDirOptions( + io_options, nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } if (!error_status.ok()) { s = error_status; break; @@ -1049,8 +1072,10 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, edit.SetColumnFamily(cfd->GetID()); edit.SetFullHistoryTsLow(ts_low); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", &edit); @@ -1064,7 +1089,7 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, } Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { return s; @@ -1754,6 +1779,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { } const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); SuperVersionContext sv_context(/* create_superversion */ true); @@ -1870,9 +1896,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - Status status = - versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, - &mutex_, directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, + read_options, 
write_options, &edit, + &mutex_, directories_.GetDbDir()); cfd->compaction_picker()->UnregisterCompaction(c.get()); c.reset(); @@ -3480,6 +3506,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); bool is_manual = (manual_compaction != nullptr); std::unique_ptr c; @@ -3692,7 +3719,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } status = versions_->LogAndApply( c->column_family_data(), *c->mutable_cf_options(), read_options, - c->edit(), &mutex_, directories_.GetDbDir(), + write_options, c->edit(), &mutex_, directories_.GetDbDir(), /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, [&c, &compaction_released](const Status& s) { c->ReleaseCompactionFiles(s); @@ -3766,7 +3793,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } status = versions_->LogAndApply( c->column_family_data(), *c->mutable_cf_options(), read_options, - c->edit(), &mutex_, directories_.GetDbDir(), + write_options, c->edit(), &mutex_, directories_.GetDbDir(), /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, [&c, &compaction_released](const Status& s) { c->ReleaseCompactionFiles(s); diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 442cb47679..c90df262e8 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -61,8 +61,10 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { "PromoteL0 FAILED. 
Invalid target level %d\n", target_level); return Status::InvalidArgument("Invalid target level"); } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status status; VersionEdit edit; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -143,7 +145,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index bd48796474..3519ecec1d 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -18,6 +18,7 @@ #include "file/sst_file_manager_impl.h" #include "logging/logging.h" #include "port/port.h" +#include "rocksdb/options.h" #include "util/autovector.h" #include "util/defer.h" @@ -510,7 +511,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // Close WALs before trying to delete them. for (const auto w : state.logs_to_free) { // TODO: maybe check the return value of Close. 
- auto s = w->Close(); + // TODO: plumb Env::IOActivity, Env::IOPriority + auto s = w->Close(WriteOptions()); s.PermitUncheckedError(); } @@ -925,7 +927,8 @@ void DBImpl::SetDBId(std::string&& id, bool read_only, } } -Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) { +Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, + RecoveryContext* recovery_ctx) { Status s; // Check for the IDENTITY file and create it if not there or // broken or not matching manifest @@ -958,7 +961,7 @@ Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) { } // Persist it to IDENTITY file if allowed if (!read_only) { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetIdentityFile(write_options, env_, dbname_, db_id_); } return s; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 074fa86214..4389118621 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -21,6 +21,7 @@ #include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "rocksdb/wal_filter.h" #include "test_util/sync_point.h" @@ -309,7 +310,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; - Status s = SetIdentityFile(env_, dbname_); + const WriteOptions write_options(Env::IOActivity::kDBOpen); + Status s = SetIdentityFile(write_options, env_, dbname_); if (!s.ok()) { return s; } @@ -339,20 +341,23 @@ Status DBImpl::NewDB(std::vector* new_filenames) { immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( std::move(file), manifest, file_options, immutable_db_options_.clock, - io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, - nullptr, tmp_set.Contains(FileType::kDescriptorFile), + io_tracer_, nullptr /* stats */, + 
Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + immutable_db_options_.listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile), tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(write_options, record); if (s.ok()) { - s = SyncManifest(&immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, write_options, log.file()); } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir()); + s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, + directories_.GetDbDir()); if (new_filenames) { new_filenames->emplace_back( manifest.substr(manifest.find_last_of("/\\") + 1)); @@ -418,6 +423,7 @@ Status DBImpl::Recover( uint64_t* recovered_seq, RecoveryContext* recovery_ctx) { mutex_.AssertHeld(); + const WriteOptions write_options(Env::IOActivity::kDBOpen); bool tmp_is_new_db = false; bool& is_new_db = recovery_ctx ? 
recovery_ctx->is_new_db_ : tmp_is_new_db; assert(db_lock_ == nullptr); @@ -642,7 +648,7 @@ Status DBImpl::Recover( } } } - s = SetupDBId(read_only, recovery_ctx); + s = SetupDBId(write_options, read_only, recovery_ctx); ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); if (s.ok() && !read_only) { s = DeleteUnreferencedSstFiles(recovery_ctx); @@ -872,8 +878,9 @@ Status DBImpl::PersistentStatsProcessFormatVersion() { if (s.ok()) { ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, - &handle); + s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), + WriteOptions(Env::IOActivity::kDBOpen), cfo, + kPersistentStatsColumnFamilyName, &handle); } if (s.ok()) { persist_stats_cf_handle_ = static_cast(handle); @@ -895,6 +902,7 @@ Status DBImpl::PersistentStatsProcessFormatVersion() { std::to_string(kStatsCFCompatibleFormatVersion)); } if (s.ok()) { + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; wo.low_pri = true; wo.no_slowdown = true; @@ -926,7 +934,9 @@ Status DBImpl::InitPersistStatsColumnFamily() { ColumnFamilyHandle* handle = nullptr; ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, &handle); + s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), + WriteOptions(Env::IOActivity::kDBOpen), cfo, + kPersistentStatsColumnFamilyName, &handle); persist_stats_cf_handle_ = static_cast(handle); mutex_.Lock(); } @@ -937,9 +947,12 @@ Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { mutex_.AssertHeld(); assert(versions_->descriptor_log_ == nullptr); const ReadOptions read_options(Env::IOActivity::kDBOpen); - Status s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, - recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); + const WriteOptions write_options(Env::IOActivity::kDBOpen); + 
+ Status s = versions_->LogAndApply(recovery_ctx.cfds_, + recovery_ctx.mutable_cf_opts_, read_options, + write_options, recovery_ctx.edit_lists_, + &mutex_, directories_.GetDbDir()); if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { mutex_.Unlock(); for (const auto& stale_sst_file : recovery_ctx.files_to_delete_) { @@ -1665,9 +1678,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } IOStatus io_s; + const ReadOptions read_option(Env::IOActivity::kDBOpen); + const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen); TableBuilderOptions tboptions( - *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), + *cfd->ioptions(), mutable_cf_options, read_option, write_option, + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), 0 /* level */, false /* is_bottommost */, @@ -1677,16 +1692,15 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, SeqnoToTimeMapping empty_seqno_to_time_mapping; Version* version = cfd->current(); version->Ref(); - const ReadOptions read_option(Env::IOActivity::kDBOpen); uint64_t num_input_entries = 0; s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, read_option, cfd->table_cache(), - iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kRecovery, - empty_seqno_to_time_mapping, &event_logger_, job_id, Env::IO_HIGH, + empty_seqno_to_time_mapping, &event_logger_, job_id, nullptr /* table_properties */, 
write_hint, nullptr /*full_history_ts_low*/, &blob_callback_, version, &num_input_entries); @@ -1888,7 +1902,8 @@ Status DB::OpenAndTrimHistory( return s; } -IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, +IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, + uint64_t log_file_num, uint64_t recycle_log_number, size_t preallocate_block_size, log::Writer** new_log) { IOStatus io_s; @@ -1922,14 +1937,15 @@ IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( std::move(lfile), log_fname, opt_file_options, - immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, - nullptr, tmp_set.Contains(FileType::kWalFile), + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, listeners, nullptr, + tmp_set.Contains(FileType::kWalFile), tmp_set.Contains(FileType::kWalFile))); *new_log = new log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush, immutable_db_options_.wal_compression); - io_s = (*new_log)->AddCompressionTypeRecord(); + io_s = (*new_log)->AddCompressionTypeRecord(write_options); } return io_s; } @@ -1938,6 +1954,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn) { + const WriteOptions write_options(Env::IOActivity::kDBOpen); + const ReadOptions read_options(Env::IOActivity::kDBOpen); + Status s = ValidateOptionsByTable(db_options, column_families); if (!s.ok()) { return s; @@ -2014,7 +2033,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, log::Writer* new_log = nullptr; const size_t preallocate_block_size = 
impl->GetWalPreallocateBlockSize(max_write_buffer_size); - s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/, + s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/, preallocate_block_size, &new_log); if (s.ok()) { InstrumentedMutexLock wl(&impl->log_write_mutex_); @@ -2039,21 +2058,25 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (recovered_seq != kMaxSequenceNumber) { WriteBatch empty_batch; WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); - WriteOptions write_options; uint64_t log_used, log_size; log::Writer* log_writer = impl->logs_.back().writer; LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back(); assert(log_writer->get_log_number() == log_file_number_size.number); impl->mutex_.AssertHeld(); - s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size, - Env::IO_TOTAL, log_file_number_size); + s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used, + &log_size, log_file_number_size); if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. - s = impl->FlushWAL(false); + s = impl->FlushWAL(write_options, false); TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s); + IOOptions opts; if (s.ok()) { - s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + } + if (s.ok()) { + s = log_writer->file()->Sync(opts, + impl->immutable_db_options_.use_fsync); } } } @@ -2084,7 +2107,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->mutex_.Unlock(); // NOTE: the work normally done in WrapUpCreateColumnFamilies will // be done separately below. 
- s = impl->CreateColumnFamilyImpl(cf.options, cf.name, &handle); + s = impl->CreateColumnFamilyImpl(read_options, write_options, + cf.options, cf.name, &handle); impl->mutex_.Lock(); if (s.ok()) { handles->push_back(handle); @@ -2136,7 +2160,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // Persist RocksDB Options before scheduling the compaction. // The WriteOptionsFile() will release and lock the mutex internally. persist_options_status = - impl->WriteOptionsFile(true /*db_mutex_already_held*/); + impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/); *dbptr = impl; impl->opened_successfully_ = true; impl->DeleteObsoleteFiles(); @@ -2236,12 +2260,17 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl); LogFlush(impl->immutable_db_options_.info_log); if (!impl->WALBufferIsEmpty()) { - s = impl->FlushWAL(false); + s = impl->FlushWAL(write_options, false); if (s.ok()) { // Sync is needed otherwise WAL buffered data might get lost after a // power reset. 
log::Writer* log_writer = impl->logs_.back().writer; - s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = log_writer->file()->Sync(opts, + impl->immutable_db_options_.use_fsync); + } } } if (s.ok() && !persist_options_status.ok()) { @@ -2258,7 +2287,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->StartPeriodicTaskScheduler(); } if (s.ok()) { - s = impl->RegisterRecordSeqnoTimeWorker(recovery_ctx.is_new_db_); + s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options, + recovery_ctx.is_new_db_); } impl->options_mutex_.Unlock(); if (!s.ok()) { diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 34a7de122d..c29240e088 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -620,9 +620,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, log_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; - status = ApplyWALToManifest(read_options, &synced_wals); + status = ApplyWALToManifest(read_options, write_options, &synced_wals); } // Requesting sync with two_write_queues_ is expected to be very rare. 
We @@ -783,9 +783,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; - w.status = ApplyWALToManifest(read_options, &synced_wals); + w.status = ApplyWALToManifest(read_options, write_options, &synced_wals); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -1318,9 +1318,9 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, // When two_write_queues_ is disabled, this function is called from the only // write thread. Otherwise this must be called holding log_write_mutex_. IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, + const WriteOptions& write_options, log::Writer* log_writer, uint64_t* log_used, uint64_t* log_size, - Env::IOPriority rate_limiter_priority, LogFileNumberSize& log_file_number_size) { assert(log_size != nullptr); @@ -1343,12 +1343,11 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, log_write_mutex_.Lock(); } IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord( - versions_->GetColumnFamiliesTimestampSizeForRecord(), - rate_limiter_priority); + write_options, versions_->GetColumnFamiliesTimestampSizeForRecord()); if (!io_s.ok()) { return io_s; } - io_s = log_writer->AddRecord(log_entry, rate_limiter_priority); + io_s = log_writer->AddRecord(write_options, log_entry); if (UNLIKELY(needs_locking)) { log_write_mutex_.Unlock(); @@ -1391,9 +1390,13 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, WriteBatchInternal::SetSequence(merged_batch, sequence); uint64_t log_size; - io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, - write_group.leader->rate_limiter_priority, - log_file_number_size); + + // TODO: plumb Env::IOActivity, Env::IOPriority + WriteOptions write_options; + write_options.rate_limiter_priority = + 
write_group.leader->rate_limiter_priority; + io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, + &log_size, log_file_number_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; @@ -1420,10 +1423,17 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, log_write_mutex_.Lock(); } - for (auto& log : logs_) { - io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync); - if (!io_s.ok()) { - break; + if (io_s.ok()) { + for (auto& log : logs_) { + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!io_s.ok()) { + break; + } + io_s = log.writer->file()->Sync(opts, immutable_db_options_.use_fsync); + if (!io_s.ok()) { + break; + } } } @@ -1496,9 +1506,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL( assert(log_writer->get_log_number() == log_file_number_size.number); uint64_t log_size; - io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, - write_group.leader->rate_limiter_priority, - log_file_number_size); + + // TODO: plumb Env::IOActivity, Env::IOPriority + WriteOptions write_options; + write_options.rate_limiter_priority = + write_group.leader->rate_limiter_priority; + io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, + &log_size, log_file_number_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; @@ -2117,8 +2131,10 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, // two_write_queues_ is true (This is to simplify the reasoning.) 
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; IOStatus io_s; @@ -2165,8 +2181,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { if (creating_new_log) { // TODO: Write buffer size passed in should be max of all CF's instead // of mutable_cf_options.write_buffer_size. - io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, - &new_log); + io_s = CreateWAL(write_options, new_log_number, recycle_log_number, + preallocate_block_size, &new_log); if (s.ok()) { s = io_s; } @@ -2203,7 +2219,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // In recovery path, we force another try of writing WAL buffer. cur_log_writer->file()->reset_seen_error(); } - io_s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(write_options); if (s.ok()) { s = io_s; } @@ -2271,7 +2287,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { VersionEdit wal_deletion; wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); s = versions_->LogAndApplyToDefaultColumnFamily( - read_options, &wal_deletion, &mutex_, directories_.GetDbDir()); + read_options, write_options, &wal_deletion, &mutex_, + directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); diff --git a/db/db_iter.cc b/db/db_iter.cc index 507bb2577b..4687031f95 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -201,6 +201,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to // avoid having to copy options back and forth. 
+ // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; read_options.read_tier = read_tier_; read_options.fill_cache = fill_cache_; diff --git a/db/db_iter.h b/db/db_iter.h index 5022405c32..9a1649c342 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -126,6 +126,10 @@ class DBIter final : public Iterator { void operator=(const DBIter&) = delete; ~DBIter() override { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_UNKNOWN); // Release pinned data if any if (pinned_iters_mgr_.PinningEnabled()) { pinned_iters_mgr_.ReleasePinnedData(); @@ -134,6 +138,7 @@ class DBIter final : public Iterator { ResetInternalKeysSkippedCounter(); local_stats_.BumpGlobalStatistics(statistics_); iter_.DeleteIter(arena_mode_); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void SetIter(InternalIterator* iter) { assert(iter_.iter() == nullptr); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 7590aa2f11..8432831fe7 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -957,15 +957,18 @@ TEST_F(DBSSTTest, OpenDBWithExistingTrashAndObsoleteSstFile) { // Add some trash files to the db directory so the DB can clean them up ASSERT_OK(env_->CreateDirIfMissing(dbname_)); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash", false)); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash", false)); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash", false)); // Manually add an obsolete sst file. Obsolete SST files are discovered and // deleted upon recovery. 
constexpr uint64_t kSstFileNumber = 100; const std::string kObsoleteSstFile = MakeTableFileName(dbname_, kSstFileNumber); - ASSERT_OK(WriteStringToFile(env_, "abc", kObsoleteSstFile)); + ASSERT_OK(WriteStringToFile(env_, "abc", kObsoleteSstFile, false)); // Reopen the DB and verify that it deletes existing trash files and obsolete // SST files with rate limiting. diff --git a/db/db_test2.cc b/db/db_test2.cc index e471685b21..c7cc88a46e 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5691,7 +5691,7 @@ TEST_F(DBTest2, CrashInRecoveryMultipleCF) { ASSERT_OK(ReadFileToString(env_, fname, &file_content)); file_content[400] = 'h'; file_content[401] = 'a'; - ASSERT_OK(WriteStringToFile(env_, file_content, fname)); + ASSERT_OK(WriteStringToFile(env_, file_content, fname, false)); break; } } diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 88c6d1aacf..ecb1858083 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1561,7 +1561,7 @@ class RecoveryTestHelper { new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0, false, db_options.wal_compression); - ASSERT_OK(log_writer->AddCompressionTypeRecord()); + ASSERT_OK(log_writer->AddCompressionTypeRecord(WriteOptions())); current_log_writer.reset(log_writer); WriteBatch batch; @@ -1574,7 +1574,7 @@ class RecoveryTestHelper { ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); ASSERT_OK(current_log_writer->AddRecord( - WriteBatchInternal::Contents(&batch))); + WriteOptions(), WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); diff --git a/db/experimental.cc b/db/experimental.cc index f6f920b2cc..44816e7107 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -38,8 +38,9 @@ Status UpdateManifestForFilesState( const DBOptions& db_opts, const std::string& db_name, const std::vector& column_families, const 
UpdateManifestForFilesStateOptions& opts) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; OfflineManifestWriter w(db_opts, db_name); Status s = w.Recover(column_families); @@ -117,7 +118,8 @@ Status UpdateManifestForFilesState( std::unique_ptr db_dir; s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, write_options, cfd, &edit, + db_dir.get()); } if (s.ok()) { ++cfs_updated; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index a4a1947145..e17c4097a0 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -710,7 +710,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // If customized readahead size is needed, we can pass a user option // all the way to here. Right now we just rely on the default readahead // to keep things simple. 
- // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; status = table_reader->VerifyChecksum( @@ -764,7 +764,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->num_range_deletions = props->num_range_deletions; ParsedInternalKey key; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -902,7 +902,7 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( bool overlap_with_db = false; Arena arena; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; int target_level = 0; diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index d888dfde10..17b4c03428 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -572,7 +572,7 @@ TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { edit.SetColumnFamily(0); std::string buf; assert(edit.EncodeTo(&buf)); - const Status s = log_writer->AddRecord(buf); + const Status s = log_writer->AddRecord(WriteOptions(), buf); ASSERT_NOK(s); } diff --git a/db/flush_job.cc b/db/flush_job.cc index a3e168823a..ff6cf36ef1 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -409,7 +409,7 @@ Status FlushJob::MemPurge() { // Create two iterators, one for the memtable data (contains // info from puts + deletes), and one for the memtable // Range Tombstones (from DeleteRanges). - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -701,8 +701,8 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Cochran formula for determining sample size. // 95% confidence interval, 7% precision. 
// n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 - // TODO: plumb Env::IOActivity double n0 = 196.0; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; @@ -961,29 +961,30 @@ Status FlushJob::WriteLevel0Table() { const std::string* const full_history_ts_low = (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_; + const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(io_priority, Env::IOActivity::kFlush); TableBuilderOptions tboptions( - *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), output_compression_, - mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), - 0 /* level */, false /* is_bottommost */, - TableFileCreationReason::kFlush, oldest_key_time, current_time, - db_id_, db_session_id_, 0 /* target_file_size */, - meta_.fd.GetNumber()); + *cfd_->ioptions(), mutable_cf_options_, read_options, write_options, + cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(), + output_compression_, mutable_cf_options_.compression_opts, + cfd_->GetID(), cfd_->GetName(), 0 /* level */, + false /* is_bottommost */, TableFileCreationReason::kFlush, + oldest_key_time, current_time, db_id_, db_session_id_, + 0 /* target_file_size */, meta_.fd.GetNumber()); const SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); - const ReadOptions read_options(Env::IOActivity::kFlush); - s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, - read_options, cfd_->table_cache(), iter.get(), - std::move(range_del_iters), &meta_, &blob_file_additions, - existing_snapshots_, earliest_write_conflict_snapshot_, - job_snapshot_seq, snapshot_checker_, - mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_, - event_logger_, job_context_->job_id, io_priority, - &table_properties_, 
write_hint, full_history_ts_low, - blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + + s = BuildTable( + dbname_, versions_, db_options_, tboptions, file_options_, + cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, + &blob_file_additions, existing_snapshots_, + earliest_write_conflict_snapshot_, job_snapshot_seq, + snapshot_checker_, mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, + job_context_->job_id, &table_properties_, write_hint, + full_history_ts_low, blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); @@ -1177,8 +1178,9 @@ Status FlushJob::MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT() { VersionEdit edit; edit.SetColumnFamily(cfd_->GetID()); edit.SetFullHistoryTsLow(new_full_history_ts_low); + // TODO: plumb Env::IOActivity, Env::IOPriority return versions_->LogAndApply(cfd_, *cfd_->GetLatestMutableCFOptions(), - ReadOptions(), &edit, db_mutex_, + ReadOptions(), WriteOptions(), &edit, db_mutex_, output_file_directory_); } diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 95cde2d4df..1838a93891 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -55,7 +55,7 @@ class FlushJobTestBase : public testing::Test { } void NewDB() { - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -89,19 +89,19 @@ class FlushJobTestBase : public testing::Test { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); ASSERT_OK(s); for (const auto& e : new_cfs) { 
record.clear(); e.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); ASSERT_OK(s); } } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index f7b8a50aef..fcd38f24b2 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -355,7 +355,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // in file_meta. if (file_meta.smallest.empty()) { assert(file_meta.largest.empty()); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 6ef4b43023..d4cf19dcfb 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1155,7 +1155,7 @@ bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { bool InternalStats::HandleAggregatedTableProperties(std::string* value, Slice /*suffix*/) { std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { @@ -1177,7 +1177,7 @@ static std::map MapUint64ValuesToString( bool InternalStats::HandleAggregatedTablePropertiesMap( std::map* values, Slice /*suffix*/) { std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { @@ -1195,7 +1195,7 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, return false; } std::shared_ptr tp; - // TODO: 
plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( read_options, &tp, static_cast(level)); @@ -1214,7 +1214,7 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( return false; } std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( read_options, &tp, static_cast(level)); @@ -1418,7 +1418,7 @@ bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* /*db*/, Version* version) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; *value = (version == nullptr) ? 0 @@ -1473,7 +1473,7 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, ->compaction_options_fifo.allow_compaction) { return false; } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; TablePropertiesCollection collection; auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); diff --git a/db/log_test.cc b/db/log_test.cc index 0bf3bf5aec..bd5aaf6d6b 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -185,9 +185,10 @@ class LogTest void Write(const std::string& msg, const UnorderedMap* cf_to_ts_sz = nullptr) { if (cf_to_ts_sz != nullptr && !cf_to_ts_sz->empty()) { - ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(*cf_to_ts_sz)); + ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(WriteOptions(), + *cf_to_ts_sz)); } - ASSERT_OK(writer_->AddRecord(Slice(msg))); + ASSERT_OK(writer_->AddRecord(WriteOptions(), Slice(msg))); } size_t WrittenBytes() const { return dest_contents().size(); } @@ -732,8 +733,8 @@ TEST_P(LogTest, Recycle) { std::unique_ptr dest_holder(new 
WritableFileWriter( std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); - ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -764,9 +765,10 @@ TEST_P(LogTest, RecycleWithTimestampSize) { UnorderedMap ts_sz_two = { {2, sizeof(uint64_t)}, }; - ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord(ts_sz_two)); - ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); - ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord( + WriteOptions(), ts_sz_two)); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); CheckRecordAndTimestampSize("foooo", ts_sz_two); CheckRecordAndTimestampSize("bar", ts_sz_two); @@ -853,12 +855,12 @@ class RetriableLogTest : public ::testing::TestWithParam { std::string contents() { return sink_->contents_; } void Encode(const std::string& msg) { - ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), Slice(msg))); } void Write(const Slice& data) { - ASSERT_OK(writer_->Append(data)); - ASSERT_OK(writer_->Sync(true)); + ASSERT_OK(writer_->Append(IOOptions(), data)); + ASSERT_OK(writer_->Sync(IOOptions(), true)); } bool TryRead(std::string* result) { @@ -991,7 +993,9 @@ INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); class CompressionLogTest : public LogTest { public: - Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); } + Status SetupTestEnv() { + return 
writer_->AddCompressionTypeRecord(WriteOptions()); + } }; TEST_P(CompressionLogTest, Empty) { @@ -1109,7 +1113,7 @@ TEST_P(CompressionLogTest, AlignedFragmentation) { // beginning of the block. while ((WrittenBytes() & (kBlockSize - 1)) >= kHeaderSize) { char entry = 'a'; - ASSERT_OK(writer_->AddRecord(Slice(&entry, 1))); + ASSERT_OK(writer_->AddRecord(WriteOptions(), Slice(&entry, 1))); num_filler_records++; } const std::vector wal_entries = { diff --git a/db/log_writer.cc b/db/log_writer.cc index 5fc46b33f2..8e0f7a4a9d 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -38,32 +38,43 @@ Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, } Writer::~Writer() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); if (dest_) { - WriteBuffer().PermitUncheckedError(); + WriteBuffer(WriteOptions()).PermitUncheckedError(); } if (compress_) { delete compress_; } + ThreadStatusUtil::SetThreadOperation(cur_op_type); } -IOStatus Writer::WriteBuffer() { +IOStatus Writer::WriteBuffer(const WriteOptions& write_options) { if (dest_->seen_error()) { return IOStatus::IOError("Seen error. 
Skip writing buffer."); } - return dest_->Flush(); + IOOptions opts; + IOStatus s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + return dest_->Flush(opts); } -IOStatus Writer::Close() { +IOStatus Writer::Close(const WriteOptions& write_options) { IOStatus s; - if (dest_) { - s = dest_->Close(); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok() && dest_) { + s = dest_->Close(opts); dest_.reset(); } return s; } -IOStatus Writer::AddRecord(const Slice& slice, - Env::IOPriority rate_limiter_priority) { +IOStatus Writer::AddRecord(const WriteOptions& write_options, + const Slice& slice) { const char* ptr = slice.data(); size_t left = slice.size(); @@ -83,83 +94,87 @@ IOStatus Writer::AddRecord(const Slice& slice, } IOStatus s; - do { - const int64_t leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < header_size) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize and - // kRecyclableHeaderSize being <= 11) - assert(header_size <= 11); - s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - static_cast(leftover)), - 0 /* crc32c_checksum */, rate_limiter_priority); - if (!s.ok()) { - break; + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + do { + const int64_t leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < header_size) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + s = dest_->Append(opts, + Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover)), + 0 /* crc32c_checksum */); + if (!s.ok()) { + break; + } } + block_offset_ = 0; } - block_offset_ = 0; - } - // Invariant: we never leave < header_size bytes in a block. 
- assert(static_cast(kBlockSize - block_offset_) >= header_size); + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); - const size_t avail = kBlockSize - block_offset_ - header_size; + const size_t avail = kBlockSize - block_offset_ - header_size; - // Compress the record if compression is enabled. - // Compress() is called at least once (compress_start=true) and after the - // previous generated compressed chunk is written out as one or more - // physical records (left=0). - if (compress_ && (compress_start || left == 0)) { - compress_remaining = compress_->Compress(slice.data(), slice.size(), - compressed_buffer_.get(), &left); + // Compress the record if compression is enabled. + // Compress() is called at least once (compress_start=true) and after the + // previous generated compressed chunk is written out as one or more + // physical records (left=0). + if (compress_ && (compress_start || left == 0)) { + compress_remaining = compress_->Compress( + slice.data(), slice.size(), compressed_buffer_.get(), &left); - if (compress_remaining < 0) { - // Set failure status - s = IOStatus::IOError("Unexpected WAL compression error"); - s.SetDataLoss(true); - break; - } else if (left == 0) { - // Nothing left to compress - if (!compress_start) { + if (compress_remaining < 0) { + // Set failure status + s = IOStatus::IOError("Unexpected WAL compression error"); + s.SetDataLoss(true); break; + } else if (left == 0) { + // Nothing left to compress + if (!compress_start) { + break; + } } + compress_start = false; + ptr = compressed_buffer_.get(); } - compress_start = false; - ptr = compressed_buffer_.get(); - } - const size_t fragment_length = (left < avail) ? left : avail; + const size_t fragment_length = (left < avail) ? left : avail; - RecordType type; - const bool end = (left == fragment_length && compress_remaining == 0); - if (begin && end) { - type = recycle_log_files_ ? 
kRecyclableFullType : kFullType; - } else if (begin) { - type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; - } else if (end) { - type = recycle_log_files_ ? kRecyclableLastType : kLastType; - } else { - type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority); - ptr += fragment_length; - left -= fragment_length; - begin = false; - } while (s.ok() && (left > 0 || compress_remaining > 0)); + RecordType type; + const bool end = (left == fragment_length && compress_remaining == 0); + if (begin && end) { + type = recycle_log_files_ ? kRecyclableFullType : kFullType; + } else if (begin) { + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; + } else if (end) { + type = recycle_log_files_ ? kRecyclableLastType : kLastType; + } else { + type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; + } + s = EmitPhysicalRecord(write_options, type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && (left > 0 || compress_remaining > 0)); + } if (s.ok()) { if (!manual_flush_) { - s = dest_->Flush(rate_limiter_priority); + s = dest_->Flush(opts); } } return s; } -IOStatus Writer::AddCompressionTypeRecord() { +IOStatus Writer::AddCompressionTypeRecord(const WriteOptions& write_options) { // Should be the first record assert(block_offset_ == 0); @@ -171,11 +186,15 @@ IOStatus Writer::AddCompressionTypeRecord() { CompressionTypeRecord record(compression_type_); std::string encode; record.EncodeTo(&encode); - IOStatus s = - EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size()); + IOStatus s = EmitPhysicalRecord(write_options, kSetCompressionType, + encode.data(), encode.size()); if (s.ok()) { if (!manual_flush_) { - s = dest_->Flush(); + IOOptions io_opts; + s = WritableFileWriter::PrepareIOOptions(write_options, io_opts); + if (s.ok()) { + s = dest_->Flush(io_opts); + } } // Initialize 
fields required for compression const size_t max_output_buffer_len = @@ -197,8 +216,8 @@ IOStatus Writer::AddCompressionTypeRecord() { } IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord( - const UnorderedMap& cf_to_ts_sz, - Env::IOPriority rate_limiter_priority) { + const WriteOptions& write_options, + const UnorderedMap& cf_to_ts_sz) { std::vector> ts_sz_to_record; for (const auto& [cf_id, ts_sz] : cf_to_ts_sz) { if (recorded_cf_to_ts_sz_.count(cf_id) != 0) { @@ -219,14 +238,14 @@ IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord( record.EncodeTo(&encoded); RecordType type = recycle_log_files_ ? kRecyclableUserDefinedTimestampSizeType : kUserDefinedTimestampSizeType; - return EmitPhysicalRecord(type, encoded.data(), encoded.size(), - rate_limiter_priority); + return EmitPhysicalRecord(write_options, type, encoded.data(), + encoded.size()); } bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); } -IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, - Env::IOPriority rate_limiter_priority) { +IOStatus Writer::EmitPhysicalRecord(const WriteOptions& write_options, + RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes size_t header_size; @@ -266,10 +285,13 @@ IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, EncodeFixed32(buf, crc); // Write the header and the payload - IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */, - rate_limiter_priority); + IOOptions opts; + IOStatus s = WritableFileWriter::PrepareIOOptions(write_options, opts); if (s.ok()) { - s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority); + s = dest_->Append(opts, Slice(buf, header_size), 0 /* crc32c_checksum */); + } + if (s.ok()) { + s = dest_->Append(opts, Slice(ptr, n), payload_crc); } block_offset_ += header_size + n; return s; diff --git a/db/log_writer.h b/db/log_writer.h index 7a64a85601..1bbf72569e 100644 --- a/db/log_writer.h +++ 
b/db/log_writer.h @@ -86,9 +86,8 @@ class Writer { ~Writer(); - IOStatus AddRecord(const Slice& slice, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); - IOStatus AddCompressionTypeRecord(); + IOStatus AddRecord(const WriteOptions& write_options, const Slice& slice); + IOStatus AddCompressionTypeRecord(const WriteOptions& write_options); // If there are column families in `cf_to_ts_sz` not included in // `recorded_cf_to_ts_sz_` and its user-defined timestamp size is non-zero, @@ -96,17 +95,17 @@ class Writer { // kRecyclableUserDefinedTimestampSizeType for these column families. // This timestamp size record applies to all subsequent records. IOStatus MaybeAddUserDefinedTimestampSizeRecord( - const UnorderedMap& cf_to_ts_sz, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + const WriteOptions& write_options, + const UnorderedMap& cf_to_ts_sz); WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } uint64_t get_log_number() const { return log_number_; } - IOStatus WriteBuffer(); + IOStatus WriteBuffer(const WriteOptions& write_options); - IOStatus Close(); + IOStatus Close(const WriteOptions& write_options); bool BufferIsEmpty(); @@ -121,9 +120,8 @@ class Writer { // record type stored in the header. uint32_t type_crc_[kMaxRecordType + 1]; - IOStatus EmitPhysicalRecord( - RecordType type, const char* ptr, size_t length, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + IOStatus EmitPhysicalRecord(const WriteOptions& write_options, + RecordType type, const char* ptr, size_t length); // If true, it does not flush after each write. 
Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() diff --git a/db/memtable.cc b/db/memtable.cc index 0b8786bc2f..56679dd754 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -597,7 +597,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { assert(!IsFragmentedRangeTombstonesConstructed(false)); // There should be no concurrent Construction if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority auto* unfragmented_iter = new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, true /* use_range_del_table */); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index dfa93461bb..a65d3914b6 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -502,6 +502,7 @@ Status MemTableList::TryInstallMemtableFlushResults( mu->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); // Flush was successful // Record the status on the memtable object. Either this call or a call by a @@ -614,10 +615,10 @@ Status MemTableList::TryInstallMemtableFlushResults( }; if (write_edits) { // this can release and reacquire the mutex. 
- s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, - mu, db_directory, /*new_descriptor_log=*/false, - /*column_family_options=*/nullptr, - manifest_write_cb); + s = vset->LogAndApply( + cfd, mutable_cf_options, read_options, write_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, manifest_write_cb); } else { // If write_edit is false (e.g: successful mempurge), // then remove old memtables, wake up manifest write queue threads, @@ -835,6 +836,7 @@ Status InstallMemtableAtomicFlushResults( mu->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); size_t num = mems_list.size(); assert(cfds.size() == num); @@ -913,8 +915,8 @@ Status InstallMemtableAtomicFlushResults( } // this can release and reacquire the mutex. - s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, db_directory); + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, db_directory); for (size_t k = 0; k != cfds.size(); ++k) { auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); diff --git a/db/repair.cc b/db/repair.cc index 7585d97585..bf409e22ac 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -146,8 +146,10 @@ class Repairer { // Adds a column family to the VersionSet with cf_options_ and updates // manifest. 
Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { - // TODO: plumb Env::IOActivity; + // TODO: plumb Env::IOActivity, Env::IOPriority; const ReadOptions read_options; + const WriteOptions write_options; + const auto* cf_opts = GetColumnFamilyOptions(cf_name); if (cf_opts == nullptr) { return Status::Corruption("Encountered unknown column family with name=" + @@ -170,9 +172,9 @@ class Repairer { Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (status.ok()) { - status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, - db_dir.get(), false /* new_descriptor_log */, - cf_opts); + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, write_options, + &edit, &mutex_, db_dir.get(), + false /* new_descriptor_log */, cf_opts); } mutex_.Unlock(); return status; @@ -362,9 +364,6 @@ class Repairer { } }; - // TODO: plumb Env::IOActivity - const ReadOptions read_options; - // Open the log file std::string logname = LogFileName(wal_dir, log); const auto& fs = env_->GetFileSystem(); @@ -440,7 +439,7 @@ class Repairer { FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -463,26 +462,29 @@ class Repairer { IOStatus io_s; CompressionOptions default_compression; + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_option(Env::IO_HIGH); TableBuilderOptions tboptions( - *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), - -1 /* level */, false /* is_bottommost */, - TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, - 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_, - 0 /*target_file_size*/, 
meta.fd.GetNumber()); + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), read_options, + write_option, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), kNoCompression, + default_compression, cfd->GetID(), cfd->GetName(), -1 /* level */, + false /* is_bottommost */, TableFileCreationReason::kRecovery, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); SeqnoToTimeMapping empty_seqno_to_time_mapping; status = BuildTable( dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, - file_options_, read_options, table_cache_.get(), iter.get(), + file_options_, table_cache_.get(), iter.get(), std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, empty_seqno_to_time_mapping, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - write_hint); + 0 /* job_id */, nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -529,7 +531,7 @@ class Repairer { file_size); std::shared_ptr props; if (status.ok()) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; status = table_cache_->GetTableProperties( file_options_, read_options, icmp_, t->meta, &props, @@ -592,7 +594,7 @@ class Repairer { } } if (status.ok()) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( @@ -641,7 +643,7 @@ class Repairer { // an SST file is a full sorted run. 
This probably needs the extra logic // from compaction_job.cc around call to UpdateBoundariesForRange (to // handle range tombstones extendingg beyond range of other entries). - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( @@ -666,8 +668,10 @@ class Repairer { } Status AddTables() { - // TODO: plumb Env::IOActivity; + // TODO: plumb Env::IOActivity, Env::IOPriority; const ReadOptions read_options; + const WriteOptions write_options; + std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { @@ -755,8 +759,8 @@ class Repairer { nullptr); if (s.ok()) { s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */); + read_options, write_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */); } mutex_.Unlock(); } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index e10f02e67f..6bf265c2b9 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -52,10 +52,13 @@ void MakeBuilder( std::unique_ptr wf(new test::StringSink); writable->reset( new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + const ReadOptions read_options; + const WriteOptions write_options; TableBuilderOptions tboptions( - ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - options.compression, options.compression_opts, kTestColumnFamilyId, - kTestColumnFamilyName, kTestLevel); + ioptions, moptions, read_options, write_options, internal_comparator, + int_tbl_prop_collector_factories, options.compression, + options.compression_opts, kTestColumnFamilyId, kTestColumnFamilyName, + kTestLevel); builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -280,7 +283,7 @@ void 
TestCustomizedTablePropertiesCollector( builder->Add(ikey.Encode(), kv.second); } ASSERT_OK(builder->Finish()); - ASSERT_OK(writer->Flush()); + ASSERT_OK(writer->Flush(IOOptions())); // -- Step 2: Read properties test::StringSink* fwf = @@ -419,7 +422,7 @@ void TestInternalKeyPropertiesCollector( } ASSERT_OK(builder->Finish()); - ASSERT_OK(writable->Flush()); + ASSERT_OK(writable->Flush(IOOptions())); test::StringSink* fwf = static_cast(writable->writable_file()); diff --git a/db/version_set.cc b/db/version_set.cc index 72febac90f..d41f879824 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1623,7 +1623,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::stringstream ss; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; for (int level = 0; level < storage_info_.num_levels_; level++) { for (const auto& file_meta : storage_info_.files_[level]) { @@ -5113,7 +5113,7 @@ Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) { std::string manifest_file_name = DescriptorFileName(dbname_, manifest_file_number_); uint64_t size = 0; - IOStatus io_s = descriptor_log_->Close(); + IOStatus io_s = descriptor_log_->Close(WriteOptions()); descriptor_log_.reset(); TEST_SYNC_POINT("VersionSet::Close:AfterClose"); if (io_s.ok()) { @@ -5146,7 +5146,8 @@ Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) { VersionEdit edit; assert(cfd); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = LogAndApply(cfd, cf_opts, ReadOptions(), &edit, mu, db_dir); + s = LogAndApply(cfd, cf_opts, ReadOptions(), WriteOptions(), &edit, mu, + db_dir); } closed_ = true; @@ -5230,8 +5231,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options, 
- const ReadOptions& read_options) { + const ColumnFamilyOptions* new_cf_options, const ReadOptions& read_options, + const WriteOptions& write_options) { mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); @@ -5505,13 +5506,15 @@ Status VersionSet::ProcessManifestWrites( FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, - io_tracer_, nullptr, db_options_->listeners, nullptr, + io_tracer_, nullptr, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + db_options_->listeners, nullptr, tmp_set.Contains(FileType::kDescriptorFile), tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, wal_additions, - descriptor_log_.get(), io_s); + s = WriteCurrentStateToManifest(write_options, curr_state, + wal_additions, descriptor_log_.get(), + io_s); } else { manifest_io_status = io_s; s = io_s; @@ -5555,7 +5558,7 @@ Status VersionSet::ProcessManifestWrites( } ++idx; #endif /* !NDEBUG */ - io_s = descriptor_log_->AddRecord(record); + io_s = descriptor_log_->AddRecord(write_options, record); if (!io_s.ok()) { s = io_s; manifest_io_status = io_s; @@ -5564,7 +5567,8 @@ Status VersionSet::ProcessManifestWrites( } if (s.ok()) { - io_s = SyncManifest(db_options_, descriptor_log_->file()); + io_s = + SyncManifest(db_options_, write_options, descriptor_log_->file()); manifest_io_status = io_s; TEST_SYNC_POINT_CALLBACK( "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); @@ -5582,7 +5586,8 @@ Status VersionSet::ProcessManifestWrites( assert(manifest_io_status.ok()); } if (s.ok() && new_descriptor_log) { - io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, + io_s = SetCurrentFile(write_options, fs_.get(), dbname_, + pending_manifest_file_number_, dir_contains_current_file); if 
(!io_s.ok()) { s = io_s; @@ -5822,7 +5827,7 @@ void VersionSet::WakeUpWaitingManifestWriters() { Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, @@ -5900,8 +5905,8 @@ Status VersionSet::LogAndApply( return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, dir_contains_current_file, - new_descriptor_log, new_cf_options, - read_options); + new_descriptor_log, new_cf_options, read_options, + write_options); } void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, @@ -6238,7 +6243,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, Status VersionSet::ListColumnFamiliesFromManifest( const std::string& manifest_path, FileSystem* fs, std::vector* column_families) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr file_reader; Status s; @@ -6282,8 +6287,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; ImmutableDBOptions db_options(*options); ColumnFamilyOptions cf_options(*options); @@ -6373,8 +6379,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, read_options, &ve, - &dummy_mutex, nullptr, true); + mutable_cf_options, read_options, write_options, + &ve, &dummy_mutex, nullptr, true); } // Get the checksum information 
including the checksum and checksum function @@ -6448,7 +6454,7 @@ Status VersionSet::DumpManifest( Options& options, std::string& dscname, bool verbose, bool hex, bool json, const std::vector& cf_descs) { assert(options.env); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::vector column_families; @@ -6515,6 +6521,7 @@ void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { } Status VersionSet::WriteCurrentStateToManifest( + const WriteOptions& write_options, const std::unordered_map& curr_state, const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? @@ -6535,7 +6542,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit:" + edit_for_db_id.DebugString(true)); } - io_s = log->AddRecord(db_id_record); + io_s = log->AddRecord(write_options, db_id_record); if (!io_s.ok()) { return io_s; } @@ -6550,7 +6557,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit: " + wal_additions.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -6567,7 +6574,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit: " + wal_deletions.DebugString(true)); } - io_s = log->AddRecord(wal_deletions_record); + io_s = log->AddRecord(write_options, wal_deletions_record); if (!io_s.ok()) { return io_s; } @@ -6597,7 +6604,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit:" + edit.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -6679,7 +6686,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode 
VersionEdit:" + edit.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } diff --git a/db/version_set.h b/db/version_set.h index d99edfd6c1..d73a535606 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1170,14 +1170,15 @@ class VersionSet { virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu); Status LogAndApplyToDefaultColumnFamily( - const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, const WriteOptions& write_options, + VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); const MutableCFOptions* cf_options = default_cf->GetLatestMutableCFOptions(); - return LogAndApply(default_cf, *cf_options, read_options, edit, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(default_cf, *cf_options, read_options, write_options, + edit, mu, dir_contains_current_file, new_descriptor_log, column_family_options); } @@ -1190,7 +1191,8 @@ class VersionSet { Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, const WriteOptions& write_options, + VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, const std::function& manifest_wcb = {}) { @@ -1202,16 +1204,17 @@ class VersionSet { autovector edit_list; edit_list.emplace_back(edit); edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, dir_contains_current_file, new_descriptor_log, - column_family_options, 
{manifest_wcb}); + return LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, dir_contains_current_file, + new_descriptor_log, column_family_options, + {manifest_wcb}); } // The batch version. If edit_list.size() > 1, caller must ensure that // no edit in the list column family add or drop Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, @@ -1222,9 +1225,10 @@ class VersionSet { mutable_cf_options_list.emplace_back(&mutable_cf_options); autovector> edit_lists; edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, dir_contains_current_file, new_descriptor_log, - column_family_options, {manifest_wcb}); + return LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, dir_contains_current_file, + new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. 
If edit_lists contain more than @@ -1233,7 +1237,7 @@ class VersionSet { virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, @@ -1547,6 +1551,7 @@ class VersionSet { new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); constexpr bool update_stats = false; + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; version->PrepareAppend(mutable_cf_options, read_options, update_stats); AppendVersion(cfd, version); @@ -1595,6 +1600,7 @@ class VersionSet { // Save current contents to *log Status WriteCurrentStateToManifest( + const WriteOptions& write_options, const std::unordered_map& curr_state, const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); @@ -1688,7 +1694,8 @@ class VersionSet { FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, - const ReadOptions& read_options); + const ReadOptions& read_options, + const WriteOptions& write_options); void LogAndApplyCFHelper(VersionEdit* edit, SequenceNumber* max_last_sequence); @@ -1747,7 +1754,7 @@ class ReactiveVersionSet : public VersionSet { private: std::unique_ptr manifest_tailer_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1756,6 +1763,7 @@ class ReactiveVersionSet : public VersionSet { const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, const ReadOptions& /* read_options */, + const WriteOptions& /* write_options */, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, bool /*new_descriptor_log*/, const ColumnFamilyOptions* 
/*new_cf_option*/, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 390d355e4c..b16ffd0359 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1322,11 +1322,11 @@ class VersionSetTestBase { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; new_db.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); for (const auto& e : new_cfs) { record.clear(); e.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } } @@ -1342,11 +1342,11 @@ class VersionSetTestBase { void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families_, false)); @@ -1392,7 +1392,7 @@ class VersionSetTestBase { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr); + read_options_, write_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1406,7 +1406,7 @@ class VersionSetTestBase { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, vedits, &mutex_, nullptr); + read_options_, write_options_, vedits, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1418,7 +1418,8 @@ class VersionSetTestBase { VersionEdit dummy; ASSERT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &dummy, &mutex_, 
db_directory, new_descriptor_log)); + read_options_, write_options_, &dummy, &mutex_, db_directory, + new_descriptor_log)); mutex_.Unlock(); } @@ -1436,7 +1437,7 @@ class VersionSetTestBase { mutex_.Lock(); s = versions_->LogAndApply(/*column_family_data=*/nullptr, MutableCFOptions(cf_options), read_options_, - &new_cf, &mutex_, + write_options_, &new_cf, &mutex_, /*db_directory=*/nullptr, /*new_descriptor_log=*/false, &cf_options); mutex_.Unlock(); @@ -1459,6 +1460,8 @@ class VersionSetTestBase { ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; const ReadOptions read_options_; + const WriteOptions write_options_; + std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1483,6 +1486,7 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; const ReadOptions read_options; + const WriteOptions write_options; autovector edits; for (int i = 0; i != kGroupSize; ++i) { @@ -1510,8 +1514,9 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, - edit_lists, &mutex_, nullptr); + Status s = + versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, + write_options, edit_lists, &mutex_, nullptr); mutex_.Unlock(); EXPECT_OK(s); EXPECT_EQ(kGroupSize - 1, count); @@ -1713,7 +1718,7 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr); + read_options_, write_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2454,7 +2459,8 @@ class VersionSetWithTimestampTest : public VersionSetTest { Status s; mutex_.Lock(); s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), - read_options_, edits_, &mutex_, nullptr); + read_options_, 
write_options_, edits_, &mutex_, + nullptr); mutex_.Unlock(); ASSERT_OK(s); VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); @@ -2514,7 +2520,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -2526,7 +2532,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -2540,7 +2546,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -2556,7 +2562,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupTestSyncPoints() { @@ -2602,7 +2608,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, for (int i = 0; i < num_edits; i++) { std::string record; edits_[i].EncodeTo(&record); - ASSERT_OK(log_writer_->AddRecord(record)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); } } @@ -2724,7 +2730,7 @@ TEST_F(VersionSetAtomicGroupTest, // edits. 
std::string last_record; edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); - EXPECT_OK(log_writer_->AddRecord(last_record)); + EXPECT_OK(log_writer_->AddRecord(WriteOptions(), last_record)); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); @@ -2896,12 +2902,13 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, // last column family in an atomic group. TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { const ReadOptions read_options; + const WriteOptions write_options; std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); @@ -2924,9 +2931,9 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { cfd_to_drop->Ref(); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); mutex_.Lock(); - s = versions_->LogAndApply(cfd_to_drop, - *cfd_to_drop->GetLatestMutableCFOptions(), - read_options, &drop_cf_edit, &mutex_, nullptr); + s = versions_->LogAndApply( + cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options, + write_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2976,7 +2983,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options, - edit_lists, &mutex_, nullptr); + write_options, edit_lists, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); @@ -3010,7 +3017,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); - s = 
(*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); // Create new column family VersionEdit new_cf; @@ -3020,7 +3027,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, new_cf.SetNextFile(2); record.clear(); ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } @@ -3034,8 +3041,8 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, TEST_F(EmptyDefaultCfNewManifest, Recover) { PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = - SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3066,7 +3073,7 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -3085,7 +3092,7 @@ class VersionSetTestEmptyDb log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; new_db.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } } @@ -3099,8 +3106,8 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = - SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3140,11 
+3147,12 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { { std::string record; new_cf1.EncodeTo(&record); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3187,11 +3195,12 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3234,7 +3243,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } { @@ -3245,11 +3254,12 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { tmp_edit.SetLastSequence(0); std::string record; ASSERT_TRUE(tmp_edit.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3292,7 +3302,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = 
log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } { @@ -3302,11 +3312,12 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { tmp_edit.SetLastSequence(0); std::string record; ASSERT_TRUE(tmp_edit.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3407,7 +3418,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, { std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } const std::vector cf_names = { @@ -3425,7 +3436,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, new_cf.SetColumnFamily(cf_id); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); VersionEdit cf_files; @@ -3433,7 +3444,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, cf_files.SetLogNumber(0); record.clear(); ASSERT_TRUE(cf_files.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); ++cf_id; } @@ -3444,7 +3455,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, edit.SetLastSequence(seq); std::string record; ASSERT_TRUE(edit.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } *last_seqno = seq + 1; @@ -3485,9 +3496,12 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); IntTblPropCollectorFactories 
int_tbl_prop_collector_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(table_factory_->NewTableBuilder( TableBuilderOptions( - immutable_options_, mutable_cf_options_, *internal_comparator_, + immutable_options_, mutable_cf_options_, read_options, + write_options, *internal_comparator_, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, @@ -3496,7 +3510,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, InternalKey ikey(info.key, 0, ValueType::kTypeValue); builder->Add(ikey.Encode(), "value"); ASSERT_OK(builder->Finish()); - ASSERT_OK(fwriter->Flush()); + ASSERT_OK(fwriter->Flush(IOOptions())); uint64_t file_size = 0; s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); ASSERT_OK(s); @@ -3528,7 +3542,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, assert(log_writer_.get() != nullptr); std::string record; ASSERT_TRUE(edit.EncodeTo(&record, 0 /* ts_sz */)); - Status s = log_writer_->AddRecord(record); + Status s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } @@ -3573,7 +3587,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3631,7 +3645,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, added_files, std::vector>()); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3685,7 +3699,7 @@ 
TEST_F(VersionSetTestMissingFiles, NoFileMissing) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); diff --git a/db/version_util.h b/db/version_util.h index ca2e7a377f..e499b9e2ed 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -36,15 +36,17 @@ class OfflineManifestWriter { /*no_error_if_files_missing*/ true); } - Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + Status LogAndApply(const ReadOptions& read_options, + const WriteOptions& write_options, ColumnFamilyData* cfd, VersionEdit* edit, FSDirectory* dir_contains_current_file) { // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. InstrumentedMutex mutex; mutex.Lock(); - Status s = versions_.LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, - dir_contains_current_file, false /* new_descriptor_log */); + Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, write_options, edit, &mutex, + dir_contains_current_file, + false /* new_descriptor_log */); mutex.Unlock(); return s; } diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 3f47c2901d..3be19cb3a4 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -73,8 +73,8 @@ class WalManagerTest : public testing::Test { WriteBatch batch; ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - ASSERT_OK( - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); + ASSERT_OK(current_log_writer_->AddRecord( + WriteOptions(), WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -146,7 +146,8 @@ TEST_F(WalManagerTest, 
ReadFirstRecordCache) { WriteBatch batch; ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); + ASSERT_OK( + writer.AddRecord(WriteOptions(), WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. // Waiting for lei to finish with db_test diff --git a/db/write_batch.cc b/db/write_batch.cc index 75f6e1eb48..09fa2c371e 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2064,7 +2064,7 @@ class MemTableInserter : public WriteBatch::Handler { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; // it's going to be overwritten for sure, so no point caching data block // containing the old version @@ -2511,7 +2511,7 @@ class MemTableInserter : public WriteBatch::Handler { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; read_options.snapshot = &read_from_snapshot; diff --git a/db/write_thread.h b/db/write_thread.h index 6e5805e376..dc64601f9f 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -166,6 +166,8 @@ class WriteThread { PreReleaseCallback* _pre_release_callback = nullptr, PostMemTableCallback* _post_memtable_callback = nullptr) : batch(_batch), + // TODO: store a copy of WriteOptions instead of its separate data + // members sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 83e6838c70..c5e2a1c454 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -76,6 +76,161 @@ class 
DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { } }; +class DbStressWritableFileWrapper : public FSWritableFileOwnerWrapper { + public: + explicit DbStressWritableFileWrapper(std::unique_ptr&& target) + : FSWritableFileOwnerWrapper(std::move(target)) {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Append(data, options, dbg); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Append(data, options, verification_info, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->PositionedAppend(data, offset, options, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + 
ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->PositionedAppend(data, offset, options, verification_info, + dbg); + } + + virtual IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Truncate(size, options, dbg); + } + + virtual IOStatus Close(const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Close(options, dbg); + } + + virtual IOStatus Flush(const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Flush(options, dbg); + } + + virtual IOStatus Sync(const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == 
options.io_activity); +#endif + return target()->Sync(options, dbg); + } + + virtual IOStatus Fsync(const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Fsync(options, dbg); + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual IOStatus Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Allocate(offset, len, options, dbg); + } +#endif + + virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->RangeSync(offset, nbytes, options, dbg); + } +}; + class DbStressFSWrapper : public FileSystemWrapper { public: explicit DbStressFSWrapper(const std::shared_ptr& t) @@ -95,6 +250,17 @@ class DbStressFSWrapper : public FileSystemWrapper { return s; } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewWritableFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new 
DbStressWritableFileWrapper(std::move(file))); + } + return s; + } + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, diff --git a/db_stress_tool/db_stress_listener.cc b/db_stress_tool/db_stress_listener.cc index 64adca8777..8b9fb2cbbe 100644 --- a/db_stress_tool/db_stress_listener.cc +++ b/db_stress_tool/db_stress_listener.cc @@ -130,8 +130,13 @@ UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name, Env* env) } UniqueIdVerifier::~UniqueIdVerifier() { - IOStatus s = data_file_writer_->Close(); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + IOStatus s; + s = data_file_writer_->Close(IOOptions()); assert(s.ok()); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void UniqueIdVerifier::VerifyNoWrite(const std::string& id) { @@ -153,13 +158,14 @@ void UniqueIdVerifier::Verify(const std::string& id) { if (id_set_.size() >= 4294967) { return; } - IOStatus s = data_file_writer_->Append(Slice(id)); + IOOptions opts; + IOStatus s = data_file_writer_->Append(opts, Slice(id)); if (!s.ok()) { fprintf(stderr, "Error writing to unique id file: %s\n", s.ToString().c_str()); assert(false); } - s = data_file_writer_->Flush(); + s = data_file_writer_->Flush(opts); if (!s.ok()) { fprintf(stderr, "Error flushing unique id file: %s\n", s.ToString().c_str()); diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index 145a96a750..ee90711b1f 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -373,10 +373,15 @@ Status MultiOpsTxnsStressTest::TestGet( ThreadState* thread, const ReadOptions& read_opts, const std::vector& /*rand_column_families*/, const std::vector& /*rand_keys*/) { + ThreadStatus::OperationType cur_op_type = + 
ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); uint32_t a = 0; uint32_t pos = 0; std::tie(a, pos) = ChooseExistingA(thread); - return PointLookupTxn(thread, read_opts, a); + Status s = PointLookupTxn(thread, read_opts, a); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; } // Not used. @@ -416,10 +421,15 @@ Status MultiOpsTxnsStressTest::TestIterate( ThreadState* thread, const ReadOptions& read_opts, const std::vector& /*rand_column_families*/, const std::vector& /*rand_keys*/) { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); uint32_t c = 0; uint32_t pos = 0; std::tie(c, pos) = ChooseExistingC(thread); - return RangeScanTxn(thread, read_opts, c); + Status s = RangeScanTxn(thread, read_opts, c); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; } // Not intended for use. @@ -1221,7 +1231,11 @@ void MultiOpsTxnsStressTest::VerifyPkSkFast(const ReadOptions& read_options, assert(db_ == db); assert(db_ != nullptr); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); const Snapshot* const snapshot = db_->GetSnapshot(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); assert(snapshot); ManagedSnapshot snapshot_guard(db_, snapshot); diff --git a/env/env.cc b/env/env.cc index 8ad828a838..7405f66e08 100644 --- a/env/env.cc +++ b/env/env.cc @@ -1051,9 +1051,10 @@ void Log(const std::shared_ptr& info_log, const char* format, ...) 
{ } Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, - bool should_sync) { + bool should_sync, const IOOptions* io_options) { const auto& fs = env->GetFileSystem(); - return WriteStringToFile(fs.get(), data, fname, should_sync); + return WriteStringToFile(fs.get(), data, fname, should_sync, + io_options ? *io_options : IOOptions()); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { diff --git a/env/env_test.cc b/env/env_test.cc index 4cf3c988d3..f478806f8a 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -2610,7 +2610,7 @@ TEST_F(EnvTest, IsDirectory) { FileOptions(), SystemClock::Default().get())); constexpr char buf[] = "test"; - s = fwriter->Append(buf); + s = fwriter->Append(IOOptions(), buf); ASSERT_OK(s); } ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir)); diff --git a/env/file_system.cc b/env/file_system.cc index e01ec12c9c..27c7207f0f 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -180,19 +180,20 @@ FileOptions FileSystem::OptimizeForBlobFileRead( } IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, - const std::string& fname, bool should_sync) { + const std::string& fname, bool should_sync, + const IOOptions& io_options) { std::unique_ptr file; EnvOptions soptions; IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); if (!s.ok()) { return s; } - s = file->Append(data, IOOptions(), nullptr); + s = file->Append(data, io_options, nullptr); if (s.ok() && should_sync) { - s = file->Sync(IOOptions(), nullptr); + s = file->Sync(io_options, nullptr); } if (!s.ok()) { - fs->DeleteFile(fname, IOOptions(), nullptr); + fs->DeleteFile(fname, io_options, nullptr); } return s; } diff --git a/file/file_util.cc b/file/file_util.cc index 9eee106378..d78a03491a 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -26,6 +26,7 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, FileOptions soptions; IOStatus io_s; std::unique_ptr 
src_reader; + const IOOptions opts; { soptions.temperature = temperature; @@ -37,7 +38,7 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, if (size == 0) { // default argument means copy everything - io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr); + io_s = fs->GetFileSize(source, opts, &size, nullptr); if (!io_s.ok()) { return io_s; } @@ -60,13 +61,14 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, if (slice.size() == 0) { return IOStatus::Corruption("file too small"); } - io_s = dest_writer->Append(slice); + + io_s = dest_writer->Append(opts, slice); if (!io_s.ok()) { return io_s; } size -= slice.size(); } - return dest_writer->Sync(use_fsync); + return dest_writer->Sync(opts, use_fsync); } IOStatus CopyFile(FileSystem* fs, const std::string& source, @@ -85,6 +87,7 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, return io_s; } + // TODO: pass in Histograms if the destination file is sst or blob dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, options)); } @@ -99,19 +102,21 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination, const EnvOptions soptions; IOStatus io_s; std::unique_ptr dest_writer; + const IOOptions opts; std::unique_ptr destfile; io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); if (!io_s.ok()) { return io_s; } + // TODO: pass in Histograms if the destination file is sst or blob dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); - io_s = dest_writer->Append(Slice(contents)); + io_s = dest_writer->Append(opts, Slice(contents)); if (!io_s.ok()) { return io_s; } - return dest_writer->Sync(use_fsync); + return dest_writer->Sync(opts, use_fsync); } Status DeleteDBFile(const ImmutableDBOptions* db_options, diff --git a/file/file_util.h b/file/file_util.h index 9c95478c79..032afc19b4 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -87,6 +87,14 @@ inline IOStatus 
PrepareIOFromReadOptions(const ReadOptions& ro, return IOStatus::OK(); } +inline IOStatus PrepareIOFromWriteOptions(const WriteOptions& wo, + IOOptions& opts) { + opts.rate_limiter_priority = wo.rate_limiter_priority; + opts.io_activity = wo.io_activity; + + return IOStatus::OK(); +} + // Test method to delete the input directory and all of its contents. // This method is destructive and is meant for use only in tests!!! Status DestroyDir(Env* env, const std::string& dir); diff --git a/file/filename.cc b/file/filename.cc index fb7d254721..b34a0e113e 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -13,8 +13,10 @@ #include #include +#include "file/file_util.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -384,8 +386,8 @@ bool ParseFileName(const std::string& fname, uint64_t* number, return true; } -IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, - uint64_t descriptor_number, +IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, + const std::string& dbname, uint64_t descriptor_number, FSDirectory* dir_contains_current_file) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); @@ -393,21 +395,25 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, assert(contents.starts_with(dbname + "/")); contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); - IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + IOOptions opts; + IOStatus s = PrepareIOFromWriteOptions(write_options, opts); + if (s.ok()) { + s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts); + } TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); 
- s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); + s = fs->RenameFile(tmp, CurrentFileName(dbname), opts, nullptr); TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (dir_contains_current_file != nullptr) { s = dir_contains_current_file->FsyncWithDirOptions( - IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); + opts, nullptr, DirFsyncOptions(CurrentFileName(dbname))); } } else { - fs->DeleteFile(tmp, IOOptions(), nullptr) + fs->DeleteFile(tmp, opts, nullptr) .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable // here as we are already handling an error // case, and this is just a best-attempt @@ -416,8 +422,8 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, return s; } -Status SetIdentityFile(Env* env, const std::string& dbname, - const std::string& db_id) { +Status SetIdentityFile(const WriteOptions& write_options, Env* env, + const std::string& dbname, const std::string& db_id) { std::string id; if (db_id.empty()) { id = env->GenerateUniqueId(); @@ -428,17 +434,21 @@ Status SetIdentityFile(Env* env, const std::string& dbname, // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); std::string identify_file_name = IdentityFileName(dbname); - Status s = WriteStringToFile(env, id, tmp, true); + Status s; + IOOptions opts; + s = PrepareIOFromWriteOptions(write_options, opts); + if (s.ok()) { + s = WriteStringToFile(env, id, tmp, true, &opts); + } if (s.ok()) { s = env->RenameFile(tmp, identify_file_name); } std::unique_ptr dir_obj; if (s.ok()) { - s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj, - nullptr); + s = env->GetFileSystem()->NewDirectory(dbname, opts, &dir_obj, nullptr); } if (s.ok()) { - s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + s = dir_obj->FsyncWithDirOptions(opts, nullptr, 
DirFsyncOptions(identify_file_name)); } @@ -446,7 +456,7 @@ Status SetIdentityFile(Env* env, const std::string& dbname, // if it is not impelmented. Detailed explanations can be found in // db/db_impl/db_impl.h if (s.ok()) { - Status temp_s = dir_obj->Close(IOOptions(), nullptr); + Status temp_s = dir_obj->Close(opts, nullptr); if (!temp_s.ok()) { if (temp_s.IsNotSupported()) { temp_s.PermitUncheckedError(); @@ -462,10 +472,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname, } IOStatus SyncManifest(const ImmutableDBOptions* db_options, + const WriteOptions& write_options, WritableFileWriter* file) { TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); - return file->Sync(db_options->use_fsync); + IOOptions io_options; + IOStatus s = PrepareIOFromWriteOptions(write_options, io_options); + if (!s.ok()) { + return s; + } + return file->Sync(io_options, db_options->use_fsync); } Status GetInfoLogFiles(const std::shared_ptr& fs, diff --git a/file/filename.h b/file/filename.h index 2eb125b6a1..156b7224ff 100644 --- a/file/filename.h +++ b/file/filename.h @@ -162,16 +162,19 @@ extern bool ParseFileName(const std::string& filename, uint64_t* number, // specified number. On its success and when dir_contains_current_file is not // nullptr, the function will fsync the directory containing the CURRENT file // when -extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, +extern IOStatus SetCurrentFile(const WriteOptions& write_options, + FileSystem* fs, const std::string& dbname, uint64_t descriptor_number, FSDirectory* dir_contains_current_file); // Make the IDENTITY file for the db -extern Status SetIdentityFile(Env* env, const std::string& dbname, +extern Status SetIdentityFile(const WriteOptions& write_options, Env* env, + const std::string& dbname, const std::string& db_id = {}); // Sync manifest file `file`. 
extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, + const WriteOptions& write_options, WritableFileWriter* file); // Return list of file names of info logs in `file_names`. diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index 908878a5fa..4fadf1d71a 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -13,6 +13,7 @@ #include #include "db/version_edit.h" +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" @@ -24,6 +25,24 @@ #include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { +inline Histograms GetFileWriteHistograms(Histograms file_writer_hist, + Env::IOActivity io_activity) { + if (file_writer_hist == Histograms::SST_WRITE_MICROS || + file_writer_hist == Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS) { + switch (io_activity) { + case Env::IOActivity::kFlush: + return Histograms::FILE_WRITE_FLUSH_MICROS; + case Env::IOActivity::kCompaction: + return Histograms::FILE_WRITE_COMPACTION_MICROS; + case Env::IOActivity::kDBOpen: + return Histograms::FILE_WRITE_DB_OPEN_MICROS; + default: + break; + } + } + return Histograms::HISTOGRAM_ENUM_MAX; +} + IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, const std::string& fname, const FileOptions& file_opts, @@ -42,12 +61,16 @@ IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, return io_s; } -IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data, + uint32_t crc32c_checksum) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + StopWatch sw(clock_, stats_, hist_type_, + GetFileWriteHistograms(hist_type_, opts.io_activity)); + + const IOOptions io_options = FinalizeIOOptions(opts); const char* src = data.data(); size_t left = data.size(); IOStatus s; @@ -59,10 +82,6 @@ IOStatus 
WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, UpdateFileChecksum(data); { - IOOptions io_options; - io_options.rate_limiter_priority = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); IOSTATS_TIMER_GUARD(prepare_write_nanos); TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite"); writable_file_->PrepareWrite(static_cast(GetFileSize()), left, @@ -88,7 +107,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, // Flush only when buffered I/O if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { if (buf_.CurrentSize() > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; @@ -119,7 +138,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, src += appended; if (left > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { break; } @@ -129,7 +148,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, } else { assert(buf_.CurrentSize() == 0); buffered_data_crc32c_checksum_ = crc32c_checksum; - s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + s = WriteBufferedWithChecksum(io_options, src, left); } } else { // In this case, either we do not need to do the data verification or @@ -149,7 +168,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, src += appended; if (left > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { break; } @@ -160,9 +179,9 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, assert(buf_.CurrentSize() == 0); if (perform_data_verification_ && buffered_data_with_checksum_) { buffered_data_crc32c_checksum_ = crc32c::Value(src, left); - s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + s = 
WriteBufferedWithChecksum(io_options, src, left); } else { - s = WriteBuffered(src, left, op_rate_limiter_priority); + s = WriteBuffered(io_options, src, left); } } } @@ -177,11 +196,12 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, return s; } -IOStatus WritableFileWriter::Pad(const size_t pad_bytes, - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Pad(const IOOptions& opts, + const size_t pad_bytes) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + const IOOptions io_options = FinalizeIOOptions(opts); assert(pad_bytes < kDefaultPageSize); size_t left = pad_bytes; size_t cap = buf_.Capacity() - buf_.CurrentSize(); @@ -195,7 +215,7 @@ IOStatus WritableFileWriter::Pad(const size_t pad_bytes, buf_.PadWith(append_bytes, 0); left -= append_bytes; if (left > 0) { - IOStatus s = Flush(op_rate_limiter_priority); + IOStatus s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; @@ -214,11 +234,12 @@ IOStatus WritableFileWriter::Pad(const size_t pad_bytes, return IOStatus::OK(); } -IOStatus WritableFileWriter::Close() { +IOStatus WritableFileWriter::Close(const IOOptions& opts) { + IOOptions io_options = FinalizeIOOptions(opts); if (seen_error()) { IOStatus interim; if (writable_file_.get() != nullptr) { - interim = writable_file_->Close(IOOptions(), nullptr); + interim = writable_file_->Close(io_options, nullptr); writable_file_.reset(); } if (interim.ok()) { @@ -240,11 +261,9 @@ IOStatus WritableFileWriter::Close() { } IOStatus s; - s = Flush(); // flush cache to OS + s = Flush(io_options); // flush cache to OS IOStatus interim; - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); // In direct I/O mode we write whole pages so // we need to let the file know where data ends. 
if (use_direct_io()) { @@ -322,11 +341,13 @@ IOStatus WritableFileWriter::Close() { // write out the cached data to the OS cache or storage if direct I/O // enabled -IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Flush(const IOOptions& opts) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + const IOOptions io_options = FinalizeIOOptions(opts); + IOStatus s; TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); @@ -334,18 +355,17 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { if (use_direct_io()) { if (pending_sync_) { if (perform_data_verification_ && buffered_data_with_checksum_) { - s = WriteDirectWithChecksum(op_rate_limiter_priority); + s = WriteDirectWithChecksum(io_options); } else { - s = WriteDirect(op_rate_limiter_priority); + s = WriteDirect(io_options); } } } else { if (perform_data_verification_ && buffered_data_with_checksum_) { - s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(), - op_rate_limiter_priority); + s = WriteBufferedWithChecksum(io_options, buf_.BufferStart(), + buf_.CurrentSize()); } else { - s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(), - op_rate_limiter_priority); + s = WriteBuffered(io_options, buf_.BufferStart(), buf_.CurrentSize()); } } if (!s.ok()) { @@ -359,10 +379,6 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); s = writable_file_->Flush(io_options, nullptr); if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -400,7 +416,8 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { assert(offset_sync_to >= last_sync_size_); if 
(offset_sync_to > 0 && offset_sync_to - last_sync_size_ >= bytes_per_sync_) { - s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + s = RangeSync(io_options, last_sync_size_, + offset_sync_to - last_sync_size_); if (!s.ok()) { set_seen_error(); } @@ -429,19 +446,25 @@ const char* WritableFileWriter::GetFileChecksumFuncName() const { } } -IOStatus WritableFileWriter::Sync(bool use_fsync) { +IOStatus WritableFileWriter::PrepareIOOptions(const WriteOptions& wo, + IOOptions& opts) { + return PrepareIOFromWriteOptions(wo, opts); +} + +IOStatus WritableFileWriter::Sync(const IOOptions& opts, bool use_fsync) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } - IOStatus s = Flush(); + IOOptions io_options = FinalizeIOOptions(opts); + IOStatus s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; } TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { - s = SyncInternal(use_fsync); + s = SyncInternal(io_options, use_fsync); if (!s.ok()) { set_seen_error(); return s; @@ -452,17 +475,19 @@ IOStatus WritableFileWriter::Sync(bool use_fsync) { return IOStatus::OK(); } -IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { +IOStatus WritableFileWriter::SyncWithoutFlush(const IOOptions& opts, + bool use_fsync) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + IOOptions io_options = FinalizeIOOptions(opts); if (!writable_file_->IsSyncThreadSafe()) { return IOStatus::NotSupported( "Can't WritableFileWriter::SyncWithoutFlush() because " "WritableFile::IsSyncThreadSafe() is false"); } TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); - IOStatus s = SyncInternal(use_fsync); + IOStatus s = SyncInternal(io_options, use_fsync); TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); if (!s.ok()) { #ifndef NDEBUG @@ -473,7 +498,8 @@ IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { return s; } -IOStatus WritableFileWriter::SyncInternal(bool use_fsync) 
{ +IOStatus WritableFileWriter::SyncInternal(const IOOptions& opts, + bool use_fsync) { // Caller is supposed to check seen_error_ IOStatus s; IOSTATS_TIMER_GUARD(fsync_nanos); @@ -487,12 +513,10 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); if (use_fsync) { - s = writable_file_->Fsync(io_options, nullptr); + s = writable_file_->Fsync(opts, nullptr); } else { - s = writable_file_->Sync(io_options, nullptr); + s = writable_file_->Sync(opts, nullptr); } if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -511,7 +535,8 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { return s; } -IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { +IOStatus WritableFileWriter::RangeSync(const IOOptions& opts, uint64_t offset, + uint64_t nbytes) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -522,9 +547,7 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); - IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr); + IOStatus s = writable_file_->RangeSync(offset, nbytes, opts, nullptr); if (!s.ok()) { set_seen_error(); } @@ -541,8 +564,8 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { // This method writes to disk the specified data and makes use of the rate // limiter if available -IOStatus WritableFileWriter::WriteBuffered( - const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteBuffered(const IOOptions& opts, + const char* data, size_t size) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -553,11 +576,7 @@ IOStatus 
WritableFileWriter::WriteBuffered( size_t left = size; DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; while (left > 0) { size_t allowed = left; @@ -573,7 +592,7 @@ IOStatus WritableFileWriter::WriteBuffered( TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); FileOperationInfo::StartTimePoint start_ts; - uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + uint64_t old_size = writable_file_->GetFileSize(opts, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; @@ -585,10 +604,10 @@ IOStatus WritableFileWriter::WriteBuffered( if (perform_data_verification_) { Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->Append(Slice(src, allowed), io_options, v_info, + s = writable_file_->Append(Slice(src, allowed), opts, v_info, nullptr); } else { - s = writable_file_->Append(Slice(src, allowed), io_options, nullptr); + s = writable_file_->Append(Slice(src, allowed), opts, nullptr); } if (!s.ok()) { // If writable_file_->Append() failed, then the data may or may not @@ -635,8 +654,9 @@ IOStatus WritableFileWriter::WriteBuffered( return s; } -IOStatus WritableFileWriter::WriteBufferedWithChecksum( - const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts, + const char* data, + size_t size) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -648,11 +668,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( size_t left = size; 
DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; // Check how much is allowed. Here, we loop until the rate limiter allows to // write the entire buffer. // TODO: need to be improved since it sort of defeats the purpose of the rate @@ -673,7 +689,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); FileOperationInfo::StartTimePoint start_ts; - uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + uint64_t old_size = writable_file_->GetFileSize(opts, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; @@ -685,7 +701,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr); + s = writable_file_->Append(Slice(src, left), opts, v_info, nullptr); SetPerfLevel(prev_perf_level); } if (ShouldNotifyListeners()) { @@ -755,8 +771,7 @@ void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, // whole number of pages to be written again on the next flush because we can // only write on aligned // offsets. 
-IOStatus WritableFileWriter::WriteDirect( - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteDirect(const IOOptions& opts) { if (seen_error()) { assert(false); @@ -785,11 +800,7 @@ IOStatus WritableFileWriter::WriteDirect( size_t left = buf_.CurrentSize(); DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; while (left > 0) { // Check how much is allowed @@ -813,10 +824,10 @@ IOStatus WritableFileWriter::WriteDirect( Crc32cHandoffChecksumCalculation(src, size, checksum_buf); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - io_options, v_info, nullptr); + opts, v_info, nullptr); } else { s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - io_options, nullptr); + opts, nullptr); } if (ShouldNotifyListeners()) { @@ -859,8 +870,7 @@ IOStatus WritableFileWriter::WriteDirect( return s; } -IOStatus WritableFileWriter::WriteDirectWithChecksum( - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteDirectWithChecksum(const IOOptions& opts) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -895,11 +905,7 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum( DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; // Check how much is allowed. 
Here, we loop until the rate limiter allows to // write the entire buffer. // TODO: need to be improved since it sort of defeats the purpose of the rate @@ -925,8 +931,8 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum( // direct writes must be positional EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->PositionedAppend(Slice(src, left), write_offset, - io_options, v_info, nullptr); + s = writable_file_->PositionedAppend(Slice(src, left), write_offset, opts, + v_info, nullptr); if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -986,4 +992,14 @@ Env::IOPriority WritableFileWriter::DecideRateLimiterPriority( } } +IOOptions WritableFileWriter::FinalizeIOOptions(const IOOptions& opts) const { + Env::IOPriority op_rate_limiter_priority = opts.rate_limiter_priority; + IOOptions io_options(opts); + if (writable_file_.get() != nullptr) { + io_options.rate_limiter_priority = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + } + return io_options; +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index aac0f59491..6b71cfa64c 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -13,6 +13,7 @@ #include "db/version_edit.h" #include "env/file_system_tracer.h" +#include "monitoring/thread_status_util.h" #include "port/port.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" @@ -159,6 +160,7 @@ class WritableFileWriter { uint64_t bytes_per_sync_; RateLimiter* rate_limiter_; Statistics* stats_; + Histograms hist_type_; std::vector> listeners_; std::unique_ptr checksum_generator_; bool checksum_finalized_; @@ -173,6 +175,7 @@ class WritableFileWriter { const FileOptions& options, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, + 
Histograms hist_type = Histograms::HISTOGRAM_ENUM_MAX, const std::vector>& listeners = {}, FileChecksumGenFactory* file_checksum_gen_factory = nullptr, bool perform_data_verification = false, @@ -191,6 +194,7 @@ class WritableFileWriter { bytes_per_sync_(options.bytes_per_sync), rate_limiter_(options.rate_limiter), stats_(stats), + hist_type_(hist_type), listeners_(), checksum_generator_(nullptr), checksum_finalized_(false), @@ -222,35 +226,42 @@ class WritableFileWriter { const std::string& fname, const FileOptions& file_opts, std::unique_ptr* writer, IODebugContext* dbg); + + static IOStatus PrepareIOOptions(const WriteOptions& wo, IOOptions& opts); + WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; ~WritableFileWriter() { - auto s = Close(); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_UNKNOWN); + auto s = Close(IOOptions()); s.PermitUncheckedError(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } std::string file_name() const { return file_name_; } // When this Append API is called, if the crc32c_checksum is not provided, we // will calculate the checksum internally. - IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0, - Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Append(const IOOptions& opts, const Slice& data, + uint32_t crc32c_checksum = 0); - IOStatus Pad(const size_t pad_bytes, - Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Pad(const IOOptions& opts, const size_t pad_bytes); - IOStatus Flush(Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Flush(const IOOptions& opts); - IOStatus Close(); + IOStatus Close(const IOOptions& opts); - IOStatus Sync(bool use_fsync); + IOStatus Sync(const IOOptions& opts, bool use_fsync); // Sync only the data that was already Flush()ed. 
Safe to call concurrently // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), // returns NotSupported status. - IOStatus SyncWithoutFlush(bool use_fsync); + IOStatus SyncWithoutFlush(const IOOptions& opts, bool use_fsync); uint64_t GetFileSize() const { return filesize_.load(std::memory_order_acquire); @@ -307,14 +318,20 @@ class WritableFileWriter { // Used when os buffering is OFF and we are writing // DMA such as in Direct I/O mode - IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority); - IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteDirect(const IOOptions& opts); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteDirectWithChecksum(const IOOptions& opts); // Normal write. - IOStatus WriteBuffered(const char* data, size_t size, - Env::IOPriority op_rate_limiter_priority); - IOStatus WriteBufferedWithChecksum(const char* data, size_t size, - Env::IOPriority op_rate_limiter_priority); - IOStatus RangeSync(uint64_t offset, uint64_t nbytes); - IOStatus SyncInternal(bool use_fsync); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteBuffered(const IOOptions& opts, const char* data, size_t size); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteBufferedWithChecksum(const IOOptions& opts, const char* data, + size_t size); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus RangeSync(const IOOptions& opts, uint64_t offset, uint64_t nbytes); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus SyncInternal(const IOOptions& opts, bool use_fsync); + IOOptions FinalizeIOOptions(const IOOptions& opts) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 
7b0220635e..02cc604948 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -67,6 +67,7 @@ struct ThreadStatus; class FileSystem; class SystemClock; struct ConfigOptions; +struct IOOptions; const size_t kDefaultPageSize = 4 * 1024; @@ -1352,7 +1353,8 @@ extern void Fatal(Logger* info_log, const char* format, ...) // A utility routine: write "data" to the named file. extern Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, - bool should_sync = false); + bool should_sync = false, + const IOOptions* io_options = nullptr); // A utility routine: read contents of named file into *data extern Status ReadFileToString(Env* env, const std::string& fname, diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 647aad6c94..8590326732 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -1918,7 +1918,8 @@ class FSDirectoryWrapper : public FSDirectory { // A utility routine: write "data" to the named file. extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, - bool should_sync = false); + bool should_sync = false, + const IOOptions& io_options = IOOptions()); // A utility routine: read contents of named file into *data extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ae5ed2c265..9146f6d7bc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1781,7 +1781,7 @@ struct WriteOptions { // system call followed by "fdatasync()". // // Default: false - bool sync; + bool sync = false; // If true, writes will not first go to the write ahead log, // and the write may get lost after a crash. The backup engine @@ -1789,18 +1789,18 @@ struct WriteOptions { // you disable write-ahead logs, you must create backups with // flush_before_backup=true to avoid losing unflushed memtable data. 
// Default: false - bool disableWAL; + bool disableWAL = false; // If true and if user is trying to write to column families that don't exist // (they were dropped), ignore the write (don't return an error). If there // are multiple writes in a WriteBatch, other writes will succeed. // Default: false - bool ignore_missing_column_families; + bool ignore_missing_column_families = false; // If true and we need to wait or sleep for the write request, fails // immediately with Status::Incomplete(). // Default: false - bool no_slowdown; + bool no_slowdown = false; // If true, this write request is of lower priority if compaction is // behind. In this case, no_slowdown = true, the request will be canceled @@ -1809,7 +1809,7 @@ struct WriteOptions { // it introduces minimum impacts to high priority writes. // // Default: false - bool low_pri; + bool low_pri = false; // If true, this writebatch will maintain the last insert positions of each // memtable as hints in concurrent write. It can improve write performance @@ -1818,7 +1818,7 @@ struct WriteOptions { // option will be ignored. // // Default: false - bool memtable_insert_hint_per_batch; + bool memtable_insert_hint_per_batch = false; // For writes associated with this option, charge the internal rate // limiter (see `DBOptions::rate_limiter`) at the specified priority. The @@ -1833,24 +1833,25 @@ struct WriteOptions { // due to implementation constraints. // // Default: `Env::IO_TOTAL` - Env::IOPriority rate_limiter_priority; + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; // `protection_bytes_per_key` is the number of bytes used to store // protection information for each key entry. Currently supported values are // zero (disabled) and eight. // // Default: zero (disabled). 
- size_t protection_bytes_per_key; + size_t protection_bytes_per_key = 0; - WriteOptions() - : sync(false), - disableWAL(false), - ignore_missing_column_families(false), - no_slowdown(false), - low_pri(false), - memtable_insert_hint_per_batch(false), - rate_limiter_priority(Env::IO_TOTAL), - protection_bytes_per_key(0) {} + // For RocksDB internal use only + // + // Default: Env::IOActivity::kUnknown. + Env::IOActivity io_activity = Env::IOActivity::kUnknown; + + WriteOptions() {} + explicit WriteOptions(Env::IOActivity _io_activity); + explicit WriteOptions( + Env::IOPriority _rate_limiter_priority, + Env::IOActivity _io_activity = Env::IOActivity::kUnknown); }; // Options that control flush operations diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 026ae66d03..dca5a8f03a 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -34,6 +34,7 @@ class SstFileReader { // Verifies whether there is corruption in this table. 
Status VerifyChecksum(const ReadOptions& /*read_options*/); + // TODO: plumb Env::IOActivity, Env::IOPriority Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } private: @@ -42,4 +43,3 @@ class SstFileReader { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 9aab337124..1853e73d59 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -589,6 +589,14 @@ enum Histograms : uint32_t { FILE_READ_VERIFY_DB_CHECKSUM_MICROS, FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, + // Time spent in writing SST files + SST_WRITE_MICROS, + // Time spent in writing SST table (currently only block-based table) or blob + // file for flush, compaction or db open + FILE_WRITE_FLUSH_MICROS, + FILE_WRITE_COMPACTION_MICROS, + FILE_WRITE_DB_OPEN_MICROS, + // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, // Value size distribution in each operation diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 3edff81aa7..38e305f325 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5716,10 +5716,17 @@ class HistogramTypeJni { case ROCKSDB_NAMESPACE::Histograms:: FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS: return 0x41; + case ROCKSDB_NAMESPACE::Histograms::SST_WRITE_MICROS: + return 0x42; + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_FLUSH_MICROS: + return 0x43; + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_COMPACTION_MICROS: + return 0x44; + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_DB_OPEN_MICROS: + return 0x45; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. 
return 0x1F; - default: // undefined/default return 0x0; @@ -5853,6 +5860,14 @@ class HistogramTypeJni { case 0x41: return ROCKSDB_NAMESPACE::Histograms:: FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS; + case 0x42: + return ROCKSDB_NAMESPACE::Histograms::SST_WRITE_MICROS; + case 0x43: + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_FLUSH_MICROS; + case 0x44: + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_COMPACTION_MICROS; + case 0x45: + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_DB_OPEN_MICROS; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 41fe241ad3..aff5cad141 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -185,6 +185,14 @@ public enum HistogramType { FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS((byte) 0x41), + SST_WRITE_MICROS((byte) 0x42), + + FILE_WRITE_FLUSH_MICROS((byte) 0x43), + + FILE_WRITE_COMPACTION_MICROS((byte) 0x44), + + FILE_WRITE_DB_OPEN_MICROS((byte) 0x45), + // 0x1F for backwards compatibility on current minor version. 
HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/logging/env_logger.h b/logging/env_logger.h index fc9b245504..b236dc817c 100644 --- a/logging/env_logger.h +++ b/logging/env_logger.h @@ -75,7 +75,7 @@ class EnvLogger : public Logger { mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush().PermitUncheckedError(); + file_.Flush(IOOptions()).PermitUncheckedError(); file_.reset_seen_error(); } last_flush_micros_ = clock_->NowMicros(); @@ -93,7 +93,7 @@ class EnvLogger : public Logger { Status CloseHelper() { FileOpGuard guard(*this); - const auto close_status = file_.Close(); + const auto close_status = file_.Close(IOOptions()); if (close_status.ok()) { return close_status; @@ -162,7 +162,7 @@ class EnvLogger : public Logger { { FileOpGuard guard(*this); // We will ignore any error returned by Append(). - file_.Append(Slice(base, p - base)).PermitUncheckedError(); + file_.Append(IOOptions(), Slice(base, p - base)).PermitUncheckedError(); file_.reset_seen_error(); flush_pending_ = true; const uint64_t now_micros = clock_->NowMicros(); diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc index 964fe536f1..8c077c55f0 100644 --- a/monitoring/persistent_stats_history.cc +++ b/monitoring/persistent_stats_history.cc @@ -41,6 +41,8 @@ Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, } else if (type == StatsVersionKeyType::kCompatibleVersion) { key = kCompatibleVersionKeyString; } + + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions options; options.verify_checksums = true; std::string result; @@ -122,6 +124,7 @@ void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time) { // try to find next entry in stats_history_ map if (db_impl_ != nullptr) { + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; Iterator* iter = db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); diff --git a/monitoring/statistics.cc 
b/monitoring/statistics.cc index cc679ec0a9..072083865e 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -303,6 +303,10 @@ const std::vector> HistogramsNameMap = { "rocksdb.file.read.verify.db.checksum.micros"}, {FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, "rocksdb.file.read.verify.file.checksums.micros"}, + {SST_WRITE_MICROS, "rocksdb.sst.write.micros"}, + {FILE_WRITE_FLUSH_MICROS, "rocksdb.file.write.flush.micros"}, + {FILE_WRITE_COMPACTION_MICROS, "rocksdb.file.write.compaction.micros"}, + {FILE_WRITE_DB_OPEN_MICROS, "rocksdb.file.write.db.open.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, {BYTES_PER_READ, "rocksdb.bytes.per.read"}, {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, diff --git a/options/options.cc b/options/options.cc index d96cf4072e..2e7c025030 100644 --- a/options/options.cc +++ b/options/options.cc @@ -703,4 +703,11 @@ ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache) ReadOptions::ReadOptions(Env::IOActivity _io_activity) : io_activity(_io_activity) {} +WriteOptions::WriteOptions(Env::IOActivity _io_activity) + : io_activity(_io_activity) {} + +WriteOptions::WriteOptions(Env::IOPriority _rate_limiter_priority, + Env::IOActivity _io_activity) + : rate_limiter_priority(_rate_limiter_priority), + io_activity(_io_activity) {} } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_parser.cc b/options/options_parser.cc index e2431016d1..ec32f76447 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -35,7 +35,8 @@ static const std::string option_file_header = "#\n" "\n"; -Status PersistRocksDBOptions(const DBOptions& db_opt, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs) { @@ -48,11 +49,12 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, if (db_opt.log_readahead_size > 0) { 
config_options.file_readahead_size = db_opt.log_readahead_size; } - return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts, - file_name, fs); + return PersistRocksDBOptions(write_options, config_options, db_opt, cf_names, + cf_opts, file_name, fs); } -Status PersistRocksDBOptions(const ConfigOptions& config_options_in, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const ConfigOptions& config_options_in, const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, @@ -79,62 +81,70 @@ Status PersistRocksDBOptions(const ConfigOptions& config_options_in, std::string options_file_content; - s = writable->Append( - option_file_header + "[" + opt_section_titles[kOptionSectionVersion] + - "]\n" - " rocksdb_version=" + - std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR) + - "." + std::to_string(ROCKSDB_PATCH) + "\n"); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); if (s.ok()) { - s = writable->Append( - " options_file_version=" + std::to_string(ROCKSDB_OPTION_FILE_MAJOR) + - "." + std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n"); + s = writable->Append(opts, option_file_header + "[" + + opt_section_titles[kOptionSectionVersion] + + "]\n" + " rocksdb_version=" + + std::to_string(ROCKSDB_MAJOR) + "." + + std::to_string(ROCKSDB_MINOR) + "." + + std::to_string(ROCKSDB_PATCH) + "\n"); } if (s.ok()) { - s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + - "]\n "); + s = writable->Append( + opts, + " options_file_version=" + std::to_string(ROCKSDB_OPTION_FILE_MAJOR) + + "." 
+ std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n"); + } + if (s.ok()) { + s = writable->Append( + opts, "\n[" + opt_section_titles[kOptionSectionDBOptions] + "]\n "); } if (s.ok()) { s = GetStringFromDBOptions(config_options, db_opt, &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) { // CFOptions section - s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + s = writable->Append( + opts, "\n[" + opt_section_titles[kOptionSectionCFOptions] + " \"" + + EscapeOptionString(cf_names[i]) + "\"]\n "); if (s.ok()) { s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i], &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } // TableOptions section auto* tf = cf_opts[i].table_factory.get(); if (tf != nullptr) { if (s.ok()) { s = writable->Append( - "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + opts, "[" + opt_section_titles[kOptionSectionTableOptions] + + tf->Name() + " \"" + EscapeOptionString(cf_names[i]) + + "\"]\n "); } if (s.ok()) { options_file_content.clear(); s = tf->GetOptionString(config_options, &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } } } if (s.ok()) { - s = writable->Sync(true /* use_fsync */); + s = writable->Sync(opts, true /* use_fsync */); } if (s.ok()) { - s = writable->Close(); + s = writable->Close(opts); } TEST_SYNC_POINT("PersistRocksDBOptions:written"); if (s.ok()) { @@ -733,4 +743,3 @@ Status RocksDBOptionsParser::VerifyTableFactory( return Status::OK(); } } // namespace ROCKSDB_NAMESPACE - diff --git 
a/options/options_parser.h b/options/options_parser.h index 4268051f34..e702c9f499 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -32,11 +32,13 @@ enum OptionSection : char { static const std::string opt_section_titles[] = { "Version", "DBOptions", "CFOptions", "TableOptions/", "Unknown"}; -Status PersistRocksDBOptions(const DBOptions& db_opt, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs); -Status PersistRocksDBOptions(const ConfigOptions& config_options, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const ConfigOptions& config_options, const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, diff --git a/options/options_test.cc b/options/options_test.cc index 6420ebf465..8fa39fa2f0 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -3672,8 +3672,8 @@ TEST_F(OptionsParserTest, Readahead) { std::vector cf_names = {"default", one_mb_string}; const std::string kOptionsFileName = "test-persisted-options.ini"; - ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), base_db_opt, cf_names, + base_cf_opts, kOptionsFileName, fs_.get())); uint64_t file_size = 0; ASSERT_OK( @@ -3747,8 +3747,8 @@ TEST_F(OptionsParserTest, DumpAndParse) { const std::string kOptionsFileName = "test-persisted-options.ini"; // Use default for escaped(true), unknown(false) and check (exact) ConfigOptions config_options; - ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), base_db_opt, cf_names, + base_cf_opts, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; ASSERT_OK(parser.Parse(config_options, kOptionsFileName, fs_.get())); @@ -3808,9 +3808,9 @@ 
TEST_F(OptionsParserTest, DifferentDefault) { ColumnFamilyOptions cf_univ_opts; cf_univ_opts.OptimizeUniversalStyleCompaction(); - ASSERT_OK(PersistRocksDBOptions(DBOptions(), {"default", "universal"}, - {cf_level_opts, cf_univ_opts}, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions( + WriteOptions(), DBOptions(), {"default", "universal"}, + {cf_level_opts, cf_univ_opts}, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, @@ -3953,8 +3953,8 @@ class OptionsSanityCheckTest : public OptionsParserTest, if (!s.ok()) { return s; } - return PersistRocksDBOptions(db_opts, {"default"}, {cf_opts}, - kOptionsFileName, fs_.get()); + return PersistRocksDBOptions(WriteOptions(), db_opts, {"default"}, + {cf_opts}, kOptionsFileName, fs_.get()); } Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 7b8bd0275c..6bd28804c7 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -264,6 +264,7 @@ struct BlockBasedTableBuilder::Rep { // BEGIN from MutableCFOptions std::shared_ptr prefix_extractor; // END from MutableCFOptions + const WriteOptions write_options; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; // Size in bytes for the user-defined timestamps. 
@@ -439,6 +440,7 @@ struct BlockBasedTableBuilder::Rep { WritableFileWriter* f) : ioptions(tbo.ioptions), prefix_extractor(tbo.moptions.prefix_extractor), + write_options(tbo.write_options), table_options(table_opt), internal_comparator(tbo.internal_comparator), ts_sz(tbo.internal_comparator.user_comparator()->timestamp_size()), @@ -1317,6 +1319,13 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( // checksum: uint32 Rep* r = rep_; bool is_data_block = block_type == BlockType::kData; + IOOptions io_options; + IOStatus io_s = + WritableFileWriter::PrepareIOOptions(r->write_options, io_options); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } // Old, misleading name of this function: WriteRawBlock StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); const uint64_t offset = r->get_offset(); @@ -1330,7 +1339,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( } { - IOStatus io_s = r->file->Append(block_contents); + io_s = r->file->Append(io_options, block_contents); if (!io_s.ok()) { r->SetIOStatus(io_s); return; @@ -1357,7 +1366,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum", trailer.data()); { - IOStatus io_s = r->file->Append(Slice(trailer.data(), trailer.size())); + io_s = r->file->Append(io_options, Slice(trailer.data(), trailer.size())); if (!io_s.ok()) { r->SetIOStatus(io_s); return; @@ -1394,7 +1403,8 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( (r->alignment - ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & (r->alignment - 1); - IOStatus io_s = r->file->Pad(pad_bytes); + + io_s = r->file->Pad(io_options, pad_bytes); if (io_s.ok()) { r->set_offset(r->get_offset() + pad_bytes); } else { @@ -1800,7 +1810,14 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, r->SetStatus(s); return; } - IOStatus ios = r->file->Append(footer.GetSlice()); + IOOptions io_options; + 
IOStatus ios = + WritableFileWriter::PrepareIOOptions(r->write_options, io_options); + if (!ios.ok()) { + r->SetIOStatus(ios); + return; + } + ios = r->file->Append(io_options, footer.GetSlice()); if (ios.ok()) { r->set_offset(r->get_offset() + footer.GetSlice().size()); } else { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a184264df1..4de9eba23e 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2922,7 +2922,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions ro; Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); @@ -3027,7 +3027,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << "Index Details:\n" "--------------------------------------\n"; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr> blockhandles_iter( NewIndexIterator(read_options, /*need_upper_bound_check=*/false, @@ -3078,7 +3078,7 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { } Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr> blockhandles_iter( NewIndexIterator(read_options, /*need_upper_bound_check=*/false, diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index 254546893f..7255fae7e1 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ 
-19,6 +19,7 @@ #include "rocksdb/compression_type.h" #include "rocksdb/db.h" #include "rocksdb/file_system.h" +#include "rocksdb/options.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/partitioned_index_iterator.h" @@ -133,11 +134,13 @@ class BlockBasedTableReaderBaseTest : public testing::Test { compression_opts.max_dict_bytes = compression_dict_bytes; compression_opts.max_dict_buffer_bytes = compression_dict_bytes; IntTblPropCollectorFactories factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder( options_.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, compression_opts, - 0 /* column_family_id */, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + comparator, &factories, compression_type, + compression_opts, 0 /* column_family_id */, kDefaultColumnFamilyName, -1 /* level */), writer.get())); diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 2841b271de..b4ccfce44b 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -553,9 +553,11 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, std::unique_ptr builder; IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions( - ioptions, moptions, internal_comparator, + ioptions, moptions, read_options, write_options, internal_comparator, &int_tbl_prop_collector_factories, options.compression, CompressionOptions(), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, @@ -567,7 +569,7 @@ void TestBoundary(InternalKey& ik1, std::string& v1, 
InternalKey& ik2, EXPECT_TRUE(builder->status().ok()); Status s = builder->Finish(); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(sink->contents().size(), builder->FileSize()); diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 61e444e928..95a0255743 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -77,11 +77,13 @@ class BlockFetcherTest : public testing::Test { ColumnFamilyOptions cf_options(options_); MutableCFOptions moptions(cf_options); IntTblPropCollectorFactories factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder(table_factory_.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, CompressionOptions(), - 0 /* column_family_id */, kDefaultColumnFamilyName, - -1 /* level */), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + comparator, &factories, compression_type, + CompressionOptions(), 0 /* column_family_id */, + kDefaultColumnFamilyName, -1 /* level */), writer.get())); // Build table. diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index b0596edac0..16e7f46e88 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -318,15 +318,16 @@ Status CuckooTableBuilder::Finish() { unused_bucket.resize(static_cast(bucket_size), 'a'); // Write the table. 
uint32_t num_added = 0; + const IOOptions opts; for (auto& bucket : buckets) { if (bucket.vector_idx == kMaxVectorIdx) { - io_status_ = file_->Append(Slice(unused_bucket)); + io_status_ = file_->Append(opts, Slice(unused_bucket)); } else { ++num_added; - io_status_ = file_->Append(GetKey(bucket.vector_idx)); + io_status_ = file_->Append(opts, GetKey(bucket.vector_idx)); if (io_status_.ok()) { if (value_size_ > 0) { - io_status_ = file_->Append(GetValue(bucket.vector_idx)); + io_status_ = file_->Append(opts, GetValue(bucket.vector_idx)); } } } @@ -382,7 +383,7 @@ Status CuckooTableBuilder::Finish() { BlockHandle property_block_handle; property_block_handle.set_offset(offset); property_block_handle.set_size(property_block.size()); - io_status_ = file_->Append(property_block); + io_status_ = file_->Append(opts, property_block); offset += property_block.size(); if (!io_status_.ok()) { status_ = io_status_; @@ -395,7 +396,7 @@ Status CuckooTableBuilder::Finish() { BlockHandle meta_index_block_handle; meta_index_block_handle.set_offset(offset); meta_index_block_handle.set_size(meta_index_block.size()); - io_status_ = file_->Append(meta_index_block); + io_status_ = file_->Append(opts, meta_index_block); if (!io_status_.ok()) { status_ = io_status_; return status_; @@ -408,7 +409,7 @@ Status CuckooTableBuilder::Finish() { status_ = s; return status_; } - io_status_ = file_->Append(footer.GetSlice()); + io_status_ = file_->Append(opts, footer.GetSlice()); status_ = io_status_; return status_; } diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 1a0d58c76d..967e8e2db7 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -182,7 +182,7 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); 
CheckFileContents({}, {}, {}, "", 2, 2, false); } @@ -229,7 +229,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -277,7 +277,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -325,7 +325,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -374,7 +374,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -420,7 +420,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); 
ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -463,7 +463,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -507,7 +507,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -550,7 +550,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -589,7 +589,7 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { ASSERT_OK(builder.status()); } ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { @@ -619,7 +619,7 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { 
ASSERT_OK(builder.status()); ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 54ae6266ef..d74a0b041f 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -59,7 +59,7 @@ CuckooTableReader::CuckooTableReader( } { std::unique_ptr props; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index d829b36304..25e2c1bca4 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -104,7 +104,7 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(builder.Finish()); ASSERT_EQ(num_items, builder.NumEntries()); file_size = builder.FileSize(); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); // Check reader now. std::unique_ptr file_reader; @@ -431,7 +431,7 @@ void WriteFile(const std::vector& keys, const uint64_t num, } ASSERT_OK(builder.Finish()); ASSERT_EQ(num, builder.NumEntries()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); uint64_t file_size; ASSERT_OK( @@ -571,4 +571,3 @@ int main(int argc, char** argv) { } #endif // GFLAGS. 
- diff --git a/table/mock_table.cc b/table/mock_table.cc index 1971c00fc5..fe3bd854c6 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -298,7 +298,7 @@ Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, *next_id = next_id_.fetch_add(1); char buf[4]; EncodeFixed32(buf, *next_id); - return file->Append(Slice(buf, 4)); + return file->Append(IOOptions(), Slice(buf, 4)); } Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index 1e61773d65..32f53be49a 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -39,7 +39,7 @@ IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); - IOStatus io_s = file->Append(block_contents); + IOStatus io_s = file->Append(IOOptions(), block_contents); if (io_s.ok()) { *offset += block_contents.size(); @@ -138,6 +138,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // temp buffer for metadata bytes between key and value. 
char meta_bytes_buf[6]; size_t meta_bytes_buf_size = 0; + const IOOptions opts; ParsedInternalKey internal_key; if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) @@ -178,12 +179,13 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); meta_bytes_buf_size = end_ptr - meta_bytes_buf; - io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + io_status_ = + file_->Append(opts, Slice(meta_bytes_buf, meta_bytes_buf_size)); } // Write value if (io_status_.ok()) { - io_status_ = file_->Append(value); + io_status_ = file_->Append(opts, value); offset_ += value_size + meta_bytes_buf_size; } @@ -306,7 +308,7 @@ Status PlainTableBuilder::Finish() { status_ = s; return status_; } - io_status_ = file_->Append(footer.GetSlice()); + io_status_ = file_->Append(IOOptions(), footer.GetSlice()); if (io_status_.ok()) { offset_ += footer.GetSlice().size(); } diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 0ac4231910..102a16a6b3 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -94,6 +94,8 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, Slice key_to_write = key; // Portion of internal key to write out. 
uint32_t user_key_size = static_cast(key.size() - 8); + const IOOptions opts; + if (encoding_type_ == kPlain) { if (fixed_user_key_len_ == kPlainTableVariableLength) { // Write key length @@ -101,7 +103,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, char* ptr = EncodeVarint32(key_size_buf, user_key_size); assert(ptr <= key_size_buf + sizeof(key_size_buf)); auto len = ptr - key_size_buf; - IOStatus io_s = file->Append(Slice(key_size_buf, len)); + IOStatus io_s = file->Append(opts, Slice(key_size_buf, len)); if (!io_s.ok()) { return io_s; } @@ -119,7 +121,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, key_count_for_prefix_ = 1; pre_prefix_.SetUserKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); - IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + IOStatus io_s = file->Append(opts, Slice(size_bytes, size_bytes_pos)); if (!io_s.ok()) { return io_s; } @@ -137,7 +139,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, static_cast(pre_prefix_.GetUserKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); - IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + IOStatus io_s = file->Append(opts, Slice(size_bytes, size_bytes_pos)); if (!io_s.ok()) { return io_s; } @@ -152,7 +154,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, // in this buffer to safe one file append call, which takes 1 byte. 
if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { IOStatus io_s = - file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); + file->Append(opts, Slice(key_to_write.data(), key_to_write.size() - 8)); if (!io_s.ok()) { return io_s; } @@ -160,7 +162,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; *meta_bytes_buf_size += 1; } else { - IOStatus io_s = file->Append(key_to_write); + IOStatus io_s = file->Append(opts, key_to_write); if (!io_s.ok()) { return io_s; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index b917fce342..89b1853ce3 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -126,7 +126,7 @@ Status PlainTableReader::Open( } std::unique_ptr props; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, ioptions, read_options, &props); @@ -300,7 +300,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents index_block_contents; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 821fff5b30..3972cdfb37 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -58,6 +58,7 @@ SstFileDumper::SstFileDumper(const Options& options, options_(options), ioptions_(options_), moptions_(ColumnFamilyOptions(options_)), + // TODO: plumb Env::IOActivity, Env::IOPriority read_options_(verify_checksum, false), internal_comparator_(BytewiseComparator()) { read_options_.readahead_size = readahead_size; @@ -303,14 +304,18 @@ Status SstFileDumper::ShowCompressionSize( const ImmutableOptions imoptions(opts); const 
ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_options; ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); IntTblPropCollectorFactories block_based_table_factories; std::string column_family_name; int unknown_level = -1; + TableBuilderOptions tb_opts( - imoptions, moptions, ikc, &block_based_table_factories, compress_type, - compress_opt, + imoptions, moptions, read_options, write_options, ikc, + &block_based_table_factories, compress_type, compress_opt, TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level); uint64_t num_data_blocks = 0; @@ -375,10 +380,8 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size, FilePrefetchBuffer* prefetch_buffer) { - // TODO: plumb Env::IOActivity - const ReadOptions read_options; Status s = ROCKSDB_NAMESPACE::ReadTableProperties( - file, file_size, table_magic_number, ioptions_, read_options, + file, file_size, table_magic_number, ioptions_, read_options_, &table_properties_, /* memory_allocator= */ nullptr, prefetch_buffer); if (!s.ok()) { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 1ef0f98aa9..2d63bad3ae 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -41,7 +41,11 @@ struct SstFileWriter::Rep { cfh(_cfh), invalidate_page_cache(_invalidate_page_cache), skip_filters(_skip_filters), - db_session_id(_db_session_id) {} + db_session_id(_db_session_id) { + // TODO (hx235): pass in `WriteOptions` instead of `rate_limiter_priority` + // during construction + write_options.rate_limiter_priority = io_priority; + } std::unique_ptr file_writer; std::unique_ptr builder; @@ -49,6 +53,7 @@ struct SstFileWriter::Rep { ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; + WriteOptions write_options; 
InternalKeyComparator internal_comparator; ExternalSstFileInfo file_info; InternalKey ikey; @@ -343,13 +348,15 @@ Status SstFileWriter::Open(const std::string& file_path) { // TODO: it would be better to set oldest_key_time to be used for getting the // approximate time of ingested keys. + // TODO: plumb Env::IOActivity, Env::IOPriority TableBuilderOptions table_builder_options( - r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, compression_opts, - cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, - TableFileCreationReason::kMisc, 0 /* oldest_key_time */, - 0 /* file_creation_time */, "SST Writer" /* db_id */, r->db_session_id, - 0 /* target_file_size */, r->next_file_number); + r->ioptions, r->mutable_cf_options, ReadOptions(), r->write_options, + r->internal_comparator, &int_tbl_prop_collector_factories, + compression_type, compression_opts, cf_id, r->column_family_name, + unknown_level, false /* is_bottommost */, TableFileCreationReason::kMisc, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, r->db_session_id, 0 /* target_file_size */, + r->next_file_number); // External SST files used to each get a unique session id. 
Now for // slightly better uniqueness probability in constructing cache keys, we // assign fake file numbers to each file (into table properties) and keep @@ -361,8 +368,8 @@ Status SstFileWriter::Open(const std::string& file_path) { FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; r->file_writer.reset(new WritableFileWriter( std::move(sst_file), file_path, r->env_options, r->ioptions.clock, - nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, - r->ioptions.file_checksum_gen_factory.get(), + nullptr /* io_tracer */, r->ioptions.stats, Histograms::SST_WRITE_MICROS, + r->ioptions.listeners, r->ioptions.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); // TODO(tec) : If table_factory is using compressed block cache, we will @@ -430,11 +437,13 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { Status s = r->builder->Finish(); r->file_info.file_size = r->builder->FileSize(); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(r->write_options, opts); if (s.ok()) { - s = r->file_writer->Sync(r->ioptions.use_fsync); + s = r->file_writer->Sync(opts, r->ioptions.use_fsync); r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); if (s.ok()) { - s = r->file_writer->Close(); + s = r->file_writer->Close(opts); } } if (s.ok()) { diff --git a/table/table_builder.h b/table/table_builder.h index d6f0e1a03c..c01d03cb2c 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -102,6 +102,7 @@ struct TableReaderOptions { struct TableBuilderOptions { TableBuilderOptions( const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const ReadOptions& _read_options, const WriteOptions& _write_options, const InternalKeyComparator& _internal_comparator, const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, CompressionType _compression_type, @@ -115,6 +116,8 @@ struct TableBuilderOptions { const uint64_t _target_file_size = 0, const uint64_t _cur_file_num 
= 0) : ioptions(_ioptions), moptions(_moptions), + read_options(_read_options), + write_options(_write_options), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), @@ -133,6 +136,8 @@ struct TableBuilderOptions { const ImmutableOptions& ioptions; const MutableCFOptions& moptions; + const ReadOptions& read_options; + const WriteOptions& write_options; const InternalKeyComparator& internal_comparator; const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; const CompressionType compression_type; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 60c84d7bf0..9b24e3c433 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -98,11 +98,13 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, IntTblPropCollectorFactories int_tbl_prop_collector_factories; int unknown_level = -1; + const WriteOptions write_options; tb = opts.table_factory->NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, CompressionOptions(), - 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + ikc, &int_tbl_prop_collector_factories, + CompressionType::kNoCompression, + CompressionOptions(), 0 /* column_family_id */, + kDefaultColumnFamilyName, unknown_level), file_writer.get()); } else { s = DB::Open(opts, dbname, &db); @@ -122,7 +124,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } if (!through_db) { tb->Finish(); - file_writer->Close(); + file_writer->Close(IOOptions()); } else { db->Flush(FlushOptions()); } diff --git a/table/table_test.cc b/table/table_test.cc index 298e25fbd5..15b07854ee 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -383,8 +383,11 @@ class TableConstructor : public Constructor { } std::string 
column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + internal_comparator, &int_tbl_prop_collector_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level_), @@ -402,7 +405,7 @@ class TableConstructor : public Constructor { EXPECT_OK(builder->status()); } Status s = builder->Finish(); - EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(file_writer_->Flush(IOOptions())); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); @@ -1309,7 +1312,7 @@ class FileChecksumTestHelper { EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(file_writer_->Flush(IOOptions())); EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); @@ -1317,7 +1320,7 @@ class FileChecksumTestHelper { } std::string GetFileChecksum() { - EXPECT_OK(file_writer_->Close()); + EXPECT_OK(file_writer_->Close(IOOptions())); return table_builder_->GetFileChecksum(); } @@ -4466,9 +4469,11 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) { FileChecksumTestHelper f(true); f.CreateWritableFile(); std::unique_ptr builder; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, *comparator, - &int_tbl_prop_collector_factories, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + *comparator, &int_tbl_prop_collector_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); @@ -4502,9 +4507,11 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { f.CreateWritableFile(); 
f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, *comparator, - &int_tbl_prop_collector_factories, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + *comparator, &int_tbl_prop_collector_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); @@ -4548,8 +4555,10 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), @@ -4562,7 +4571,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); test::StringSink* ss = static_cast(file_writer->writable_file()); @@ -4572,7 +4581,6 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; - const ReadOptions read_options; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), kPlainTableMagicNumber, ioptions, read_options, &props); @@ -4602,9 +4610,10 @@ TEST_F(PlainTableTest, NoFileChecksum) { int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWritableFile(); - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, 
moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), @@ -4642,9 +4651,10 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { FileChecksumTestHelper f(true); f.CreateWritableFile(); f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), @@ -5252,8 +5262,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), @@ -5267,7 +5279,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -5282,7 +5294,6 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); std::unique_ptr props; - const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, read_options, &props)); @@ -5306,7 +5317,6 @@ 
TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; - const ReadOptions read_options; std::function GetTableInternalIter = [&]() { std::unique_ptr source( new test::StringSource(ss_rw.contents(), 73342, true)); @@ -5434,8 +5444,10 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { InternalKeyComparator ikc(options.comparator); IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), @@ -5451,7 +5463,7 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); std::unique_ptr source( new test::StringSource(sink->contents(), 73342, false)); @@ -5460,7 +5472,6 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { std::unique_ptr props; - const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, read_options, &props)); @@ -5488,7 +5499,6 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { 0 /* block_protection_bytes_per_key */), std::move(file_reader), sink->contents().size(), &table_reader)); - ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); @@ -5526,9 +5536,10 @@ 
TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { InternalKeyComparator ikc(options.comparator); IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), @@ -5544,7 +5555,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); std::unique_ptr source( new test::StringSource(sink->contents(), 73342, true)); @@ -5556,20 +5567,19 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { uint64_t file_size = sink->contents().size(); Footer footer; - IOOptions opts; - ASSERT_OK(ReadFooterFromFile(opts, file, *FileSystem::Default(), + ASSERT_OK(ReadFooterFromFile(IOOptions(), file, *FileSystem::Default(), nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { - ReadOptions read_options; - read_options.verify_checksums = false; + ReadOptions read_options_for_helper; + read_options_for_helper.verify_checksums = false; PersistentCacheOptions cache_options; BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, handle, - contents, ioptions, false /* decompress */, + file, nullptr /* prefetch_buffer */, footer, read_options_for_helper, + handle, contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options); @@ -6117,12 +6127,15 @@ 
TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { InternalKeyComparator ikc(options.comparator); IntTblPropCollectorFactories int_tbl_prop_collector_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder( options.table_factory->NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kSnappyCompression, options.compression_opts, - kUnknownColumnFamily, "test_cf", -1 /* level */), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + ikc, &int_tbl_prop_collector_factories, + kSnappyCompression, options.compression_opts, + kUnknownColumnFamily, "test_cf", + -1 /* level */), file_writer.get())); std::string key1 = "key1"; @@ -6193,8 +6206,10 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, InternalKeyComparator ikc(options.comparator); IntTblPropCollectorFactories int_tbl_prop_collector_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kSnappyCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */), @@ -6278,8 +6293,10 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) { InternalKeyComparator ikc(options.comparator); IntTblPropCollectorFactories int_tbl_prop_collector_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &int_tbl_prop_collector_factories, kSnappyCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */), diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 
ce221e79bc..b4939ee038 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -463,15 +463,16 @@ bool IsPrefetchSupported(const std::shared_ptr& fs, Random rnd(301); std::string test_string = rnd.RandomString(4096); Slice data(test_string); - Status s = WriteStringToFile(fs.get(), data, tmp, true); + IOOptions opts; + Status s = WriteStringToFile(fs.get(), data, tmp, true, opts); if (s.ok()) { std::unique_ptr file; auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); if (io_s.ok()) { - supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) - .IsNotSupported()); + supported = + !(file->Prefetch(0, data.size(), opts, nullptr).IsNotSupported()); } - s = fs->DeleteFile(tmp, IOOptions(), nullptr); + s = fs->DeleteFile(tmp, opts, nullptr); } return s.ok() && supported; } @@ -521,7 +522,7 @@ Status CorruptFile(Env* env, const std::string& fname, int offset, for (int i = 0; i < bytes_to_corrupt; i++) { contents[i + offset] ^= 0x80; } - s = WriteStringToFile(env, contents, fname); + s = WriteStringToFile(env, contents, fname, false /* should_sync */); } if (s.ok() && verify_checksum) { Options options; @@ -544,7 +545,7 @@ Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { s = ReadFileToString(env, fname, &contents); if (s.ok()) { contents.resize(static_cast(new_length), 'b'); - s = WriteStringToFile(env, contents, fname); + s = WriteStringToFile(env, contents, fname, false /* should_sync */); } return s; } diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index a30c650654..1668dfb883 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -130,7 +130,7 @@ namespace {} // namespace TEST_F(DBBenchTest, OptionsFile) { const std::string kOptionsFileName = test_path_ + "/OPTIONS_test"; Options opt = GetDefaultOptions(); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, 
{ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); @@ -149,7 +149,7 @@ TEST_F(DBBenchTest, OptionsFileUniversal) { Options opt = GetDefaultOptions(kCompactionStyleUniversal, 1); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, {ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); @@ -166,7 +166,7 @@ TEST_F(DBBenchTest, OptionsFileMultiLevelUniversal) { Options opt = GetDefaultOptions(kCompactionStyleUniversal, 12); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, {ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 81e946bbce..90b7886eea 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -4376,8 +4376,10 @@ UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( } void UnsafeRemoveSstFileCommand::DoCommand() { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + PrepareOptions(); OfflineManifestWriter w(options_, db_path_); @@ -4402,7 +4404,7 @@ void UnsafeRemoveSstFileCommand::DoCommand() { s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, write_options, cfd, &edit, db_dir.get()); } } diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index 2b9aa0950f..7d9b9dc6e2 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -86,7 +86,9 @@ SimulatedHybridFileSystem::~SimulatedHybridFileSystem() { metadata += f; metadata += "\n"; } - IOStatus s = WriteStringToFile(target(), metadata, metadata_file_name_, true); + IOOptions 
opts; + IOStatus s = + WriteStringToFile(target(), metadata, metadata_file_name_, true, opts); if (!s.ok()) { fprintf(stderr, "Error writing to file %s: %s", metadata_file_name_.c_str(), s.ToString().c_str()); @@ -240,4 +242,3 @@ IOStatus SimulatedWritableFile::Sync(const IOOptions& options, return target()->Sync(options, dbg); } } // namespace ROCKSDB_NAMESPACE - diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index 07c42b6e74..2ebdad1ee0 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -123,10 +123,12 @@ class SSTDumpToolTest : public testing::Test { std::string column_family_name; int unknown_level = -1; + const WriteOptions write_options; tb.reset(opts.table_factory->NewTableBuilder( TableBuilderOptions( - imoptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, CompressionOptions(), + imoptions, moptions, read_options, write_options, ikc, + &int_tbl_prop_collector_factories, CompressionType::kNoCompression, + CompressionOptions(), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level), file_writer.get())); @@ -160,7 +162,7 @@ class SSTDumpToolTest : public testing::Test { } } ASSERT_OK(tb->Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } protected: @@ -417,9 +419,9 @@ TEST_F(SSTDumpToolTest, ValidSSTPath) { std::string sst_file = MakeFilePath("rocksdb_sst_test.sst"); createSST(opts, sst_file); std::string text_file = MakeFilePath("text_file"); - ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file)); + ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file, false)); std::string fake_sst = MakeFilePath("fake_sst.sst"); - ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst)); + ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst, false)); for (const auto& command_arg : {"--command=verify", "--command=identify"}) { snprintf(usage[1], kOptLength, "%s", 
command_arg); diff --git a/unreleased_history/behavior_changes/blob_file_write_micros.md b/unreleased_history/behavior_changes/blob_file_write_micros.md new file mode 100644 index 0000000000..aceb059200 --- /dev/null +++ b/unreleased_history/behavior_changes/blob_file_write_micros.md @@ -0,0 +1 @@ +`rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explicitly flushing the blob file. diff --git a/unreleased_history/bug_fixes/blob_tickers.md b/unreleased_history/bug_fixes/blob_tickers.md new file mode 100644 index 0000000000..6858959443 --- /dev/null +++ b/unreleased_history/bug_fixes/blob_tickers.md @@ -0,0 +1 @@ +Fix bugs where `rocksdb.blobdb.blob.file.synced` includes blob files that failed to get synced and `rocksdb.blobdb.blob.file.bytes.written` includes blob bytes that failed to get written. diff --git a/unreleased_history/new_features/sst_write_micros_file_write_stats_break_down.md b/unreleased_history/new_features/sst_write_micros_file_write_stats_break_down.md new file mode 100644 index 0000000000..6e86998675 --- /dev/null +++ b/unreleased_history/new_features/sst_write_micros_file_write_stats_break_down.md @@ -0,0 +1 @@ +Add new statistics: `rocksdb.sst.write.micros` measures time of each write to SST file; `rocksdb.file.write.{flush|compaction|db.open}.micros` measure time of each write to SST table (currently only block-based table format) and blob file for flush, compaction and db open.
diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index 59da96fa8d..3e57bc78af 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -98,7 +98,7 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return Status::InvalidArgument("checksum_list is nullptr"); } assert(checksum_list); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; checksum_list->reset(); Status s; diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 036c030dc1..dfdd7d32c7 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -113,16 +113,16 @@ TEST_F(WritableFileWriterTest, RangeSync) { for (int i = 0; i < 1000; i++) { int skew_limit = (i < 700) ? 10 : 15; uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); - s = writer->Append(Slice(large_buf.get(), num)); + s = writer->Append(IOOptions(), Slice(large_buf.get(), num)); ASSERT_OK(s); // Flush in a chance of 1/10. if (r.Uniform(10) == 0) { - s = writer->Flush(); + s = writer->Flush(IOOptions()); ASSERT_OK(s); } } - s = writer->Close(); + s = writer->Close(IOOptions()); ASSERT_OK(s); } @@ -215,16 +215,16 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { for (int i = 0; i < 20; i++) { uint32_t num = r.Skewed(16) * 100 + r.Uniform(100); std::string random_string = r.RandomString(num); - ASSERT_OK(writer->Append(Slice(random_string.c_str(), num))); + ASSERT_OK(writer->Append(IOOptions(), Slice(random_string.c_str(), num))); target.append(random_string.c_str(), num); // In some attempts, flush in a chance of 1/10. 
if (!no_flush && r.Uniform(10) == 0) { - ASSERT_OK(writer->Flush()); + ASSERT_OK(writer->Flush(IOOptions())); } } - ASSERT_OK(writer->Flush()); - ASSERT_OK(writer->Close()); + ASSERT_OK(writer->Flush(IOOptions())); + ASSERT_OK(writer->Close(IOOptions())); ASSERT_EQ(target.size(), actual.size()); ASSERT_EQ(target, actual); } @@ -272,27 +272,28 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksum) { ImmutableOptions ioptions(options); file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + true)); Random rnd(301); std::string data = rnd.RandomString(1000); uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); - - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush(IOOptions())); Random size_r(47); for (int i = 0; i < 2000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); Destroy(options); } @@ -314,27 +315,29 @@ TEST_F(DBWritableFileWriterTest, AppendVerifyNoChecksum) { 
// So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, false)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + false)); Random rnd(301); std::string data = rnd.RandomString(1000); uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush(IOOptions())); Random size_r(47); for (int i = 0; i < 1000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); Destroy(options); } @@ -357,8 +360,9 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { // So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, 
ioptions.file_checksum_gen_factory.get(), true, + true)); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); Random rnd(301); @@ -370,17 +374,18 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { for (int i = 0; i < 100; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); bytes_written += static_cast(data.size()); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); bytes_written += static_cast(data.size()); } uint64_t elapsed = fault_env_->NowMicros() - start; double raw_rate = bytes_written * 1000000.0 / elapsed; - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); // Set the rate-limiter FileOptions file_options1 = FileOptions(); @@ -397,19 +402,21 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { // So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options1, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + true)); for (int i = 0; i < 1000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = 
rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); if (file_options1.rate_limiter != nullptr) { delete file_options1.rate_limiter; } @@ -465,12 +472,12 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { std::unique_ptr writer( new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); - ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); + ASSERT_OK(writer->Append(IOOptions(), std::string(2 * kMb, 'a'))); // Next call to WritableFile::Append() should fail FakeWF* fwf = static_cast(writer->writable_file()); fwf->SetIOError(true); - ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); + ASSERT_NOK(writer->Append(IOOptions(), std::string(2 * kMb, 'b'))); } class ReadaheadRandomAccessFileTest @@ -498,9 +505,9 @@ class ReadaheadRandomAccessFileTest new test::StringSink(&control_contents_)); std::unique_ptr write_holder(new WritableFileWriter( std::move(sink), "" /* don't care */, FileOptions())); - Status s = write_holder->Append(Slice(str)); + Status s = write_holder->Append(IOOptions(), Slice(str)); EXPECT_OK(s); - s = write_holder->Flush(); + s = write_holder->Flush(IOOptions()); EXPECT_OK(s); std::unique_ptr read_holder( new test::StringSource(control_contents_)); @@ -878,26 +885,27 @@ TEST_F(DBWritableFileWriterTest, IOErrorNotification) { file_writer.reset(new WritableFileWriter( std::move(writable_file_ptr), fname, file_options, - SystemClock::Default().get(), nullptr, ioptions.stats, ioptions.listeners, + SystemClock::Default().get(), nullptr, ioptions.stats, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, true)); FakeWF* fwf = static_cast(file_writer->writable_file()); 
fwf->SetIOError(true); - ASSERT_NOK(file_writer->Append(std::string(2 * kMb, 'a'))); + ASSERT_NOK(file_writer->Append(IOOptions(), std::string(2 * kMb, 'a'))); fwf->CheckCounters(1, 0); ASSERT_EQ(listener->NotifyErrorCount(), 1); file_writer->reset_seen_error(); fwf->SetIOError(true); - ASSERT_NOK(file_writer->Flush()); + ASSERT_NOK(file_writer->Flush(IOOptions())); fwf->CheckCounters(1, 1); ASSERT_EQ(listener->NotifyErrorCount(), 2); /* No error generation */ file_writer->reset_seen_error(); fwf->SetIOError(false); - ASSERT_OK(file_writer->Append(std::string(2 * kMb, 'b'))); + ASSERT_OK(file_writer->Append(IOOptions(), std::string(2 * kMb, 'b'))); ASSERT_EQ(listener->NotifyErrorCount(), 2); fwf->CheckCounters(1, 1); } @@ -1006,23 +1014,29 @@ class WritableFileWriterIOPriorityTest : public testing::Test { }; TEST_F(WritableFileWriterIOPriorityTest, Append) { - ASSERT_OK(writer_->Append(Slice("abc"))); + ASSERT_OK(writer_->Append(IOOptions(), Slice("abc"))); } -TEST_F(WritableFileWriterIOPriorityTest, Pad) { ASSERT_OK(writer_->Pad(500)); } +TEST_F(WritableFileWriterIOPriorityTest, Pad) { + ASSERT_OK(writer_->Pad(IOOptions(), 500)); +} -TEST_F(WritableFileWriterIOPriorityTest, Flush) { ASSERT_OK(writer_->Flush()); } +TEST_F(WritableFileWriterIOPriorityTest, Flush) { + ASSERT_OK(writer_->Flush(IOOptions())); +} -TEST_F(WritableFileWriterIOPriorityTest, Close) { ASSERT_OK(writer_->Close()); } +TEST_F(WritableFileWriterIOPriorityTest, Close) { + ASSERT_OK(writer_->Close(IOOptions())); +} TEST_F(WritableFileWriterIOPriorityTest, Sync) { - ASSERT_OK(writer_->Sync(false)); - ASSERT_OK(writer_->Sync(true)); + ASSERT_OK(writer_->Sync(IOOptions(), false)); + ASSERT_OK(writer_->Sync(IOOptions(), true)); } TEST_F(WritableFileWriterIOPriorityTest, SyncWithoutFlush) { - ASSERT_OK(writer_->SyncWithoutFlush(false)); - ASSERT_OK(writer_->SyncWithoutFlush(true)); + ASSERT_OK(writer_->SyncWithoutFlush(IOOptions(), false)); + ASSERT_OK(writer_->SyncWithoutFlush(IOOptions(), true)); } 
TEST_F(WritableFileWriterIOPriorityTest, BasicOp) { @@ -1037,16 +1051,16 @@ TEST_F(WritableFileWriterIOPriorityTest, BasicOp) { for (int i = 0; i < 1000; i++) { int skew_limit = (i < 700) ? 10 : 15; uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); - s = writer->Append(Slice(large_buf.get(), num)); + s = writer->Append(IOOptions(), Slice(large_buf.get(), num)); ASSERT_OK(s); // Flush in a chance of 1/10. if (r.Uniform(10) == 0) { - s = writer->Flush(); + s = writer->Flush(IOOptions()); ASSERT_OK(s); } } - s = writer->Close(); + s = writer->Close(IOOptions()); ASSERT_OK(s); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index c1637db15d..25602791ec 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -41,9 +41,9 @@ void RunBenchmark() { std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::unique_ptr writer; - writer.reset(new WritableFileWriter(std::move(file), file_name, env_options, - clock, nullptr /* stats */, - options.listeners)); + writer.reset(new WritableFileWriter( + std::move(file), file_name, env_options, clock, nullptr /* stats */, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, options.listeners)); std::string record; record.assign(FLAGS_record_size, 'X'); diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc index 31a7337315..e2f0d4a7b4 100644 --- a/utilities/backup/backup_engine.cc +++ b/utilities/backup/backup_engine.cc @@ -2195,6 +2195,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( rate_limiter ? 
static_cast(rate_limiter->GetSingleBurstBytes()) : kDefaultCopyFileBufferSize; + // TODO: pass in Histograms if the destination file is sst or blob std::unique_ptr dest_writer( new WritableFileWriter(std::move(dst_file), dst, dst_file_options)); std::unique_ptr src_reader; @@ -2209,6 +2210,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( } Slice data; + const IOOptions opts; do { if (stop_backup_.load(std::memory_order_acquire)) { return status_to_io_status(Status::Incomplete("Backup stopped")); @@ -2238,7 +2240,8 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( if (checksum_hex != nullptr) { checksum_value = crc32c::Extend(checksum_value, data.data(), data.size()); } - io_s = dest_writer->Append(data); + + io_s = dest_writer->Append(opts, data); if (rate_limiter != nullptr) { if (!src.empty()) { @@ -2275,10 +2278,10 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( } if (io_s.ok() && sync) { - io_s = dest_writer->Sync(false); + io_s = dest_writer->Sync(opts, false); } if (io_s.ok()) { - io_s = dest_writer->Close(); + io_s = dest_writer->Close(opts); } return io_s; } @@ -3352,4 +3355,3 @@ void TEST_SetDefaultRateLimitersClock( restore_rate_limiter_clock); } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index 5ed6ae8951..d768fbe97e 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -931,7 +931,7 @@ class BackupEngineTest : public testing::Test { } file_contents[0] = (file_contents[0] + 257) % 256; - return WriteStringToFile(test_db_env_.get(), file_contents, fname); + return WriteStringToFile(test_db_env_.get(), file_contents, fname, false); } void AssertDirectoryFilesMatchRegex(const std::string& dir, diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc index ddaa98c7d3..97543214db 100644 --- a/utilities/blob_db/blob_compaction_filter.cc +++ b/utilities/blob_db/blob_compaction_filter.cc @@ 
-181,7 +181,9 @@ bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const { BlobDBImpl* const blob_db_impl = context_.blob_db_impl; assert(blob_db_impl); + // TODO: plumb Env::IOActivity, Env::IOPriority const Status s = blob_db_impl->CreateBlobFileAndWriter( + WriteOptions(), /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_, &writer_); if (!s.ok()) { @@ -251,8 +253,9 @@ bool BlobIndexCompactionFilterBase::WriteBlobToNewFile( assert(writer_); uint64_t new_key_offset = 0; - const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset, - new_blob_offset); + // TODO: plumb Env::IOActivity, Env::IOPriority + const Status s = writer_->AddRecord(WriteOptions(), key, blob, kNoExpiration, + &new_key_offset, new_blob_offset); if (!s.ok()) { const BlobDBImpl* const blob_db_impl = context_.blob_db_impl; @@ -302,7 +305,8 @@ bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const { { WriteLock wl(&blob_db_impl->mutex_); - s = blob_db_impl->CloseBlobFile(blob_file_); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = blob_db_impl->CloseBlobFile(WriteOptions(), blob_file_); // Note: we delay registering the new blob file until it's closed to // prevent FIFO eviction from processing it during compaction/GC. 
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index e2f0b7bdbd..59242a645a 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -248,7 +248,7 @@ class BlobDB : public StackableDB { virtual BlobDBOptions GetBlobDBOptions() const = 0; - virtual Status SyncBlobFiles() = 0; + virtual Status SyncBlobFiles(const WriteOptions& write_options) = 0; virtual ~BlobDB() {} diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 2fa7ae898f..6fded84411 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -23,6 +23,7 @@ #include "logging/logging.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics_impl.h" +#include "monitoring/thread_status_util.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -106,6 +107,15 @@ BlobDBImpl::~BlobDBImpl() { } Status BlobDBImpl::Close() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + Status s = CloseImpl(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; +} + +Status BlobDBImpl::CloseImpl() { if (closed_) { return Status::OK(); } @@ -123,7 +133,8 @@ Status BlobDBImpl::Close() { return s; } - s = SyncBlobFiles(); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = SyncBlobFiles(WriteOptions()); return s; } @@ -277,7 +288,7 @@ Status BlobDBImpl::Open(std::vector* handles) { return s; } - UpdateLiveSSTSize(); + UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kDBOpen)); // Start background jobs. 
if (!bdb_options_.disable_background_tasks) { @@ -743,7 +754,9 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { } std::unique_ptr fwriter; - fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_)); + fwriter.reset(new WritableFileWriter( + std::move(wfile), fpath, file_options_, clock_, nullptr /* io_tracer */, + statistics_, Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS)); uint64_t boffset = bfile->GetFileSize(); if (debug_level_ >= 2 && boffset) { @@ -824,8 +837,9 @@ Status BlobDBImpl::CheckOrCreateWriterLocked( } Status BlobDBImpl::CreateBlobFileAndWriter( - bool has_ttl, const ExpirationRange& expiration_range, - const std::string& reason, std::shared_ptr* blob_file, + const WriteOptions& write_options, bool has_ttl, + const ExpirationRange& expiration_range, const std::string& reason, + std::shared_ptr* blob_file, std::shared_ptr* writer) { TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter"); assert(has_ttl == (expiration_range.first || expiration_range.second)); @@ -846,7 +860,7 @@ Status BlobDBImpl::CreateBlobFileAndWriter( assert(*writer); - s = (*writer)->WriteHeader((*blob_file)->header_); + s = (*writer)->WriteHeader(write_options, (*blob_file)->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to write header to new blob file: %s" @@ -861,7 +875,8 @@ Status BlobDBImpl::CreateBlobFileAndWriter( return s; } -Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { +Status BlobDBImpl::SelectBlobFile(const WriteOptions& write_options, + std::shared_ptr* blob_file) { assert(blob_file); { @@ -885,6 +900,7 @@ Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { std::shared_ptr writer; const Status s = CreateBlobFileAndWriter( + write_options, /* has_ttl */ false, ExpirationRange(), /* reason */ "SelectBlobFile", blob_file, &writer); if (!s.ok()) { @@ -897,7 +913,8 @@ Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { return s; } -Status 
BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, +Status BlobDBImpl::SelectBlobFileTTL(const WriteOptions& write_options, + uint64_t expiration, std::shared_ptr* blob_file) { assert(blob_file); assert(expiration != kNoExpiration); @@ -930,9 +947,9 @@ Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')'; std::shared_ptr writer; - const Status s = - CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range, - /* reason */ oss.str(), blob_file, &writer); + const Status s = CreateBlobFileAndWriter( + write_options, /* has_ttl */ true, expiration_range, + /* reason */ oss.str(), blob_file, &writer); if (!s.ok()) { return s; } @@ -1055,7 +1072,7 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, return s; } -Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, +Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options, const Slice& key, const Slice& value, uint64_t expiration, WriteBatch* batch) { write_mutex_.AssertHeld(); @@ -1087,30 +1104,30 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, // Check DB size limit before selecting blob file to // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be // done before calling SelectBlobFile(). 
- s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() + - value_compressed.size()); + s = CheckSizeAndEvictBlobFiles( + write_options, headerbuf.size() + key.size() + value_compressed.size()); if (!s.ok()) { return s; } std::shared_ptr blob_file; if (expiration != kNoExpiration) { - s = SelectBlobFileTTL(expiration, &blob_file); + s = SelectBlobFileTTL(write_options, expiration, &blob_file); } else { - s = SelectBlobFile(&blob_file); + s = SelectBlobFile(write_options, &blob_file); } if (s.ok()) { assert(blob_file != nullptr); assert(blob_file->GetCompressionType() == bdb_options_.compression); - s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration, - &index_entry); + s = AppendBlob(write_options, blob_file, headerbuf, key, value_compressed, + expiration, &index_entry); } if (s.ok()) { if (expiration != kNoExpiration) { WriteLock file_lock(&blob_file->mutex_); blob_file->ExtendExpirationRange(expiration); } - s = CloseBlobFileIfNeeded(blob_file); + s = CloseBlobFileIfNeeded(write_options, blob_file); } if (s.ok()) { s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, @@ -1249,7 +1266,7 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context, } } -void BlobDBImpl::UpdateLiveSSTSize() { +void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) { uint64_t live_sst_size = 0; bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); if (ok) { @@ -1265,7 +1282,7 @@ void BlobDBImpl::UpdateLiveSSTSize() { { // Trigger FIFO eviction if needed. MutexLock l(&write_mutex_); - Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/); + Status s = CheckSizeAndEvictBlobFiles(write_options, 0, true /*force*/); if (s.IsNoSpace()) { ROCKS_LOG_WARN(db_options_.info_log, "DB grow out-of-space after SST size updated. 
Current live" @@ -1276,7 +1293,8 @@ void BlobDBImpl::UpdateLiveSSTSize() { } } -Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, +Status BlobDBImpl::CheckSizeAndEvictBlobFiles(const WriteOptions& write_options, + uint64_t blob_size, bool force_evict) { write_mutex_.AssertHeld(); @@ -1316,7 +1334,7 @@ Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, } // FIFO eviction can evict open blob files. if (!blob_file->Immutable()) { - Status s = CloseBlobFile(blob_file); + Status s = CloseBlobFile(write_options, blob_file); if (!s.ok()) { return s; } @@ -1347,7 +1365,8 @@ Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, return Status::OK(); } -Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, +Status BlobDBImpl::AppendBlob(const WriteOptions& write_options, + const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, const Slice& value, uint64_t expiration, std::string* index_entry) { @@ -1363,8 +1382,8 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, } // write the blob to the blob log. 
- s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, - &blob_offset); + s = writer->EmitPhysicalRecord(write_options, headerbuf, key, value, + &key_offset, &blob_offset); } if (!s.ok()) { @@ -1767,7 +1786,8 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { return std::make_pair(true, -1); } -Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { +Status BlobDBImpl::CloseBlobFile(const WriteOptions& write_options, + std::shared_ptr bfile) { TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile"); assert(bfile); assert(!bfile->Immutable()); @@ -1783,7 +1803,7 @@ Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { const SequenceNumber sequence = GetLatestSequenceNumber(); - const Status s = bfile->WriteFooterAndCloseLocked(sequence); + const Status s = bfile->WriteFooterAndCloseLocked(write_options, sequence); if (s.ok()) { total_blob_size_ += BlobLogFooter::kSize; @@ -1815,7 +1835,8 @@ Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { return s; } -Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { +Status BlobDBImpl::CloseBlobFileIfNeeded(const WriteOptions& write_options, + std::shared_ptr& bfile) { write_mutex_.AssertHeld(); // atomic read @@ -1831,7 +1852,7 @@ Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { return Status::OK(); } - return CloseBlobFile(bfile); + return CloseBlobFile(write_options, bfile); } void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr blob_file, @@ -1921,7 +1942,8 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { } if (!blob_file->Immutable()) { - CloseBlobFile(blob_file).PermitUncheckedError(); + // TODO: plumb Env::IOActivity, Env::IOPriority + CloseBlobFile(WriteOptions(), blob_file).PermitUncheckedError(); } assert(blob_file->Immutable()); @@ -1933,7 +1955,7 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { return std::make_pair(true, -1); } -Status BlobDBImpl::SyncBlobFiles() { +Status BlobDBImpl::SyncBlobFiles(const WriteOptions& write_options) { MutexLock 
l(&write_mutex_); std::vector> process_files; @@ -1949,7 +1971,7 @@ Status BlobDBImpl::SyncBlobFiles() { Status s; for (auto& blob_file : process_files) { - s = blob_file->Fsync(); + s = blob_file->Fsync(write_options); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync blob file %" PRIu64 ", status: %s", @@ -2196,7 +2218,7 @@ Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { WriteLock lock(&mutex_); WriteLock file_lock(&bfile->mutex_); - return CloseBlobFile(bfile); + return CloseBlobFile(WriteOptions(), bfile); } void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index d491108d3e..365ce6c505 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -167,7 +167,7 @@ class BlobDBImpl : public BlobDB { Status Open(std::vector* handles); - Status SyncBlobFiles() override; + Status SyncBlobFiles(const WriteOptions& write_options) override; // Common part of the two GetCompactionContext methods below. // REQUIRES: read lock on mutex_ @@ -245,11 +245,13 @@ class BlobDBImpl : public BlobDB { // to a single thread (like in the case of new files written during // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be // avoided. - Status CloseBlobFile(std::shared_ptr bfile); + Status CloseBlobFile(const WriteOptions& write_options, + std::shared_ptr bfile); // Close a file if its size exceeds blob_file_size // REQUIRES: lock held on write_mutex_. - Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); + Status CloseBlobFileIfNeeded(const WriteOptions& write_options, + std::shared_ptr& bfile); // Mark file as obsolete and move the file to obsolete file list. 
// @@ -261,13 +263,15 @@ class BlobDBImpl : public BlobDB { const Slice& value, uint64_t expiration, WriteBatch* batch); - Status AppendBlob(const std::shared_ptr& bfile, + Status AppendBlob(const WriteOptions& write_options, + const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, const Slice& value, uint64_t expiration, std::string* index_entry); // Create a new blob file and associated writer. - Status CreateBlobFileAndWriter(bool has_ttl, + Status CreateBlobFileAndWriter(const WriteOptions& write_options, + bool has_ttl, const ExpirationRange& expiration_range, const std::string& reason, std::shared_ptr* blob_file, @@ -275,11 +279,13 @@ class BlobDBImpl : public BlobDB { // Get the open non-TTL blob log file, or create a new one if no such file // exists. - Status SelectBlobFile(std::shared_ptr* blob_file); + Status SelectBlobFile(const WriteOptions& write_options, + std::shared_ptr* blob_file); // Get the open TTL blob log file for a certain expiration, or create a new // one if no such file exists. - Status SelectBlobFileTTL(uint64_t expiration, + Status SelectBlobFileTTL(const WriteOptions& write_options, + uint64_t expiration, std::shared_ptr* blob_file); std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; @@ -363,7 +369,7 @@ class BlobDBImpl : public BlobDB { void MarkUnreferencedBlobFilesObsolete(); void MarkUnreferencedBlobFilesObsoleteDuringOpen(); - void UpdateLiveSSTSize(); + void UpdateLiveSSTSize(const WriteOptions& write_options); Status GetBlobFileReader(const std::shared_ptr& blob_file, std::shared_ptr* reader); @@ -394,9 +400,12 @@ class BlobDBImpl : public BlobDB { // If is_fifo = true, FIFO eviction will be triggered to make room for the // new blob. If force_evict = true, FIFO eviction will evict blob files // even eviction will not make enough room for the new blob. 
- Status CheckSizeAndEvictBlobFiles(uint64_t blob_size, + Status CheckSizeAndEvictBlobFiles(const WriteOptions& write_options, + uint64_t blob_size, bool force_evict = false); + Status CloseImpl(); + // name of the database directory std::string dbname_; diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h index c95740c50e..0759b68114 100644 --- a/utilities/blob_db/blob_db_listener.h +++ b/utilities/blob_db/blob_db_listener.h @@ -22,18 +22,20 @@ class BlobDBListener : public EventListener { void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->SyncBlobFiles().PermitUncheckedError(); + blob_db_impl_->SyncBlobFiles(WriteOptions(Env::IOActivity::kFlush)) + .PermitUncheckedError(); } void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->UpdateLiveSSTSize(); + blob_db_impl_->UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kFlush)); } void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->UpdateLiveSSTSize(); + blob_db_impl_->UpdateLiveSSTSize( + WriteOptions(Env::IOActivity::kCompaction)); } const char* Name() const override { return kClassName(); } diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 5b31d56973..c4c0556fb1 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -78,7 +78,8 @@ void BlobFile::MarkObsolete(SequenceNumber sequence) { obsolete_.store(true); } -Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { +Status BlobFile::WriteFooterAndCloseLocked(const WriteOptions& write_options, + SequenceNumber sequence) { BlobLogFooter footer; footer.blob_count = blob_count_; if (HasTTL()) { @@ -86,7 +87,8 @@ Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { } // this will close the file and reset the Writable File 
Pointer. - Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr, + Status s = log_writer_->AppendFooter(write_options, footer, + /* checksum_method */ nullptr, /* checksum_value */ nullptr); if (s.ok()) { closed_ = true; @@ -137,10 +139,10 @@ Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) { return Status::OK(); } -Status BlobFile::Fsync() { +Status BlobFile::Fsync(const WriteOptions& write_options) { Status s; if (log_writer_.get()) { - s = log_writer_->Sync(); + s = log_writer_->Sync(write_options); } return s; } diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h index 8651c6b672..f0ec83ebe8 100644 --- a/utilities/blob_db/blob_file.h +++ b/utilities/blob_db/blob_file.h @@ -180,7 +180,7 @@ class BlobFile { return obsolete_sequence_; } - Status Fsync(); + Status Fsync(const WriteOptions& write_options); uint64_t GetFileSize() const { return file_size_.load(std::memory_order_acquire); @@ -218,7 +218,8 @@ class BlobFile { private: Status ReadFooter(BlobLogFooter* footer); - Status WriteFooterAndCloseLocked(SequenceNumber sequence); + Status WriteFooterAndCloseLocked(const WriteOptions& write_options, + SequenceNumber sequence); void CloseRandomAccessLocked(); diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h index b5a0783de1..26839e17ab 100644 --- a/utilities/cache_dump_load_impl.h +++ b/utilities/cache_dump_load_impl.h @@ -162,11 +162,12 @@ class ToFileCacheDumpWriter : public CacheDumpWriter { assert(file_writer_ != nullptr); std::string prefix; PutFixed32(&prefix, static_cast(metadata.size())); - IOStatus io_s = file_writer_->Append(Slice(prefix)); + const IOOptions opts; + IOStatus io_s = file_writer_->Append(opts, Slice(prefix)); if (!io_s.ok()) { return io_s; } - io_s = file_writer_->Append(metadata); + io_s = file_writer_->Append(opts, metadata); return io_s; } @@ -175,11 +176,12 @@ class ToFileCacheDumpWriter : public CacheDumpWriter { assert(file_writer_ != 
nullptr); std::string prefix; PutFixed32(&prefix, static_cast(data.size())); - IOStatus io_s = file_writer_->Append(Slice(prefix)); + const IOOptions opts; + IOStatus io_s = file_writer_->Append(opts, Slice(prefix)); if (!io_s.ok()) { return io_s; } - io_s = file_writer_->Append(data); + io_s = file_writer_->Append(opts, data); return io_s; } diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 53bbaeb079..d19eb35927 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -917,9 +917,10 @@ IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync( return io_s; } } else { + IOOptions opts; IOStatus io_s = WriteStringToFile(target(), file_pair.second, - pair.first + "/" + file_pair.first, true); + pair.first + "/" + file_pair.first, true, opts); if (!io_s.ok()) { return io_s; } diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index fd9affb0d9..2d08c3dd06 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -57,8 +57,8 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { } const std::string kFileName = "OPTIONS-123456"; - ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, - env_->GetFileSystem().get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), db_opt, cf_names, cf_opts, + kFileName, env_->GetFileSystem().get())); DBOptions loaded_db_opt; std::vector loaded_cf_descs; @@ -125,8 +125,8 @@ TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) { cf_names.push_back("cf_plain_table_sample"); // Saving DB in file const std::string kFileName = "OPTIONS-LOAD_CACHE_123456"; - ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, - env_->GetFileSystem().get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), db_opt, cf_names, cf_opts, + kFileName, env_->GetFileSystem().get())); DBOptions loaded_db_opt; std::vector loaded_cf_descs; @@ -758,8 +758,8 @@ TEST_F(OptionsUtilTest, 
WalDirInOptins) { options.wal_dir = dbname_; std::string options_file; ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file)); - ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options}, - dbname_ + "/" + options_file, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), options, {"default"}, + {options}, dbname_ + "/" + options_file, options.env->GetFileSystem().get())); ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, dbname_); @@ -779,4 +779,3 @@ int main(int argc, char** argv) { #endif // GFLAGS return RUN_ALL_TESTS(); } - diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index ff9d52dca9..6d2bf098c4 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -73,7 +73,7 @@ class CacheActivityLogger { oss << "LOOKUP - " << key.ToString(true) << std::endl; MutexLock l(&mutex_); - Status s = file_writer_->Append(oss.str()); + Status s = file_writer_->Append(IOOptions(), oss.str()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } @@ -93,7 +93,7 @@ class CacheActivityLogger { // line format: "ADD - - " oss << "ADD - " << key.ToString(true) << " - " << size << std::endl; MutexLock l(&mutex_); - Status s = file_writer_->Append(oss.str()); + Status s = file_writer_->Append(IOOptions(), oss.str()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } @@ -126,7 +126,7 @@ class CacheActivityLogger { } activity_logging_enabled_.store(false); - Status s = file_writer_->Close(); + Status s = file_writer_->Close(IOOptions()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index f2ca741442..cbbada57cd 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -96,7 +96,7 @@ Status FileTraceWriter::Close() { } Status FileTraceWriter::Write(const Slice& data) { - 
return file_writer_->Append(data); + return file_writer_->Append(IOOptions(), data); } uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); } diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8009bef197..661e6bc4d7 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -134,6 +134,7 @@ Status PessimisticTransactionDB::Initialize( assert(batch_info.log_number_); assert(recovered_trx->name_.length()); + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; w_options.sync = true; TransactionOptions t_options; diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 58126a4750..ddaf077ac3 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -306,6 +306,7 @@ Status WritePreparedTxn::RollbackInternal() { auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); auto read_at_seq = kMaxSequenceNumber; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; // to prevent callback's seq to be overrriden inside DBImpk::Get roptions.snapshot = wpt_db_->GetMaxSnapshot(); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 91a81d1589..1ddc175110 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -813,6 +813,7 @@ void WritePreparedTxnDB::AdvanceSeqByOne() { // Inserting an empty value will i) let the max evicted entry to be // published, i.e., max == last_published, increase the last published to // be one beyond max, i.e., max < last_published. 
+ // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions woptions; TransactionOptions txn_options; Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c30cf9e1f0..2f1069dbc2 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -727,6 +727,7 @@ Status WriteUnpreparedTxn::RollbackInternal() { assert(GetId() > 0); Status s; auto read_at_seq = kMaxSequenceNumber; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; // to prevent callback's seq to be overrriden inside DBImpk::Get roptions.snapshot = wpt_db_->GetMaxSnapshot(); @@ -882,6 +883,7 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { assert(save_points_ != nullptr && save_points_->size() > 0); const LockTracker& tracked_keys = *save_points_->top().new_locks_; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; roptions.snapshot = top.snapshot_->snapshot(); SequenceNumber min_uncommitted = diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 1d75dd4490..9219ec03ca 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -37,6 +37,7 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( // MemTableInserter during recovery to actually do writes into the DB // instead of just dropping the in-memory write batch. // + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; class InvalidSnapshotReadCallback : public ReadCallback { @@ -262,6 +263,7 @@ Status WriteUnpreparedTxnDB::Initialize( continue; } + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; w_options.sync = true; TransactionOptions t_options;