diff --git a/db/column_family_test.cc b/db/column_family_test.cc index c0574ee550..6fa4373c2b 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/defer.h" #include "util/string_util.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" @@ -2169,13 +2171,57 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { Close(); } +namespace { +struct CountOptionsFilesFs : public FileSystemWrapper { + explicit CountOptionsFilesFs(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + const char* Name() const override { return "CountOptionsFilesFs"; } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + if (f.find("OPTIONS-") != std::string::npos) { + options_files_created.fetch_add(1, std::memory_order_relaxed); + } + return FileSystemWrapper::NewWritableFile(f, file_opts, r, dbg); + } + + std::atomic options_files_created{}; +}; +} // namespace + TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) { - Status s = TryOpen({"one", "two"}); - ASSERT_TRUE(!s.ok()); - db_options_.create_missing_column_families = true; - s = TryOpen({"default", "one", "two"}); - ASSERT_TRUE(s.ok()); + // Can't accidentally add CFs to an existing DB + Open(); Close(); + ASSERT_FALSE(db_options_.create_missing_column_families); + ASSERT_NOK(TryOpen({"one", "two"})); + + // Nor accidentally create in a new DB + Destroy(); + db_options_.create_if_missing = true; + ASSERT_NOK(TryOpen({"one", "two"})); + + // Only with the option (new DB case) + db_options_.create_missing_column_families = true; + // Also setup to count number of options files created (see check below) + auto my_fs = + std::make_shared(db_options_.env->GetFileSystem()); + auto my_env = std::make_unique(db_options_.env, my_fs); + SaveAndRestore save_restore_env(&db_options_.env, my_env.get()); + + ASSERT_OK(TryOpen({"default", "one", "two"})); + Close(); + + // An older version would write an updated options file for each column + // family created under create_missing_column_families, which would be + // quadratic I/O in the number of column families. + ASSERT_EQ(my_fs->options_files_created.load(), 1); + + // Add to existing DB case + ASSERT_OK(TryOpen({"default", "one", "two", "three", "four"})); + Close(); + ASSERT_EQ(my_fs->options_files_created.load(), 2); } TEST_P(ColumnFamilyTest, SanitizeOptions) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 855f6c5344..b7b555070d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -816,35 +816,35 @@ Status DBImpl::StartPeriodicTaskScheduler() { } Status DBImpl::RegisterRecordSeqnoTimeWorker() { - uint64_t min_time_duration = std::numeric_limits::max(); - uint64_t max_time_duration = std::numeric_limits::min(); + uint64_t min_preserve_seconds = std::numeric_limits::max(); + uint64_t max_preserve_seconds = std::numeric_limits::min(); { InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { // preserve time is the max of 2 options. - uint64_t preserve_time_duration = + uint64_t preserve_seconds = std::max(cfd->ioptions()->preserve_internal_time_seconds, cfd->ioptions()->preclude_last_level_data_seconds); - if (!cfd->IsDropped() && preserve_time_duration > 0) { - min_time_duration = std::min(preserve_time_duration, min_time_duration); - max_time_duration = std::max(preserve_time_duration, max_time_duration); + if (!cfd->IsDropped() && preserve_seconds > 0) { + min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds); + max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds); } } - if (min_time_duration == std::numeric_limits::max()) { + if (min_preserve_seconds == std::numeric_limits::max()) { seqno_to_time_mapping_.Resize(0, 0); } else { - seqno_to_time_mapping_.Resize(min_time_duration, max_time_duration); + seqno_to_time_mapping_.Resize(min_preserve_seconds, max_preserve_seconds); } } uint64_t seqno_time_cadence = 0; - if (min_time_duration != std::numeric_limits::max()) { + if (min_preserve_seconds != std::numeric_limits::max()) { // round up to 1 when the time_duration is smaller than // kMaxSeqnoTimePairsPerCF - seqno_time_cadence = - (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) / - SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF; + seqno_time_cadence = (min_preserve_seconds + + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) / + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF; } Status s; @@ -3296,14 +3296,34 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, statuses, sorted_input); } +Status DBImpl::WrapUpCreateColumnFamilies( + const std::vector& cf_options) { + // NOTE: this function is skipped for create_missing_column_families and + // DB::Open, so new functionality here might need to go into Open also. + bool register_worker = false; + for (auto* opts_ptr : cf_options) { + if (opts_ptr->preserve_internal_time_seconds > 0 || + opts_ptr->preclude_last_level_data_seconds > 0) { + register_worker = true; + break; + } + } + // Attempt both follow-up actions even if one fails + Status s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + if (register_worker) { + s.UpdateIfOk(RegisterRecordSeqnoTimeWorker()); + } + return s; +} + Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, ColumnFamilyHandle** handle) { assert(handle != nullptr); Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); if (s.ok()) { - s = WriteOptionsFile(true /*need_mutex_lock*/, - true /*need_enter_write_thread*/); + s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); } return s; } @@ -3327,11 +3347,7 @@ Status DBImpl::CreateColumnFamilies( success_once = true; } if (success_once) { - Status persist_options_status = WriteOptionsFile( - true /*need_mutex_lock*/, true /*need_enter_write_thread*/); - if (s.ok() && !persist_options_status.ok()) { - s = persist_options_status; - } + s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); } return s; } @@ -3344,6 +3360,8 @@ Status DBImpl::CreateColumnFamilies( size_t num_cf = column_families.size(); Status s; bool success_once = false; + std::vector cf_opts; + cf_opts.reserve(num_cf); for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; s = CreateColumnFamilyImpl(column_families[i].options, @@ -3353,13 +3371,10 @@ Status DBImpl::CreateColumnFamilies( } handles->push_back(handle); success_once = true; + cf_opts.push_back(&column_families[i].options); } if (success_once) { - Status persist_options_status = WriteOptionsFile( - true /*need_mutex_lock*/, true /*need_enter_write_thread*/); - if (s.ok() && !persist_options_status.ok()) { - s = persist_options_status; - } + s.UpdateIfOk(WrapUpCreateColumnFamilies(cf_opts)); } return s; } @@ -3447,10 +3462,6 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, } } // InstrumentedMutexLock l(&mutex_) - if (cf_options.preserve_internal_time_seconds > 0 || - cf_options.preclude_last_level_data_seconds > 0) { - s = RegisterRecordSeqnoTimeWorker(); - } sv_context.Clean(); // this is outside the mutex if (s.ok()) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index f4ce560353..d3ab66d1b3 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1823,10 +1823,15 @@ class DBImpl : public DB { const Status CreateArchivalDirectory(); + // Create a column family, without some of the follow-up work yet Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& cf_name, ColumnFamilyHandle** handle); + // Follow-up work to user creating a column family or (families) + Status WrapUpCreateColumnFamilies( + const std::vector& cf_options); + Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); // Delete any unneeded files and stale in-memory entries. diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index b139a7c8f9..f9ca85405b 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -2069,7 +2069,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // missing column family, create it ColumnFamilyHandle* handle = nullptr; impl->mutex_.Unlock(); - s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + // NOTE: the work normally done in WrapUpCreateColumnFamilies will + // be done separately below. + s = impl->CreateColumnFamilyImpl(cf.options, cf.name, &handle); impl->mutex_.Lock(); if (s.ok()) { handles->push_back(handle); diff --git a/db/periodic_task_scheduler.h b/db/periodic_task_scheduler.h index 4d129a6797..a93f9a0958 100644 --- a/db/periodic_task_scheduler.h +++ b/db/periodic_task_scheduler.h @@ -42,15 +42,16 @@ class PeriodicTaskScheduler { PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete; PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete; - // Register a task with its default repeat period + // Register a task with its default repeat period. Thread safe call. Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn); // Register a task with specified repeat period. 0 is an invalid argument - // (kInvalidPeriodSec). To stop the task, please use Unregister() specifically + // (kInvalidPeriodSec). To stop the task, please use Unregister(). + // Thread safe call. Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, uint64_t repeat_period_seconds); - // Unregister the task + // Unregister the task. Thread safe call. Status Unregister(PeriodicTaskType task_type); #ifndef NDEBUG @@ -105,4 +106,3 @@ class PeriodicTaskScheduler { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 4cd4b09c4e..781e3f277f 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -362,6 +362,10 @@ class DB { // Create a column_family and return the handle of column family // through the argument handle. + // NOTE: creating many column families one-by-one is not recommended because + // of quadratic overheads, such as writing a full OPTIONS file for all CFs + // after each new CF creation. Use CreateColumnFamilies(), or DB::Open() with + // create_missing_column_families=true. virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle** handle); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0944e19764..b20c66e14b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -485,7 +485,8 @@ struct DBOptions { // Default: false bool create_if_missing = false; - // If true, missing column families will be automatically created. + // If true, missing column families will be automatically created on + // DB::Open(). // Default: false bool create_missing_column_families = false; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 8b30545451..82597239ff 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -151,6 +151,25 @@ class Status { return state_.get(); } + // Override this status with another, unless this status is already non-ok. + // Returns *this. Thus, the result of `a.UpdateIfOk(b).UpdateIfOk(c)` is + // non-ok (and `a` modified as such) iff any input was non-ok, with + // left-most taking precedence as far as the details. + Status& UpdateIfOk(Status&& s) { + if (code() == kOk) { + *this = std::move(s); + } else { + // Alright to ignore that status as long as this one is checked + s.PermitUncheckedError(); + } + MustCheck(); + return *this; + } + + Status& UpdateIfOk(const Status& s) { + return UpdateIfOk(std::forward(Status(s))); + } + // Return a success status. static Status OK() { return Status(); } diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index 8d5ac068bb..20e9ebe2c5 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -180,4 +180,3 @@ void LDBTool::Run(int argc, char** argv, Options options, exit(error_code); } } // namespace ROCKSDB_NAMESPACE - diff --git a/unreleased_history/performance_improvements/options_files_on_open.md b/unreleased_history/performance_improvements/options_files_on_open.md new file mode 100644 index 0000000000..e4ecf29496 --- /dev/null +++ b/unreleased_history/performance_improvements/options_files_on_open.md @@ -0,0 +1 @@ +Improved the I/O efficiency of DB::Open a new DB with `create_missing_column_families=true` and many column families. diff --git a/util/slice_test.cc b/util/slice_test.cc index 010ded3d87..e82547494b 100644 --- a/util/slice_test.cc +++ b/util/slice_test.cc @@ -243,6 +243,36 @@ TEST_F(SmallEnumSetTest, SmallEnumSetTest2) { } } +// ***************************************************************** // +// Unit test for Status +TEST(StatusTest, Update) { + const Status ok = Status::OK(); + const Status inc = Status::Incomplete("blah"); + const Status notf = Status::NotFound("meow"); + + Status s = ok; + ASSERT_TRUE(s.UpdateIfOk(Status::Corruption("bad")).IsCorruption()); + ASSERT_TRUE(s.IsCorruption()); + + s = ok; + ASSERT_TRUE(s.UpdateIfOk(Status::OK()).ok()); + ASSERT_TRUE(s.UpdateIfOk(ok).ok()); + ASSERT_TRUE(s.ok()); + + ASSERT_TRUE(s.UpdateIfOk(inc).IsIncomplete()); + ASSERT_TRUE(s.IsIncomplete()); + + ASSERT_TRUE(s.UpdateIfOk(notf).IsIncomplete()); + ASSERT_TRUE(s.UpdateIfOk(ok).IsIncomplete()); + ASSERT_TRUE(s.IsIncomplete()); + + // Keeps left-most non-OK status + s = ok; + ASSERT_TRUE( + s.UpdateIfOk(Status()).UpdateIfOk(notf).UpdateIfOk(inc).IsNotFound()); + ASSERT_TRUE(s.IsNotFound()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) {