From 1777e5f7e9a891b04ba9130623ebba6344838e26 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 10 Jun 2022 16:07:03 -0700 Subject: [PATCH] Snapshots with user-specified timestamps (#9879) Summary: In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable. It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29. This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps. Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps. In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot` object is created with the last published sequence number of the super-version. You can see that the reader actually has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called, an arbitrarily long period of time may have already elapsed since the last write, which is when the last published sequence number is written. This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction commits, effectively associating the last sequence number with a timestamp. 
It is also assumed that the application will ensure that any two snapshots with timestamps satisfy the following: ``` snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts ``` If the application can guarantee that when a reader takes a timestamped snapshot, there are no active writes going on in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create a snapshot with associated timestamp. Code example ```cpp // Create a timestamped snapshot when committing transaction. txn->SetCommitTimestamp(100); txn->SetSnapshotOnNextOperation(); txn->Commit(); // A wrapper API for convenience Status Transaction::CommitAndTryCreateSnapshot( std::shared_ptr<TransactionNotifier> notifier, TxnTimestamp ts, std::shared_ptr<const Snapshot>* ret); // Create a timestamped snapshot if caller guarantees no concurrent writes std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100); ``` The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp. ```cpp // Return the timestamped snapshot corresponding to given timestamp. If ts is // kMaxTxnTimestamp, then we return the latest timestamped snapshot if present. // Otherwise, we return the snapshot whose timestamp is equal to `ts`. If no // such snapshot exists, then we return null. std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const; // Return the latest timestamped snapshot if present. std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const; ``` We also provide two additional APIs for stats collection and reporting purposes. ```cpp Status TransactionDB::GetAllTimestampedSnapshots( std::vector<std::shared_ptr<const Snapshot>>& snapshots) const; // Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`. 
Status TransactionDB::GetTimestampedSnapshots( TxnTimestamp ts_lb, TxnTimestamp ts_ub, std::vector<std::shared_ptr<const Snapshot>>& snapshots) const; ``` To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release timestamped snapshots whose timestamps are older than or equal to a given threshold. ```cpp void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts); ``` Before shutdown, RocksDB will release all timestamped snapshots. Comparison with user-defined timestamp and how they can be combined: User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile mapping between snapshots (sequence numbers) and timestamps. Different internal keys with the same user key but different timestamps will be treated as different by compaction, thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection. In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in this snapshot from being dropped by compaction. Here, visible means (seq < snapshot and most recent). The timestamped snapshot supports the semantics of reading at an exact point in time. Timestamped snapshots can also be used with user-defined timestamp. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879 Test Plan: ``` make check TEST_TMPDIR=/dev/shm make crash_test_with_txn ``` Reviewed By: siying Differential Revision: D35783919 Pulled By: riversand963 fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4 --- .gitignore | 3 - CMakeLists.txt | 1 + HISTORY.md | 1 + Makefile | 3 + TARGETS | 6 + db/db_impl/db_impl.cc | 219 +++++++-- db/db_impl/db_impl.h | 35 +- db/db_impl/db_impl_write.cc | 35 +- db/db_test.cc | 6 + db/post_memtable_callback.h | 25 + db/snapshot_impl.h | 67 ++- db/write_thread.h | 7 +- db_stress_tool/db_stress_common.cc | 28 ++ db_stress_tool/db_stress_common.h | 4 + db_stress_tool/db_stress_driver.cc | 12 +- db_stress_tool/db_stress_gflags.cc | 4 + db_stress_tool/db_stress_test_base.cc | 26 +- db_stress_tool/db_stress_test_base.h | 4 +- db_stress_tool/db_stress_tool.cc | 11 + db_stress_tool/no_batched_ops_stress.cc | 8 +- include/rocksdb/snapshot.h | 2 + include/rocksdb/utilities/transaction.h | 34 ++ include/rocksdb/utilities/transaction_db.h | 36 ++ src.mk | 1 + tools/db_crashtest.py | 6 + .../optimistic_transaction_test.cc | 55 ++- .../transactions/pessimistic_transaction.cc | 34 +- .../transactions/pessimistic_transaction.h | 1 + .../pessimistic_transaction_db.cc | 63 +++ .../transactions/pessimistic_transaction_db.h | 40 ++ .../transactions/timestamped_snapshot_test.cc | 426 ++++++++++++++++++ utilities/transactions/transaction_base.cc | 37 ++ utilities/transactions/transaction_base.h | 6 + utilities/transactions/transaction_test.cc | 30 +- utilities/transactions/transaction_test.h | 18 + 35 files changed, 1204 insertions(+), 90 deletions(-) create mode 100644 db/post_memtable_callback.h create mode 100644 utilities/transactions/timestamped_snapshot_test.cc diff --git a/.gitignore b/.gitignore index 5bdb34212a..489ad62a5d 100644 --- a/.gitignore +++ b/.gitignore @@ -36,8 +36,6 @@ manifest_dump sst_dump blob_dump block_cache_trace_analyzer 
-db_readonly_with_timestamp_test -db_with_timestamp_basic_test tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc @@ -53,7 +51,6 @@ rocksdb_dump rocksdb_undump db_test2 trace_analyzer -trace_analyzer_test block_cache_trace_analyzer io_tracer_parser .DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt index 7da02ca79f..f02f8cf2ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1390,6 +1390,7 @@ if(WITH_TESTS) utilities/transactions/write_prepared_transaction_test.cc utilities/transactions/write_unprepared_transaction_test.cc utilities/transactions/lock/range/range_locking_test.cc + utilities/transactions/timestamped_snapshot_test.cc utilities/ttl/ttl_test.cc utilities/util_merge_operators_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc diff --git a/HISTORY.md b/HISTORY.md index c2f891890f..070a67b4da 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -28,6 +28,7 @@ * Add blob garbage collection parameters `blob_garbage_collection_policy` and `blob_garbage_collection_age_cutoff` to both force-enable and force-disable GC, as well as selectively override age cutoff when using CompactRange. * Add an extra sanity check in `GetSortedWalFiles()` (also used by `GetLiveFilesStorageInfo()`, `BackupEngine`, and `Checkpoint`) to reduce risk of successfully created backup or checkpoint failing to open because of missing WAL file. * Add a new column family option `blob_file_starting_level` to enable writing blob files during flushes and compactions starting from the specified LSM tree level. 
+* Add support for timestamped snapshots (#9879) ### Behavior changes * DB::Open(), DB::OpenAsSecondary() will fail if a Logger cannot be created (#9984) diff --git a/Makefile b/Makefile index 6e288c0d86..7494987c50 100644 --- a/Makefile +++ b/Makefile @@ -1777,6 +1777,9 @@ write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepare write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +timestamped_snapshot_test: $(OBJ_DIR)/utilities/transactions/timestamped_snapshot_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 827d0d17aa..6f311616cf 100644 --- a/TARGETS +++ b/TARGETS @@ -5768,6 +5768,12 @@ cpp_unittest_wrapper(name="timer_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="timestamped_snapshot_test", + srcs=["utilities/transactions/timestamped_snapshot_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="trace_analyzer_test", srcs=["tools/trace_analyzer_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 872c6e7a19..147517f2bf 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -511,6 +511,19 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { WaitForBackgroundWork(); } +Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() { + size_t num_snapshots = 0; + ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits::max(), + &num_snapshots); + + // If there is unreleased snapshot, fail the close call + if (num_snapshots > 0) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); + } + + return Status::OK(); +} + Status DBImpl::CloseHelper() { // Guarantee that there is no background error recovery in progress before // continuing with the shutdown @@ -732,11 +745,19 @@ Status DBImpl::CloseImpl() { return 
CloseHelper(); } DBImpl::~DBImpl() { InstrumentedMutexLock closing_lock_guard(&closing_mutex_); - if (!closed_) { - closed_ = true; - closing_status_ = CloseHelper(); - closing_status_.PermitUncheckedError(); + if (closed_) { + return; } + + closed_ = true; + + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + s.PermitUncheckedError(); + } + + closing_status_ = CloseImpl(); + closing_status_.PermitUncheckedError(); } void DBImpl::MaybeIgnoreError(Status* s) const { @@ -1797,11 +1818,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, // data for the snapshot, so the reader would see neither data that was be // visible to the snapshot before compaction nor the newer data inserted // afterwards. - if (last_seq_same_as_publish_seq_) { - snapshot = versions_->LastSequence(); - } else { - snapshot = versions_->LastPublishedSequence(); - } + snapshot = GetLastPublishedSequence(); if (get_impl_options.callback) { // The unprep_seqs are not published for write unprepared, so it could be // that max_visible_seq is larger. Seek to the std::max of the two. @@ -2194,11 +2211,7 @@ bool DBImpl::MultiCFSnapshot( // version because a flush happening in between may compact away data for // the snapshot, but the snapshot is earlier than the data overwriting it, // so users may see wrong results. 
- if (last_seq_same_as_publish_seq_) { - *snapshot = versions_->LastSequence(); - } else { - *snapshot = versions_->LastPublishedSequence(); - } + *snapshot = GetLastPublishedSequence(); } } else { // If we end up with the same issue of memtable geting sealed during 2 @@ -2229,11 +2242,7 @@ bool DBImpl::MultiCFSnapshot( // acquire the lock so we're sure to succeed mutex_.Lock(); } - if (last_seq_same_as_publish_seq_) { - *snapshot = versions_->LastSequence(); - } else { - *snapshot = versions_->LastPublishedSequence(); - } + *snapshot = GetLastPublishedSequence(); } else { *snapshot = static_cast_with_check(read_options.snapshot) @@ -3170,6 +3179,48 @@ const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { } #endif // ROCKSDB_LITE +std::pair> +DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) { + assert(ts != std::numeric_limits::max()); + + auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true); + return ret; +} + +std::shared_ptr DBImpl::GetTimestampedSnapshot( + uint64_t ts) const { + InstrumentedMutexLock lock_guard(&mutex_); + return timestamped_snapshots_.GetSnapshot(ts); +} + +void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts, + size_t* remaining_total_ss) { + autovector> snapshots_to_release; + { + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release); + } + snapshots_to_release.clear(); + + if (remaining_total_ss) { + InstrumentedMutexLock lock_guard(&mutex_); + *remaining_total_ss = static_cast(snapshots_.count()); + } +} + +Status DBImpl::GetTimestampedSnapshots( + uint64_t ts_lb, uint64_t ts_ub, + std::vector>& timestamped_snapshots) const { + if (ts_lb >= ts_ub) { + return Status::InvalidArgument( + "timestamp lower bound must be smaller than upper bound"); + } + timestamped_snapshots.clear(); + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots); + return 
Status::OK(); +} + SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, bool lock) { int64_t unix_time = 0; @@ -3179,6 +3230,8 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, if (lock) { mutex_.Lock(); + } else { + mutex_.AssertHeld(); } // returns null if the underlying memtable does not support snapshot. if (!is_snapshot_supported_) { @@ -3188,9 +3241,7 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, delete s; return nullptr; } - auto snapshot_seq = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + auto snapshot_seq = GetLastPublishedSequence(); SnapshotImpl* snapshot = snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary); if (lock) { @@ -3199,6 +3250,115 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, return snapshot; } +std::pair> +DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts, + bool lock) { + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber); + + if (lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return std::make_pair( + Status::NotSupported("Memtable does not support snapshot"), nullptr); + } + + // Caller is not write thread, thus didn't provide a valid snapshot_seq. + // Obtain seq from db. + if (!need_update_seq) { + snapshot_seq = GetLastPublishedSequence(); + } + + std::shared_ptr latest = + timestamped_snapshots_.GetSnapshot(std::numeric_limits::max()); + + // If there is already a latest timestamped snapshot, then we need to do some + // checks. 
+ if (latest) { + uint64_t latest_snap_ts = latest->GetTimestamp(); + SequenceNumber latest_snap_seq = latest->GetSequenceNumber(); + assert(latest_snap_seq <= snapshot_seq); + bool needs_create_snap = true; + Status status; + std::shared_ptr ret; + if (latest_snap_ts > ts) { + // A snapshot created later cannot have smaller timestamp than a previous + // timestamped snapshot. + needs_create_snap = false; + std::ostringstream oss; + oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > " + << ts; + status = Status::InvalidArgument(oss.str()); + } else if (latest_snap_ts == ts) { + if (latest_snap_seq == snapshot_seq) { + // We are requesting the same sequence number and timestamp, thus can + // safely reuse (share) the current latest timestamped snapshot. + needs_create_snap = false; + ret = latest; + } else if (latest_snap_seq < snapshot_seq) { + // There may have been writes to the database since the latest + // timestamped snapshot, yet we are still requesting the same + // timestamp. In this case, we cannot create the new timestamped + // snapshot. + needs_create_snap = false; + std::ostringstream oss; + oss << "Allocated seq is " << snapshot_seq + << ", while snapshot exists with smaller seq " << latest_snap_seq + << " but same timestamp " << ts; + status = Status::InvalidArgument(oss.str()); + } + } + if (!needs_create_snap) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return std::make_pair(status, ret); + } else { + status.PermitUncheckedError(); + } + } + + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, + /*is_write_conflict_boundary=*/true, ts); + + std::shared_ptr ret( + snapshot, + std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1)); + timestamped_snapshots_.AddSnapshot(ret); + + // Caller is from write thread, and we need to update database's sequence + // number. 
+ if (need_update_seq) { + assert(versions_); + if (last_seq_same_as_publish_seq_) { + versions_->SetLastSequence(snapshot_seq); + } else { + // TODO: support write-prepared/write-unprepared transactions with two + // write queues. + assert(false); + } + } + + if (lock) { + mutex_.Unlock(); + } + return std::make_pair(Status::OK(), ret); +} + namespace { using CfdList = autovector; bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { @@ -3224,11 +3384,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { snapshots_.Delete(casted_s); uint64_t oldest_snapshot; if (snapshots_.empty()) { - if (last_seq_same_as_publish_seq_) { - oldest_snapshot = versions_->LastSequence(); - } else { - oldest_snapshot = versions_->LastPublishedSequence(); - } + oldest_snapshot = GetLastPublishedSequence(); } else { oldest_snapshot = snapshots_.oldest()->number_; } @@ -4119,13 +4275,14 @@ Status DBImpl::Close() { if (closed_) { return closing_status_; } + { - InstrumentedMutexLock l(&mutex_); - // If there is unreleased snapshot, fail the close call - if (!snapshots_.empty()) { - return Status::Aborted("Cannot close DB with unreleased snapshot."); + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + if (!s.ok()) { + return s; } } + closing_status_ = CloseImpl(); closed_ = true; return closing_status_; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 721c73c378..018d7904cf 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -32,6 +32,7 @@ #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" #include "db/memtable_list.h" +#include "db/post_memtable_callback.h" #include "db/pre_release_callback.h" #include "db/range_del_aggregator.h" #include "db/read_callback.h" @@ -345,6 +346,19 @@ class DBImpl : public DB { virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; + // Create a timestamped snapshot. This snapshot can be shared by multiple + // readers. 
If any of them uses it for write conflict checking, then + // is_write_conflict_boundary is true. For simplicity, set it to true by + // default. + std::pair> CreateTimestampedSnapshot( + SequenceNumber snapshot_seq, uint64_t ts); + std::shared_ptr GetTimestampedSnapshot(uint64_t ts) const; + void ReleaseTimestampedSnapshotsOlderThan( + uint64_t ts, size_t* remaining_total_ss = nullptr); + Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub, + std::vector>& + timestamped_snapshots) const; + using DB::GetProperty; virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) override; @@ -1222,6 +1236,8 @@ class DBImpl : public DB { static void TEST_ResetDbSessionIdGen(); static std::string GenerateDbSessionId(Env* env); + bool seq_per_batch() const { return seq_per_batch_; } + protected: const std::string dbname_; // TODO(peterd): unify with VersionSet::db_id_ @@ -1404,7 +1420,8 @@ class DBImpl : public DB { uint64_t* log_used = nullptr, uint64_t log_ref = 0, bool disable_memtable = false, uint64_t* seq_used = nullptr, size_t batch_cnt = 0, - PreReleaseCallback* pre_release_callback = nullptr); + PreReleaseCallback* pre_release_callback = nullptr, + PostMemTableCallback* post_memtable_callback = nullptr); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, @@ -2039,10 +2056,24 @@ class DBImpl : public DB { SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, bool lock = true); + // If snapshot_seq != kMaxSequenceNumber, then this function can only be + // called from the write thread that publishes sequence numbers to readers. + // For 1) write-committed, or 2) write-prepared + one-write-queue, this will + // be the write thread performing memtable writes. For write-prepared with + // two write queues, this will be the write thread writing commit marker to + // the WAL. 
+ // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller + // ensuring no writes to the database. + std::pair> + CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts, + bool lock = true); + uint64_t GetMaxTotalWalSize() const; FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + Status MaybeReleaseTimestampedSnapshotsAndCheck(); + Status CloseHelper(); void WaitForBackgroundWork(); @@ -2308,6 +2339,8 @@ class DBImpl : public DB { SnapshotList snapshots_; + TimestampedSnapshotList timestamped_snapshots_; + // For each background job, pending_outputs_ keeps the current file number at // the time that background job started. // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index fee4df87b0..c6ce801ae4 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -126,7 +126,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, uint64_t* log_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used, size_t batch_cnt, - PreReleaseCallback* pre_release_callback) { + PreReleaseCallback* pre_release_callback, + PostMemTableCallback* post_memtable_callback) { assert(!seq_per_batch_ || batch_cnt != 0); if (my_batch == nullptr) { return Status::InvalidArgument("Batch is nullptr!"); @@ -185,6 +186,15 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with unordered_write"); } + if (immutable_db_options_.enable_pipelined_write && + post_memtable_callback != nullptr) { + return Status::NotSupported( + "pipelined write currently does not honor post_memtable_callback"); + } + if (seq_per_batch_ && post_memtable_callback != nullptr) { + return Status::NotSupported( + "seq_per_batch currently does not honor post_memtable_callback"); + } // Otherwise IsLatestPersistentState optimization does not make sense 
assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); @@ -241,7 +251,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - disable_memtable, batch_cnt, pre_release_callback); + disable_memtable, batch_cnt, pre_release_callback, + post_memtable_callback); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread_.JoinBatchGroup(&w); @@ -268,6 +279,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // we're responsible for exit batch group // TODO(myabandeh): propagate status to write_group auto last_sequence = w.write_group->last_sequence; + for (auto* tmp_w : *(w.write_group)) { + assert(tmp_w); + if (tmp_w->post_memtable_callback) { + Status tmp_s = + (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable); + // TODO: propagate the execution status of post_memtable_callback to + // caller. + assert(tmp_s.ok()); + } + } versions_->SetLastSequence(last_sequence); MemTableInsertStatusCheck(w.status); write_thread_.ExitAsBatchGroupFollower(&w); @@ -550,6 +571,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (should_exit_batch_group) { if (status.ok()) { + for (auto* tmp_w : write_group) { + assert(tmp_w); + if (tmp_w->post_memtable_callback) { + Status tmp_s = + (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable); + // TODO: propagate the execution status of post_memtable_callback to + // caller. + assert(tmp_s.ok()); + } + } // Note: if we are to resume after non-OK statuses we need to revisit how // we reacts to non-OK statuses here. 
versions_->SetLastSequence(last_sequence); diff --git a/db/db_test.cc b/db/db_test.cc index bc1808be80..0cff96d3ea 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2875,6 +2875,12 @@ class ModelDB : public DB { assert(false); return 0; } + + uint64_t GetTimestamp() const override { + // no need to call this + assert(false); + return 0; + } }; explicit ModelDB(const Options& options) : options_(options) {} diff --git a/db/post_memtable_callback.h b/db/post_memtable_callback.h new file mode 100644 index 0000000000..fbf2fbe869 --- /dev/null +++ b/db/post_memtable_callback.h @@ -0,0 +1,25 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// Callback invoked after finishing writing to the memtable but before +// publishing the sequence number to readers. +// Note that with write-prepared/write-unprepared transactions with +// two-write-queues, PreReleaseCallback is called before publishing the +// sequence numbers to readers. 
+class PostMemTableCallback { + public: + virtual ~PostMemTableCallback() {} + + virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index dc903ea426..59f491615d 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -12,6 +12,7 @@ #include "db/dbformat.h" #include "rocksdb/db.h" +#include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -31,6 +32,8 @@ class SnapshotImpl : public Snapshot { int64_t GetUnixTime() const override { return unix_time_; } + uint64_t GetTimestamp() const override { return timestamp_; } + private: friend class SnapshotList; @@ -42,6 +45,8 @@ class SnapshotImpl : public Snapshot { int64_t unix_time_; + uint64_t timestamp_; + // Will this snapshot be used by a Transaction to do write-conflict checking? bool is_write_conflict_boundary_; }; @@ -55,6 +60,7 @@ class SnapshotList { // Set all the variables to make UBSAN happy. list_.list_ = nullptr; list_.unix_time_ = 0; + list_.timestamp_ = 0; list_.is_write_conflict_boundary_ = false; count_ = 0; } @@ -62,14 +68,19 @@ class SnapshotList { // No copy-construct. 
SnapshotList(const SnapshotList&) = delete; - bool empty() const { return list_.next_ == &list_; } + bool empty() const { + assert(list_.next_ != &list_ || 0 == count_); + return list_.next_ == &list_; + } SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time, - bool is_write_conflict_boundary) { + bool is_write_conflict_boundary, + uint64_t ts = std::numeric_limits::max()) { s->number_ = seq; s->unix_time_ = unix_time; + s->timestamp_ = ts; s->is_write_conflict_boundary_ = is_write_conflict_boundary; s->list_ = this; s->next_ = &list_; @@ -167,4 +178,56 @@ class SnapshotList { uint64_t count_; }; +// All operations on TimestampedSnapshotList must be protected by db mutex. +class TimestampedSnapshotList { + public: + explicit TimestampedSnapshotList() = default; + + std::shared_ptr GetSnapshot(uint64_t ts) const { + if (ts == std::numeric_limits::max() && !snapshots_.empty()) { + auto it = snapshots_.rbegin(); + assert(it != snapshots_.rend()); + return it->second; + } + auto it = snapshots_.find(ts); + if (it == snapshots_.end()) { + return std::shared_ptr(); + } + return it->second; + } + + void GetSnapshots( + uint64_t ts_lb, uint64_t ts_ub, + std::vector>& snapshots) const { + assert(ts_lb < ts_ub); + auto it_low = snapshots_.lower_bound(ts_lb); + auto it_high = snapshots_.lower_bound(ts_ub); + for (auto it = it_low; it != it_high; ++it) { + snapshots.emplace_back(it->second); + } + } + + void AddSnapshot(const std::shared_ptr& snapshot) { + assert(snapshot); + snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot); + } + + // snapshots_to_release: the container to where the timestamped snapshots will + // be moved so that it retains the last reference to the snapshots and the + // snapshots won't be actually released which requires db mutex. 
The + // snapshots will be released by caller of ReleaseSnapshotsOlderThan(). + void ReleaseSnapshotsOlderThan( + uint64_t ts, + autovector>& snapshots_to_release) { + auto ub = snapshots_.lower_bound(ts); + for (auto it = snapshots_.begin(); it != ub; ++it) { + snapshots_to_release.emplace_back(it->second); + } + snapshots_.erase(snapshots_.begin(), ub); + } + + private: + std::map> snapshots_; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/db/write_thread.h b/db/write_thread.h index af4d0967e3..f78b01cd9f 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -15,6 +15,7 @@ #include #include "db/dbformat.h" +#include "db/post_memtable_callback.h" #include "db/pre_release_callback.h" #include "db/write_callback.h" #include "monitoring/instrumented_mutex.h" @@ -122,6 +123,7 @@ class WriteThread { size_t batch_cnt; // if non-zero, number of sub-batches in the write batch size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; + PostMemTableCallback* post_memtable_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference WriteCallback* callback; @@ -147,6 +149,7 @@ class WriteThread { batch_cnt(0), protection_bytes_per_key(0), pre_release_callback(nullptr), + post_memtable_callback(nullptr), log_used(0), log_ref(0), callback(nullptr), @@ -160,7 +163,8 @@ class WriteThread { Writer(const WriteOptions& write_options, WriteBatch* _batch, WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, size_t _batch_cnt = 0, - PreReleaseCallback* _pre_release_callback = nullptr) + PreReleaseCallback* _pre_release_callback = nullptr, + PostMemTableCallback* _post_memtable_callback = nullptr) : batch(_batch), sync(write_options.sync), no_slowdown(write_options.no_slowdown), @@ -170,6 +174,7 @@ class WriteThread { batch_cnt(_batch_cnt), protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), + 
post_memtable_callback(_post_memtable_callback), log_used(0), log_ref(_log_ref), callback(_callback), diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index 2a5cbd763a..c20bcd7197 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -148,6 +148,34 @@ void DbVerificationThread(void* v) { } } +void SnapshotGcThread(void* v) { + assert(FLAGS_create_timestamped_snapshot_one_in > 0); + auto* thread = reinterpret_cast(v); + assert(thread); + SharedState* shared = thread->shared; + assert(shared); + StressTest* stress_test = shared->GetStressTest(); + assert(stress_test); + while (true) { + { + MutexLock l(shared->GetMutex()); + if (shared->ShouldStopBgThread()) { + shared->IncBgThreadsFinished(); + if (shared->BgThreadsFinished()) { + shared->GetCondVar()->SignalAll(); + } + return; + } + } + + uint64_t now = db_stress_env->NowNanos(); + constexpr uint64_t time_diff = static_cast(1000) * 1000 * 1000; + stress_test->ReleaseOldTimestampedSnapshots(now - time_diff); + + db_stress_env->SleepForMicroseconds(1000 * 1000); + } +} + void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) { if (!FLAGS_verbose) { return; diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 9d392d75ad..8aa80b64d2 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -295,6 +295,8 @@ DECLARE_bool(async_io); DECLARE_string(wal_compression); DECLARE_bool(verify_sst_unique_id_in_manifest); +DECLARE_int32(create_timestamped_snapshot_one_in); + constexpr long KB = 1024; constexpr int kRandomValueMaxFactor = 3; constexpr int kValueMaxLen = 100; @@ -583,6 +585,8 @@ extern void PoolSizeChangeThread(void* v); extern void DbVerificationThread(void* v); +extern void SnapshotGcThread(void* v); + extern void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz); extern int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration); diff 
--git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 009168ae37..3c69f5408a 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -84,6 +84,10 @@ bool RunStressTest(StressTest* stress) { shared.IncBgThreads(); } + if (FLAGS_create_timestamped_snapshot_one_in > 0) { + shared.IncBgThreads(); + } + std::vector threads(n); for (uint32_t i = 0; i < n; i++) { threads[i] = new ThreadState(i, &shared); @@ -101,6 +105,11 @@ bool RunStressTest(StressTest* stress) { &continuous_verification_thread); } + ThreadState snapshots_gc_thread(0, &shared); + if (FLAGS_create_timestamped_snapshot_one_in > 0) { + db_stress_env->StartThread(SnapshotGcThread, &snapshots_gc_thread); + } + // Each thread goes through the following states: // initializing -> wait for others to init -> read/populate/depopulate // wait for others to operate -> verify -> done @@ -169,7 +178,8 @@ bool RunStressTest(StressTest* stress) { stress->PrintStatistics(); if (FLAGS_compaction_thread_pool_adjust_interval > 0 || - FLAGS_continuous_verification_interval > 0) { + FLAGS_continuous_verification_interval > 0 || + FLAGS_create_timestamped_snapshot_one_in > 0) { MutexLock l(shared.GetMutex()); shared.SetShouldStopBgThread(); while (!shared.BgThreadsFinished()) { diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 547fc7a53c..de9046e533 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -967,4 +967,8 @@ DEFINE_bool( "DB-open try verifying the SST unique id between MANIFEST and SST " "properties."); +DEFINE_int32( + create_timestamped_snapshot_one_in, 0, + "On non-zero, create timestamped snapshots upon transaction commits."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index e0179c9998..9e7fa4e2b8 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -420,6 
+420,20 @@ void StressTest::PrintStatistics() { } } +void StressTest::ReleaseOldTimestampedSnapshots(uint64_t ts) { +#ifndef ROCKSDB_LITE + if (!txn_db_) { + return; + } + assert(txn_db_); + txn_db_->ReleaseTimestampedSnapshotsOlderThan(ts); +#else + (void)ts; + fprintf(stderr, "timestamped snapshots not supported in LITE mode\n"); + exit(1); +#endif // ROCKSDB_LITE +} + // Currently PreloadDb has to be single-threaded. void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, SharedState* shared) { @@ -560,13 +574,21 @@ Status StressTest::NewTxn(WriteOptions& write_opts, Transaction** txn) { return s; } -Status StressTest::CommitTxn(Transaction* txn) { +Status StressTest::CommitTxn(Transaction* txn, ThreadState* thread) { if (!FLAGS_use_txn) { return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set"); } Status s = txn->Prepare(); + std::shared_ptr timestamped_snapshot; if (s.ok()) { - s = txn->Commit(); + if (thread && FLAGS_create_timestamped_snapshot_one_in && + thread->rand.OneIn(FLAGS_create_timestamped_snapshot_one_in)) { + uint64_t ts = db_stress_env->NowNanos(); + s = txn->CommitAndTryCreateSnapshot(/*notifier=*/nullptr, ts, + &timestamped_snapshot); + } else { + s = txn->Commit(); + } } delete txn; return s; diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index cc792fa50e..fcb89c4c2c 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -43,6 +43,8 @@ class StressTest { void PrintStatistics(); + void ReleaseOldTimestampedSnapshots(uint64_t ts); + protected: Status AssertSame(DB* db, ColumnFamilyHandle* cf, ThreadState::SnapshotState& snap_state); @@ -56,7 +58,7 @@ class StressTest { #ifndef ROCKSDB_LITE Status NewTxn(WriteOptions& write_opts, Transaction** txn); - Status CommitTxn(Transaction* txn); + Status CommitTxn(Transaction* txn, ThreadState* thread = nullptr); Status RollbackTxn(Transaction* txn); #endif diff --git
a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index d04bd96f23..1729ee3f7c 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -276,6 +276,17 @@ int db_stress_tool(int argc, char** argv) { CheckAndSetOptionsForMultiOpsTxnStressTest(); } + if (FLAGS_create_timestamped_snapshot_one_in > 0) { + if (!FLAGS_use_txn) { + fprintf(stderr, "timestamped snapshot supported only in TransactionDB\n"); + exit(1); + } else if (FLAGS_txn_write_policy != 0) { + fprintf(stderr, + "timestamped snapshot supported only in write-committed\n"); + exit(1); + } + } + #ifndef NDEBUG KillPoint* kp = KillPoint::GetInstance(); kp->rocksdb_kill_odds = FLAGS_kill_random_test; diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index cb40913d78..5c68b6e8b2 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -666,7 +666,7 @@ class NonBatchedOpsStressTest : public StressTest { if (s.ok()) { s = txn->Merge(cfh, key, v); if (s.ok()) { - s = CommitTxn(txn); + s = CommitTxn(txn, thread); } } #endif @@ -685,7 +685,7 @@ class NonBatchedOpsStressTest : public StressTest { if (s.ok()) { s = txn->Put(cfh, key, v); if (s.ok()) { - s = CommitTxn(txn); + s = CommitTxn(txn, thread); } } #endif @@ -746,7 +746,7 @@ class NonBatchedOpsStressTest : public StressTest { if (s.ok()) { s = txn->Delete(cfh, key); if (s.ok()) { - s = CommitTxn(txn); + s = CommitTxn(txn, thread); } } #endif @@ -783,7 +783,7 @@ class NonBatchedOpsStressTest : public StressTest { if (s.ok()) { s = txn->SingleDelete(cfh, key); if (s.ok()) { - s = CommitTxn(txn); + s = CommitTxn(txn, thread); } } #endif diff --git a/include/rocksdb/snapshot.h b/include/rocksdb/snapshot.h index 61281d7c49..1ea56e71e0 100644 --- a/include/rocksdb/snapshot.h +++ b/include/rocksdb/snapshot.h @@ -25,6 +25,8 @@ class Snapshot { // 00:00:00 (UTC). 
virtual int64_t GetUnixTime() const = 0; + virtual uint64_t GetTimestamp() const = 0; + protected: virtual ~Snapshot(); }; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index ecb797506c..b8f7076339 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -115,6 +115,8 @@ class TransactionNotifier { // Implement this method to receive notification when a snapshot is // requested via SetSnapshotOnNextOperation. + // Do not take exclusive ownership of `newSnapshot` because it is shared with + // the underlying transaction. virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0; }; @@ -183,6 +185,10 @@ class Transaction { // txn2->Put("A", ...); // txn2->Commit(); // txn1->GetForUpdate(opts, "A", ...); // FAIL! + // + // WriteCommittedTxn only: a new snapshot will be taken upon next operation, + // and next operation can be a Commit. + // TODO(yanqin) remove the "write-committed only" limitation. virtual void SetSnapshotOnNextOperation( std::shared_ptr notifier = nullptr) = 0; @@ -193,6 +199,10 @@ class Transaction { // is called, or the Transaction is deleted. virtual const Snapshot* GetSnapshot() const = 0; + // Returns the Snapshot created by the last call to SetSnapshot(). + // The returned snapshot can outlive the transaction. + virtual std::shared_ptr GetTimestampedSnapshot() const = 0; + // Clears the current snapshot (i.e. no snapshot will be 'set') // // This removes any snapshot that currently exists or is set to be created @@ -227,6 +237,28 @@ class Transaction { // transaction before Commit. virtual Status Commit() = 0; + // In addition to Commit(), also creates a snapshot of the db after all + // writes by this txn are visible to other readers. + // Caller is responsible for ensuring that + // snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts + // in which snapshot1 and snapshot2 are created by this API. 
+ // + // Currently only supported by WriteCommittedTxn. Calling this method on + // other types of transactions will return non-ok Status resulting from + // Commit() or a `NotSupported` error. + // This method returns OK if and only if the transaction successfully + // commits. It is possible that transaction commits successfully but fails to + // create a timestamped snapshot. Therefore, the caller should check that the + // snapshot is created. + // notifier will be notified upon next snapshot creation. Nullable. + // ret non-null output argument storing a shared_ptr to the newly created + // snapshot. + Status CommitAndTryCreateSnapshot( + std::shared_ptr notifier = + std::shared_ptr(), + TxnTimestamp ts = kMaxTxnTimestamp, + std::shared_ptr* snapshot = nullptr); + // Discard all batched writes in this transaction. virtual Status Rollback() = 0; @@ -620,6 +652,8 @@ class Transaction { return Status::NotSupported("timestamp not supported"); } + virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; } + protected: explicit Transaction(const TransactionDB* /*db*/) {} Transaction() : log_number_(0), txn_state_(STARTED) {} diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index eaf2bc1282..aefcd6de15 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -458,6 +458,42 @@ class TransactionDB : public StackableDB { virtual std::vector GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; + // Create a snapshot and assign ts to it. Return the snapshot to caller. The + // snapshot-timestamp mapping is also tracked by the database. + // Caller must ensure there are no active writes when this API is called. + virtual std::pair> + CreateTimestampedSnapshot(TxnTimestamp ts) = 0; + + // Return the latest timestamped snapshot if present. 
+ std::shared_ptr GetLatestTimestampedSnapshot() const { + return GetTimestampedSnapshot(kMaxTxnTimestamp); + } + // Return the snapshot corresponding to given timestamp. If ts is + // kMaxTxnTimestamp, then we return the latest timestamped snapshot if + // present. Otherwise, we return the snapshot whose timestamp is equal to + // `ts`. If no such snapshot exists, then we return null. + virtual std::shared_ptr GetTimestampedSnapshot( + TxnTimestamp ts) const = 0; + // Release timestamped snapshots whose timestamps are less than or equal to + // ts. + virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0; + + // Get all timestamped snapshots which will be stored in + // timestamped_snapshots. + Status GetAllTimestampedSnapshots( + std::vector>& timestamped_snapshots) + const { + return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp, + timestamped_snapshots); + } + + // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub). + // timestamped_snapshots will be cleared and contain returned snapshots.
+ virtual Status GetTimestampedSnapshots( + TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& timestamped_snapshots) + const = 0; + protected: // To Create an TransactionDB, call Open() // The ownership of db is transferred to the base StackableDB diff --git a/src.mk b/src.mk index 0a77e7f469..dbd9fa15aa 100644 --- a/src.mk +++ b/src.mk @@ -594,6 +594,7 @@ TEST_MAIN_SOURCES = \ utilities/transactions/write_prepared_transaction_test.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/transactions/write_committed_transaction_ts_test.cc \ + utilities/transactions/timestamped_snapshot_test.cc \ utilities/ttl/ttl_test.cc \ utilities/util_merge_operators_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 1c24887ad7..66d8bf02c6 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -318,6 +318,7 @@ txn_params = { "checkpoint_one_in": 0, # pipeline write is not currnetly compatible with WritePrepared txns "enable_pipelined_write": 0, + "create_timestamped_snapshot_one_in": random.choice([0, 20]), } best_efforts_recovery_params = { @@ -533,6 +534,11 @@ def finalize_and_sanitize(src_params): if dest_params["secondary_cache_uri"] != "": # Currently the only cache type compatible with a secondary cache is LRUCache dest_params["cache_type"] = "lru_cache" + # Remove the following once write-prepared/write-unprepared with/without + # unordered write supports timestamped snapshots + if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0: + dest_params["txn_write_policy"] = 0 + dest_params["unordered_write"] = 0 return dest_params diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 26e9c0b1ea..1447ac8d1d 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -22,8 +22,6 @@ #include 
"util/crc32c.h" #include "util/random.h" -using std::string; - namespace ROCKSDB_NAMESPACE { class OptimisticTransactionTest @@ -31,7 +29,7 @@ class OptimisticTransactionTest public testing::WithParamInterface { public: OptimisticTransactionDB* txn_db; - string dbname; + std::string dbname; Options options; OptimisticTransactionTest() { @@ -78,7 +76,7 @@ private: TEST_P(OptimisticTransactionTest, SuccessTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); @@ -105,7 +103,7 @@ TEST_P(OptimisticTransactionTest, SuccessTest) { TEST_P(OptimisticTransactionTest, WriteConflictTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); @@ -138,7 +136,7 @@ TEST_P(OptimisticTransactionTest, WriteConflictTest2) { WriteOptions write_options; ReadOptions read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); @@ -172,7 +170,7 @@ TEST_P(OptimisticTransactionTest, ReadConflictTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); @@ -211,7 +209,7 @@ TEST_P(OptimisticTransactionTest, TxnOnlyTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_NE(txn, nullptr); @@ -226,7 +224,7 @@ TEST_P(OptimisticTransactionTest, TxnOnlyTest) { TEST_P(OptimisticTransactionTest, FlushTest) { WriteOptions 
write_options; ReadOptions read_options, snapshot_read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); @@ -263,7 +261,7 @@ TEST_P(OptimisticTransactionTest, FlushTest) { TEST_P(OptimisticTransactionTest, FlushTest2) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); @@ -324,7 +322,7 @@ TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) { ReadOptions read_options; ReadOptions snapshot_read_options; ReadOptions snapshot_read_options2; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); @@ -426,7 +424,7 @@ TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) { TEST_P(OptimisticTransactionTest, NoSnapshotTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); @@ -453,7 +451,7 @@ TEST_P(OptimisticTransactionTest, NoSnapshotTest) { TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); ASSERT_OK(txn_db->Put(write_options, "BBB", "bar")); @@ -549,7 +547,7 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; ColumnFamilyHandle *cfa, *cfb; ColumnFamilyOptions cf_options; @@ -705,7 +703,7 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { TEST_P(OptimisticTransactionTest, EmptyTest) { 
WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa")); @@ -739,7 +737,7 @@ TEST_P(OptimisticTransactionTest, PredicateManyPreceders) { WriteOptions write_options; ReadOptions read_options1, read_options2; OptimisticTransactionOptions txn_options; - string value; + std::string value; txn_options.set_snapshot = true; Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options); @@ -804,7 +802,7 @@ TEST_P(OptimisticTransactionTest, LostUpdate) { WriteOptions write_options; ReadOptions read_options, read_options1, read_options2; OptimisticTransactionOptions txn_options; - string value; + std::string value; // Test 2 transactions writing to the same key in multiple orders and // with/without snapshots @@ -892,7 +890,7 @@ TEST_P(OptimisticTransactionTest, LostUpdate) { TEST_P(OptimisticTransactionTest, UntrackedWrites) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; Status s; // Verify transaction rollback works for untracked keys. 
@@ -942,7 +940,7 @@ TEST_P(OptimisticTransactionTest, IteratorTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; // Write some keys to the db ASSERT_OK(txn_db->Put(write_options, "A", "a")); @@ -1047,7 +1045,7 @@ TEST_P(OptimisticTransactionTest, SavepointTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_NE(txn, nullptr); @@ -1169,7 +1167,7 @@ TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; OptimisticTransactionOptions txn_options; - string value; + std::string value; ASSERT_OK(txn_db->Put(write_options, "A", "")); @@ -1392,6 +1390,21 @@ TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { delete transaction; } +TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) { + std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) { + std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + std::shared_ptr snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + INSTANTIATE_TEST_CASE_P( InstanceOccGroup, OptimisticTransactionTest, testing::Values(OccValidationPolicy::kValidateSerial, diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index b473170609..6266387a9a 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -689,10 +689,21 @@ 
Status WriteCommittedTxn::CommitWithoutPrepareInternal() { } uint64_t seq_used = kMaxSequenceNumber; - auto s = - db_impl_->WriteImpl(write_options_, wb, - /*callback*/ nullptr, /*log_used*/ nullptr, - /*log_ref*/ 0, /*disable_memtable*/ false, &seq_used); + SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_, + snapshot_notifier_, snapshot_); + PostMemTableCallback* post_mem_cb = nullptr; + if (snapshot_needed_) { + if (commit_timestamp_ == kMaxTxnTimestamp) { + return Status::InvalidArgument("Must set transaction commit timestamp"); + } else { + post_mem_cb = &snapshot_creation_cb; + } + } + auto s = db_impl_->WriteImpl(write_options_, wb, + /*callback*/ nullptr, /*log_used*/ nullptr, + /*log_ref*/ 0, /*disable_memtable*/ false, + &seq_used, /*batch_cnt=*/0, + /*pre_release_callback=*/nullptr, post_mem_cb); assert(!s.ok() || seq_used != kMaxSequenceNumber); if (s.ok()) { SetId(seq_used); @@ -764,9 +775,22 @@ Status WriteCommittedTxn::CommitInternal() { assert(s.ok()); uint64_t seq_used = kMaxSequenceNumber; + SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_, + snapshot_notifier_, snapshot_); + PostMemTableCallback* post_mem_cb = nullptr; + if (snapshot_needed_) { + if (commit_timestamp_ == kMaxTxnTimestamp) { + s = Status::InvalidArgument("Must set transaction commit timestamp"); + return s; + } else { + post_mem_cb = &snapshot_creation_cb; + } + } s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr, /*log_used*/ nullptr, /*log_ref*/ log_number_, - /*disable_memtable*/ false, &seq_used); + /*disable_memtable*/ false, &seq_used, + /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, + post_mem_cb); assert(!s.ok() || seq_used != kMaxSequenceNumber); if (s.ok()) { SetId(seq_used); diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h index 609bcd6005..d43d1d3ac5 100644 --- a/utilities/transactions/pessimistic_transaction.h +++ 
b/utilities/transactions/pessimistic_transaction.h @@ -277,6 +277,7 @@ class WriteCommittedTxn : public PessimisticTransaction { Status SetReadTimestampForValidation(TxnTimestamp ts) override; Status SetCommitTimestamp(TxnTimestamp ts) override; + TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; } private: template diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index c1e3a2ab2e..45460dd2fa 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -656,5 +656,68 @@ void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) { transactions_.erase(it); } +std::pair> +PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) { + if (kMaxTxnTimestamp == ts) { + return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr); + } + assert(db_impl_); + return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts); +} + +std::shared_ptr +PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshot(ts); +} + +void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan( + TxnTimestamp ts) { + assert(db_impl_); + db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts); +} + +Status PessimisticTransactionDB::GetTimestampedSnapshots( + TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& timestamped_snapshots) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub, timestamped_snapshots); +} + +Status SnapshotCreationCallback::operator()(SequenceNumber seq, + bool disable_memtable) { + assert(db_impl_); + assert(commit_ts_ != kMaxTxnTimestamp); + + const bool two_write_queues = + db_impl_->immutable_db_options().two_write_queues; + assert(!two_write_queues || !disable_memtable); +#ifdef NDEBUG + (void)two_write_queues; + (void)disable_memtable; +#endif + + const bool 
seq_per_batch = db_impl_->seq_per_batch(); + if (!seq_per_batch) { + assert(db_impl_->GetLastPublishedSequence() <= seq); + } else { + assert(db_impl_->GetLastPublishedSequence() < seq); + } + + // Create a snapshot which can also be used for write conflict checking. + auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_); + snapshot_creation_status_ = ret.first; + snapshot_ = ret.second; + if (snapshot_creation_status_.ok()) { + assert(snapshot_); + } else { + assert(!snapshot_); + } + if (snapshot_ && snapshot_notifier_) { + snapshot_notifier_->SnapshotCreated(snapshot_.get()); + } + return Status::OK(); +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index c0a4b97362..68b6227ef3 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -150,6 +150,18 @@ class PessimisticTransactionDB : public TransactionDB { return lock_manager_->GetLockTrackerFactory(); } + std::pair> CreateTimestampedSnapshot( + TxnTimestamp ts) override; + + std::shared_ptr GetTimestampedSnapshot( + TxnTimestamp ts) const override; + + void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override; + + Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& + timestamped_snapshots) const override; + protected: DBImpl* db_impl_; std::shared_ptr info_log_; @@ -255,5 +267,33 @@ inline Status PessimisticTransactionDB::FailIfCfEnablesTs( return Status::OK(); } +class SnapshotCreationCallback : public PostMemTableCallback { + public: + explicit SnapshotCreationCallback( + DBImpl* dbi, TxnTimestamp commit_ts, + const std::shared_ptr& notifier, + std::shared_ptr& snapshot) + : db_impl_(dbi), + commit_ts_(commit_ts), + snapshot_notifier_(notifier), + snapshot_(snapshot) { + assert(db_impl_); + } + + ~SnapshotCreationCallback() override { + 
snapshot_creation_status_.PermitUncheckedError(); + } + + Status operator()(SequenceNumber seq, bool disable_memtable) override; + + private: + DBImpl* const db_impl_; + const TxnTimestamp commit_ts_; + std::shared_ptr snapshot_notifier_; + std::shared_ptr& snapshot_; + + Status snapshot_creation_status_; +}; + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/utilities/transactions/timestamped_snapshot_test.cc b/utilities/transactions/timestamped_snapshot_test.cc new file mode 100644 index 0000000000..63e53a6b73 --- /dev/null +++ b/utilities/transactions/timestamped_snapshot_test.cc @@ -0,0 +1,426 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef ROCKSDB_LITE +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n"); + return 0; +} +#else // ROCKSDB_LITE +#include + +#include "util/cast_util.h" +#include "utilities/transactions/transaction_test.h" + +namespace ROCKSDB_NAMESPACE { +INSTANTIATE_TEST_CASE_P( + Unsupported, TimestampedSnapshotWithTsSanityCheck, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite))); + +INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Values(WRITE_COMMITTED), + ::testing::Values(kOrderedWrite))); + +namespace { +// Not thread-safe. Caller needs to provide external synchronization. 
+class TsCheckingTxnNotifier : public TransactionNotifier { + public: + explicit TsCheckingTxnNotifier() = default; + + ~TsCheckingTxnNotifier() override {} + + void SnapshotCreated(const Snapshot* new_snapshot) override { + assert(new_snapshot); + if (prev_snapshot_seq_ != kMaxSequenceNumber) { + assert(prev_snapshot_seq_ <= new_snapshot->GetSequenceNumber()); + } + prev_snapshot_seq_ = new_snapshot->GetSequenceNumber(); + if (prev_snapshot_ts_ != kMaxTxnTimestamp) { + assert(prev_snapshot_ts_ <= new_snapshot->GetTimestamp()); + } + prev_snapshot_ts_ = new_snapshot->GetTimestamp(); + } + + TxnTimestamp prev_snapshot_ts() const { return prev_snapshot_ts_; } + + private: + SequenceNumber prev_snapshot_seq_ = kMaxSequenceNumber; + TxnTimestamp prev_snapshot_ts_ = kMaxTxnTimestamp; +}; +} // anonymous namespace + +TEST_P(TimestampedSnapshotWithTsSanityCheck, WithoutCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TimestampedSnapshotWithTsSanityCheck, SetCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + 
ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_P(TransactionTest, WithoutCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TransactionTest, CreateSnapshotWhenCommit) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + + constexpr int batch_size = 10; + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), "v0")); + } + const SequenceNumber seq0 = db->GetLatestSequenceNumber(); + ASSERT_EQ(static_cast(batch_size), seq0); + + txn->SetSnapshot(); + { + const Snapshot* const snapshot = txn->GetSnapshot(); + assert(snapshot); + ASSERT_EQ(seq0, snapshot->GetSequenceNumber()); + } + + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(txn->Put("k" + std::to_string(i), "v1")); + } + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Prepare()); + + std::shared_ptr snapshot; + constexpr TxnTimestamp timestamp = 1; + auto notifier = std::make_shared(); + Status s = txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot); + ASSERT_OK(s); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + assert(snapshot); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_EQ(seq0 + batch_size, snapshot->GetSequenceNumber()); + const Snapshot* const raw_snapshot_ptr = txn->GetSnapshot(); + ASSERT_EQ(raw_snapshot_ptr, snapshot.get()); + ASSERT_EQ(snapshot, 
txn->GetTimestampedSnapshot()); + + { + std::shared_ptr snapshot1 = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::shared_ptr snapshot1 = + db->GetTimestampedSnapshot(timestamp); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::vector > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector >{snapshot}, + snapshots); + } +} + +TEST_P(TransactionTest, CreateSnapshot) { + // First create a non-timestamped snapshot + ManagedSnapshot snapshot_guard(db); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), + "v0_" + std::to_string(i))); + } + { + auto ret = db->CreateTimestampedSnapshot(kMaxTxnTimestamp); + ASSERT_TRUE(ret.first.IsInvalidArgument()); + auto snapshot = ret.second; + ASSERT_EQ(nullptr, snapshot.get()); + } + constexpr TxnTimestamp timestamp = 100; + Status s; + std::shared_ptr ts_snap0; + std::tie(s, ts_snap0) = db->CreateTimestampedSnapshot(timestamp); + ASSERT_OK(s); + assert(ts_snap0); + ASSERT_EQ(timestamp, ts_snap0->GetTimestamp()); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Delete(WriteOptions(), "k" + std::to_string(i))); + } + { + ReadOptions read_opts; + read_opts.snapshot = ts_snap0.get(); + for (int i = 0; i < 10; ++i) { + std::string value; + s = db->Get(read_opts, "k" + std::to_string(i), &value); + ASSERT_OK(s); + ASSERT_EQ("v0_" + std::to_string(i), value); + } + } + { + std::shared_ptr snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(ts_snap0, snapshot); + } + { + std::shared_ptr snapshot = + db->GetTimestampedSnapshot(timestamp); + ASSERT_OK(s); + ASSERT_EQ(ts_snap0, snapshot); + } + { + std::vector > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector >{ts_snap0}, + snapshots); + } +} + +TEST_P(TransactionTest, SequenceAndTsOrder) { + Status s; + std::shared_ptr snapshot; + std::tie(s, snapshot) = db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); 
+ assert(snapshot); + { + // Cannot request smaller timestamp for the new timestamped snapshot. + std::shared_ptr tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(50); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + + // If requesting a new timestamped snapshot with the same timestamp and + // sequence number, we avoid creating new snapshot object but reuse + // existing one. + std::shared_ptr snapshot1; + std::tie(s, snapshot1) = db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); + ASSERT_EQ(snapshot.get(), snapshot1.get()); + + // If there is no write, but we request a larger timestamp, we still create + // a new snapshot object. + std::shared_ptr snapshot2; + std::tie(s, snapshot2) = db->CreateTimestampedSnapshot(200); + ASSERT_OK(s); + assert(snapshot2); + ASSERT_NE(snapshot.get(), snapshot2.get()); + ASSERT_EQ(snapshot2->GetSequenceNumber(), snapshot->GetSequenceNumber()); + ASSERT_EQ(200, snapshot2->GetTimestamp()); + + // Increase sequence number. + ASSERT_OK(db->Put(WriteOptions(), "foo", "v0")); + { + // We are requesting the same timestamp for a larger sequence number, thus + // we cannot create timestamped snapshot. + std::shared_ptr tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(200); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + { + std::unique_ptr txn1( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn1->Put("bar", "v0")); + std::shared_ptr ss; + ASSERT_OK(txn1->CommitAndTryCreateSnapshot(nullptr, 200, &ss)); + // Cannot create snapshot because requested timestamp is the same as the + // latest timestamped snapshot while sequence number is strictly higher. + ASSERT_EQ(nullptr, ss); + } + { + std::unique_ptr txn2( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn2->Put("bar", "v0")); + std::shared_ptr ss; + // Application should never do this.
This is just to demonstrate error + // handling. + ASSERT_OK(txn2->CommitAndTryCreateSnapshot(nullptr, 100, &ss)); + // Cannot create snapshot because requested timestamp is smaller than + // latest timestamped snapshot. + ASSERT_EQ(nullptr, ss); + } +} + +TEST_P(TransactionTest, CloseDbWithSnapshots) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("foo", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr snapshot; + constexpr TxnTimestamp timestamp = 121; + auto notifier = std::make_shared(); + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot)); + assert(snapshot); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_TRUE(db->Close().IsAborted()); +} + +TEST_P(TransactionTest, MultipleTimestampedSnapshots) { + auto* dbimpl = static_cast_with_check(db->GetRootDB()); + assert(dbimpl); + const bool seq_per_batch = dbimpl->seq_per_batch(); + // TODO: remove the following assert(!seq_per_batch) once timestamped snapshot + // is supported in write-prepared/write-unprepared transactions. 
+ assert(!seq_per_batch); + constexpr size_t txn_size = 10; + constexpr TxnTimestamp ts_delta = 10; + constexpr size_t num_txns = 100; + std::vector > snapshots(num_txns); + constexpr TxnTimestamp start_ts = 10000; + auto notifier = std::make_shared(); + for (size_t i = 0; i < num_txns; ++i) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn" + std::to_string(i))); + for (size_t j = 0; j < txn_size; ++j) { + ASSERT_OK(txn->Put("k" + std::to_string(j), + "v" + std::to_string(j) + "_" + std::to_string(i))); + } + if (0 == (i % 2)) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, start_ts + i * ts_delta, + &snapshots[i])); + assert(snapshots[i]); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + { + auto snapshot = db->GetTimestampedSnapshot(start_ts + 1); + ASSERT_EQ(nullptr, snapshot); + } + + constexpr TxnTimestamp max_ts = start_ts + num_txns * ts_delta; + for (size_t i = 0; i < num_txns; ++i) { + auto snapshot = db->GetTimestampedSnapshot(start_ts + i * ts_delta); + ASSERT_EQ(snapshots[i], snapshot); + + std::vector > tmp_snapshots; + Status s = db->GetTimestampedSnapshots(max_ts, start_ts + i * ts_delta, + tmp_snapshots); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_TRUE(tmp_snapshots.empty()); + + for (size_t j = i; j < num_txns; ++j) { + std::vector > expected_snapshots( + snapshots.begin() + i, snapshots.begin() + j); + tmp_snapshots.clear(); + s = db->GetTimestampedSnapshots(start_ts + i * ts_delta, + start_ts + j * ts_delta, tmp_snapshots); + if (i < j) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsInvalidArgument()); + } + ASSERT_EQ(expected_snapshots, tmp_snapshots); + } + } + + { + std::vector > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots, tmp_snapshots); + + const std::shared_ptr 
latest_snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshots.back(), latest_snapshot); + } + + for (size_t i = 0; i <= num_txns; ++i) { + std::vector > snapshots1( + snapshots.begin() + i, snapshots.end()); + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + assert(snapshot1); + ASSERT_EQ(start_ts + (i - 1) * ts_delta, snapshot1->GetTimestamp()); + } + + db->ReleaseTimestampedSnapshotsOlderThan(start_ts + i * ts_delta); + + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + ASSERT_EQ(nullptr, snapshot1); + } + + std::vector > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots1, tmp_snapshots); + } + + // Even after released by db, the applications still hold reference to shared + // snapshots. + for (size_t i = 0; i < num_txns; ++i) { + assert(snapshots[i]); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + snapshots.clear(); + ASSERT_OK(db->Close()); + delete db; + db = nullptr; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // !ROCKSDB_LITE diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 0149ffd501..53d54abfb9 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -21,6 +21,43 @@ namespace ROCKSDB_NAMESPACE { +Status Transaction::CommitAndTryCreateSnapshot( + std::shared_ptr notifier, TxnTimestamp ts, + std::shared_ptr* snapshot) { + if (snapshot) { + snapshot->reset(); + } + TxnTimestamp commit_ts = GetCommitTimestamp(); + if (commit_ts == kMaxTxnTimestamp) { + if (ts == kMaxTxnTimestamp) { + return Status::InvalidArgument("Commit timestamp unset"); + } else { + const Status s = SetCommitTimestamp(ts); + if (!s.ok()) { + return s; + } + } + } else if 
(ts != kMaxTxnTimestamp) { + if (ts != commit_ts) { + // For now we treat this as error. + return Status::InvalidArgument("Different commit ts specified"); + } + } + SetSnapshotOnNextOperation(notifier); + Status s = Commit(); + if (!s.ok()) { + return s; + } + assert(s.ok()); + // If we reach here, we must return ok status for this function. + std::shared_ptr new_snapshot = GetTimestampedSnapshot(); + + if (snapshot) { + *snapshot = new_snapshot; + } + return Status::OK(); +} + TransactionBaseImpl::TransactionBaseImpl( DB* db, const WriteOptions& write_options, const LockTrackerFactory& lock_tracker_factory) diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index e4da8d44b9..504d692bfb 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -206,6 +206,10 @@ class TransactionBaseImpl : public Transaction { return snapshot_.get(); } + std::shared_ptr GetTimestampedSnapshot() const override { + return snapshot_; + } + virtual void SetSnapshot() override; void SetSnapshotOnNextOperation( std::shared_ptr notifier = nullptr) override; @@ -346,7 +350,9 @@ class TransactionBaseImpl : public Transaction { save_points_; private: + friend class WriteCommittedTxn; friend class WritePreparedTxn; + // Extra data to be persisted with the commit. Note this is only used when // prepare phase is not skipped. 
WriteBatch commit_time_batch_; diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index d4a516dc9c..71eb9b0735 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -31,8 +31,6 @@ #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" -using std::string; - namespace ROCKSDB_NAMESPACE { INSTANTIATE_TEST_CASE_P( @@ -333,7 +331,7 @@ TEST_P(TransactionTest, WaitingTxn) { WriteOptions write_options; ReadOptions read_options; TransactionOptions txn_options; - string value; + std::string value; Status s; txn_options.lock_timeout = 1; @@ -994,7 +992,7 @@ TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) { TransactionOptions txn_options; txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery; - string value; + std::string value; Status s; DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); @@ -2165,7 +2163,7 @@ TEST_P(TransactionTest, WriteOptionsTest) { TEST_P(TransactionTest, WriteConflictTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; Status s; ASSERT_OK(db->Put(write_options, "foo", "A")); @@ -2370,7 +2368,7 @@ TEST_P(TransactionTest, FlushTest2) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; TransactionOptions txn_options; - string value; + std::string value; DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); @@ -2647,7 +2645,7 @@ TEST_P(TransactionTest, ColumnFamiliesTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; TransactionOptions txn_options; - string value; + std::string value; Status s; ColumnFamilyHandle *cfa, *cfb; @@ -2814,7 +2812,7 @@ TEST_P(TransactionTest, MultiGetBatchedTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; TransactionOptions txn_options; - string value; + std::string value; 
Status s; ColumnFamilyHandle* cf; @@ -2904,7 +2902,7 @@ TEST_P(TransactionTest, MultiGetBatchedTest) { TEST_P(TransactionTest, MultiGetLargeBatchedTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; - string value; + std::string value; Status s; ColumnFamilyHandle* cf; @@ -3037,7 +3035,7 @@ TEST_P(TransactionTest, MultiGetSnapshot) { TEST_P(TransactionTest, ColumnFamiliesTest2) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; - string value; + std::string value; Status s; ColumnFamilyHandle *one, *two; @@ -3113,7 +3111,7 @@ TEST_P(TransactionTest, ColumnFamiliesTest2) { TEST_P(TransactionTest, EmptyTest) { WriteOptions write_options; ReadOptions read_options; - string value; + std::string value; Status s; s = db->Put(write_options, "aaa", "aaa"); @@ -3156,7 +3154,7 @@ TEST_P(TransactionTest, PredicateManyPreceders) { WriteOptions write_options; ReadOptions read_options1, read_options2; TransactionOptions txn_options; - string value; + std::string value; Status s; txn_options.set_snapshot = true; @@ -3400,7 +3398,7 @@ TEST_P(TransactionTest, ExpiredTransaction) { WriteOptions write_options; ReadOptions read_options; TransactionOptions txn_options; - string value; + std::string value; Status s; // Set txn expiration timeout to 0 microseconds (expires instantly) @@ -3589,7 +3587,7 @@ TEST_P(TransactionTest, LockLimitTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; TransactionOptions txn_options; - string value; + std::string value; Status s; delete db; @@ -5414,7 +5412,7 @@ TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) { ASSERT_OK(s); ReadOptions read_options; - string value; + std::string value; s = db->Get(read_options, "X", &value); ASSERT_OK(s); ASSERT_EQ("1", value); @@ -6021,7 +6019,7 @@ TEST_P(TransactionTest, DuplicateKeys) { ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4"))); ASSERT_OK(db->Write(write_options, &batch)); ReadOptions 
read_options; - string value; + std::string value; ASSERT_OK(db->Get(read_options, cf_handle, "key", &value)); ASSERT_EQ(value, "value,1,2,3,4"); delete cf_handle; diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 2780cf24de..52e8acd444 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -562,4 +562,22 @@ class WriteCommittedTxnWithTsTest std::vector handles_{}; }; +class TimestampedSnapshotWithTsSanityCheck + : public TransactionTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + explicit TimestampedSnapshotWithTsSanityCheck() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())) {} + ~TimestampedSnapshotWithTsSanityCheck() override { + for (auto* h : handles_) { + delete h; + } + } + + protected: + std::vector handles_{}; +}; + } // namespace ROCKSDB_NAMESPACE