2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
#include <cstdint>
|
2016-12-16 19:17:26 +00:00
|
|
|
#include <functional>
|
2023-05-24 18:57:15 +00:00
|
|
|
#include <memory>
|
2015-05-29 21:36:35 +00:00
|
|
|
#include <string>
|
2016-03-03 19:20:25 +00:00
|
|
|
#include <thread>
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2019-06-11 18:42:19 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2020-11-17 01:58:59 +00:00
|
|
|
#include "db/db_test_util.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "port/port.h"
|
2015-05-29 21:36:35 +00:00
|
|
|
#include "rocksdb/db.h"
|
2019-06-11 18:42:19 +00:00
|
|
|
#include "rocksdb/perf_context.h"
|
2015-05-29 21:36:35 +00:00
|
|
|
#include "rocksdb/utilities/optimistic_transaction_db.h"
|
2016-03-03 19:20:25 +00:00
|
|
|
#include "rocksdb/utilities/transaction.h"
|
2019-06-11 18:42:19 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/transaction_test_util.h"
|
2016-03-03 19:20:25 +00:00
|
|
|
#include "util/crc32c.h"
|
|
|
|
#include "util/random.h"
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
class OptimisticTransactionTest
|
|
|
|
: public testing::Test,
|
|
|
|
public testing::WithParamInterface<OccValidationPolicy> {
|
2015-05-29 21:36:35 +00:00
|
|
|
public:
|
2023-05-24 18:57:15 +00:00
|
|
|
std::unique_ptr<OptimisticTransactionDB> txn_db;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string dbname;
|
2015-05-29 21:36:35 +00:00
|
|
|
Options options;
|
2023-05-24 18:57:15 +00:00
|
|
|
OptimisticTransactionDBOptions occ_opts;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
OptimisticTransactionTest() {
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.max_write_buffer_number = 2;
|
2021-12-02 19:44:29 +00:00
|
|
|
options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
|
2020-11-17 01:58:59 +00:00
|
|
|
options.merge_operator.reset(new TestPutOperator());
|
2023-05-24 18:57:15 +00:00
|
|
|
occ_opts.validate_policy = GetParam();
|
2018-07-14 00:18:39 +00:00
|
|
|
dbname = test::PerThreadDBPath("optimistic_transaction_testdb");
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2022-08-11 00:34:38 +00:00
|
|
|
EXPECT_OK(DestroyDB(dbname, options));
|
2016-11-09 20:12:40 +00:00
|
|
|
Open();
|
2015-05-29 21:36:35 +00:00
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
~OptimisticTransactionTest() override {
|
2023-05-24 18:57:15 +00:00
|
|
|
EXPECT_OK(txn_db->Close());
|
|
|
|
txn_db.reset();
|
2022-08-11 00:34:38 +00:00
|
|
|
EXPECT_OK(DestroyDB(dbname, options));
|
2015-05-29 21:36:35 +00:00
|
|
|
}
|
2016-11-09 20:12:40 +00:00
|
|
|
|
|
|
|
void Reopen() {
|
2023-05-24 18:57:15 +00:00
|
|
|
txn_db.reset();
|
2016-11-09 20:12:40 +00:00
|
|
|
Open();
|
|
|
|
}
|
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
static void OpenImpl(const Options& options,
|
|
|
|
const OptimisticTransactionDBOptions& occ_opts,
|
|
|
|
const std::string& dbname,
|
|
|
|
std::unique_ptr<OptimisticTransactionDB>* txn_db) {
|
2020-01-07 22:19:06 +00:00
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
2024-03-04 18:08:32 +00:00
|
|
|
column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
|
2023-05-24 18:57:15 +00:00
|
|
|
OptimisticTransactionDB* raw_txn_db = nullptr;
|
|
|
|
Status s = OptimisticTransactionDB::Open(
|
|
|
|
options, occ_opts, dbname, column_families, &handles, &raw_txn_db);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(s);
|
2023-05-24 18:57:15 +00:00
|
|
|
ASSERT_NE(raw_txn_db, nullptr);
|
|
|
|
txn_db->reset(raw_txn_db);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_EQ(handles.size(), 1);
|
2020-01-07 22:19:06 +00:00
|
|
|
delete handles[0];
|
2016-11-09 20:12:40 +00:00
|
|
|
}
|
2023-05-24 18:57:15 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
void Open() { OpenImpl(options, occ_opts, dbname, &txn_db); }
|
2015-05-29 21:36:35 +00:00
|
|
|
};
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, SuccessTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, WriteConflictTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("foo", "bar2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with the previous write
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
2015-08-25 02:13:18 +00:00
|
|
|
ASSERT_EQ(1, txn->GetNumKeys());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy()); // Txn should not commit
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Verify that transaction did not write anything
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, WriteConflictTest2) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with a later write
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put(
|
|
|
|
"foo", "bar2")); // Conflicts with write done after snapshot taken
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy()); // Txn should not commit
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Verify that transaction did not write anything
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2022-09-24 00:29:05 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, WriteConflictTest3) {
|
|
|
|
ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
|
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(WriteOptions());
|
|
|
|
ASSERT_NE(txn, nullptr);
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_OK(txn->Merge("foo", "bar3"));
|
|
|
|
|
|
|
|
// Merge outside of a transaction should conflict with the previous merge
|
|
|
|
ASSERT_OK(txn_db->Merge(WriteOptions(), "foo", "bar2"));
|
|
|
|
ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
ASSERT_EQ(1, txn->GetNumKeys());
|
|
|
|
|
|
|
|
Status s = txn->Commit();
|
|
|
|
EXPECT_TRUE(s.IsBusy()); // Txn should not commit
|
|
|
|
|
|
|
|
// Verify that transaction did not write anything
|
|
|
|
ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(OptimisticTransactionTest, WriteConflict4) {
|
|
|
|
ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
|
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(WriteOptions());
|
|
|
|
ASSERT_NE(txn, nullptr);
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_OK(txn->Merge("foo", "bar3"));
|
|
|
|
|
|
|
|
// Range delete outside of a transaction should conflict with the previous
|
|
|
|
// merge inside txn
|
|
|
|
auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
|
|
|
|
ColumnFamilyHandle* default_cf = dbimpl->DefaultColumnFamily();
|
|
|
|
ASSERT_OK(dbimpl->DeleteRange(WriteOptions(), default_cf, "foo", "foo1"));
|
|
|
|
Status s = txn_db->Get(ReadOptions(), "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
ASSERT_EQ(1, txn->GetNumKeys());
|
|
|
|
|
|
|
|
s = txn->Commit();
|
|
|
|
EXPECT_TRUE(s.IsBusy()); // Txn should not commit
|
|
|
|
|
|
|
|
// Verify that transaction did not write anything
|
|
|
|
s = txn_db->Get(ReadOptions(), "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, ReadConflictTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
// This Put outside of a transaction will conflict with the previous read
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy()); // Txn should not commit
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Verify that transaction did not write anything
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barz");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "foo2", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, TxnOnlyTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
// Test to make sure transactions work when there are no other writes in an
|
|
|
|
// empty db.
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("x", "y"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, FlushTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// Put a random key so we have a memtable to flush
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
FlushOptions flush_ops;
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Flush(flush_ops));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// txn should commit since the flushed table is still in MemtableList History
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
namespace {
|
|
|
|
void FlushTest2PopulateTxn(Transaction* txn) {
|
|
|
|
ReadOptions snapshot_read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_P(OptimisticTransactionTest, FlushTest2) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
std::string value;
|
|
|
|
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
|
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
|
|
|
ASSERT_NE(txn, nullptr);
|
|
|
|
|
|
|
|
FlushTest2PopulateTxn(txn);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Put a random key so we have a MemTable to flush
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
FlushOptions flush_ops;
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Flush(flush_ops));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Put a random key so we have a MemTable to flush
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// force a memtable flush
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Flush(flush_ops));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy3"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// force a memtable flush
|
|
|
|
// Since our test db has max_write_buffer_number=2, this flush will cause
|
|
|
|
// the first memtable to get purged from the MemtableList history.
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Flush(flush_ops));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
2015-05-29 21:36:35 +00:00
|
|
|
// txn should not commit since MemTableList History is not large enough
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
// simply trying Commit again doesn't help
|
|
|
|
s = txn->Commit();
|
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
// But rolling back and redoing does
|
|
|
|
ASSERT_OK(txn->Rollback());
|
|
|
|
|
|
|
|
FlushTest2PopulateTxn(txn);
|
|
|
|
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
|
|
|
|
ASSERT_OK(txn_db->Get(read_options, "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2019-06-11 18:42:19 +00:00
|
|
|
// Trigger the condition where some old memtables are skipped when doing
|
|
|
|
// TransactionUtil::CheckKey(), and make sure the result is still correct.
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
|
2019-06-11 18:42:19 +00:00
|
|
|
const int kAttemptHistoryMemtable = 0;
|
|
|
|
const int kAttemptImmMemTable = 1;
|
|
|
|
for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
|
|
|
|
attempt++) {
|
|
|
|
Reopen();
|
|
|
|
|
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
ReadOptions snapshot_read_options;
|
|
|
|
ReadOptions snapshot_read_options2;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2019-06-11 18:42:19 +00:00
|
|
|
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
|
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn != nullptr);
|
|
|
|
|
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn2 != nullptr);
|
|
|
|
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
|
|
|
|
|
|
|
|
snapshot_read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
|
|
|
|
ASSERT_EQ(value, "bar");
|
|
|
|
ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
|
|
|
|
|
|
|
|
// txn updates "foo" and txn2 updates "foo2", and now a write is
|
|
|
|
// issued for "foo", which conflicts with txn but not txn2
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
|
|
|
|
|
|
|
|
if (attempt == kAttemptImmMemTable) {
|
|
|
|
// For the second attempt, hold flush from beginning. The memtable
|
|
|
|
// will be switched to immutable after calling TEST_SwitchMemtable()
|
|
|
|
// while CheckKey() is called.
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
2019-06-11 18:42:19 +00:00
|
|
|
{{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
|
|
|
|
"FlushJob::Start"}});
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
2019-06-11 18:42:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// force a memtable flush. The memtable should still be kept
|
|
|
|
FlushOptions flush_ops;
|
|
|
|
if (attempt == kAttemptHistoryMemtable) {
|
|
|
|
ASSERT_OK(txn_db->Flush(flush_ops));
|
|
|
|
} else {
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_EQ(attempt, kAttemptImmMemTable);
|
2019-06-11 18:42:19 +00:00
|
|
|
DBImpl* db_impl = static_cast<DBImpl*>(txn_db->GetRootDB());
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(db_impl->TEST_SwitchMemtable());
|
2019-06-11 18:42:19 +00:00
|
|
|
}
|
|
|
|
uint64_t num_imm_mems;
|
|
|
|
ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
|
|
|
|
&num_imm_mems));
|
|
|
|
if (attempt == kAttemptHistoryMemtable) {
|
|
|
|
ASSERT_EQ(0, num_imm_mems);
|
|
|
|
} else {
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_EQ(attempt, kAttemptImmMemTable);
|
2019-06-11 18:42:19 +00:00
|
|
|
ASSERT_EQ(1, num_imm_mems);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put something in active memtable
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
|
|
|
|
|
|
|
|
// Create txn3 after flushing, when this transaction is commited,
|
|
|
|
// only need to check the active memtable
|
|
|
|
Transaction* txn3 = txn_db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn3 != nullptr);
|
|
|
|
|
|
|
|
// Commit both of txn and txn2. txn will conflict but txn2 will
|
|
|
|
// pass. In both ways, both memtables are queried.
|
|
|
|
SetPerfLevel(PerfLevel::kEnableCount);
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
2019-06-11 18:42:19 +00:00
|
|
|
// We should have checked two memtables
|
|
|
|
ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
|
|
|
|
// txn should fail because of conflict, even if the memtable
|
|
|
|
// has flushed, because it is still preserved in history.
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
s = txn2->Commit();
|
|
|
|
// We should have checked two memtables
|
|
|
|
ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
|
|
|
|
ASSERT_TRUE(s.ok());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn3->Put(Slice("foo2"), Slice("bar2")));
|
2019-06-11 18:42:19 +00:00
|
|
|
get_perf_context()->Reset();
|
|
|
|
s = txn3->Commit();
|
|
|
|
// txn3 is created after the active memtable is created, so that is the only
|
|
|
|
// memtable to check.
|
|
|
|
ASSERT_EQ(1, get_perf_context()->get_from_memtable_count);
|
|
|
|
ASSERT_TRUE(s.ok());
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable");
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
2019-06-11 18:42:19 +00:00
|
|
|
|
|
|
|
SetPerfLevel(PerfLevel::kDisable);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
delete txn3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, NoSnapshotTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Modify key after transaction start
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Read and write without a snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("AAA", "bar2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Should commit since read/write was done after data changed
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "BBB", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "CCC", "bar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Read and write without a snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("AAA", "bar2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Modify BBB before snapshot is taken
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "BBB", "bar1"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Read and write with snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("BBB", "bar2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "CCC", "bar1"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Set a new snapshot
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Read and write with snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("CCC", "bar2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "BBB", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "CCC", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar1");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "bar2");
|
|
|
|
|
|
|
|
// verify that we track multiple writes to the same key at different snapshots
|
|
|
|
delete txn;
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
|
|
|
|
|
|
|
// Potentially conflicting writes
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "ZZZ", "zzz"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "XXX", "xxx"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
|
|
|
|
OptimisticTransactionOptions txn_options;
|
|
|
|
txn_options.set_snapshot = true;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
2015-05-29 21:36:35 +00:00
|
|
|
txn2->SetSnapshot();
|
|
|
|
|
|
|
|
// This should not conflict in txn since the snapshot is later than the
|
|
|
|
// previous write (spoiler alert: it will later conflict with txn2).
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("ZZZ", "zzzz"));
|
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn;
|
|
|
|
|
|
|
|
// This will conflict since the snapshot is earlier than another write to ZZZ
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("ZZZ", "xxxxx"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
ColumnFamilyHandle *cfa, *cfb;
|
|
|
|
ColumnFamilyOptions cf_options;
|
|
|
|
|
|
|
|
// Create 2 new column families
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFA", &cfa));
|
|
|
|
ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFB", &cfb));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete cfa;
|
|
|
|
delete cfb;
|
2023-05-24 18:57:15 +00:00
|
|
|
txn_db.reset();
|
|
|
|
|
|
|
|
OptimisticTransactionDBOptions my_occ_opts = occ_opts;
|
|
|
|
const size_t bucket_count = 500;
|
|
|
|
my_occ_opts.shared_lock_buckets = MakeSharedOccLockBuckets(bucket_count);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// open DB with three column families
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
// have to open default column family
|
2024-03-04 18:08:32 +00:00
|
|
|
column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
|
2015-05-29 21:36:35 +00:00
|
|
|
// open the new column families
|
2024-03-04 18:08:32 +00:00
|
|
|
column_families.emplace_back("CFA", ColumnFamilyOptions());
|
|
|
|
column_families.emplace_back("CFB", ColumnFamilyOptions());
|
2015-05-29 21:36:35 +00:00
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
2023-05-24 18:57:15 +00:00
|
|
|
OptimisticTransactionDB* raw_txn_db = nullptr;
|
|
|
|
ASSERT_OK(OptimisticTransactionDB::Open(
|
|
|
|
options, my_occ_opts, dbname, column_families, &handles, &raw_txn_db));
|
|
|
|
ASSERT_NE(raw_txn_db, nullptr);
|
|
|
|
txn_db.reset(raw_txn_db);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(txn2);
|
|
|
|
|
|
|
|
// Write some data to the db
|
|
|
|
WriteBatch batch;
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(batch.Put("foo", "foo"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
|
|
|
|
ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
|
|
|
|
ASSERT_OK(txn_db->Write(write_options, &batch));
|
|
|
|
ASSERT_OK(txn_db->Delete(write_options, handles[1], "AAAZZZ"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// These keys do no conflict with existing writes since they're in
|
|
|
|
// different column families
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Delete("AAA"));
|
|
|
|
Status s =
|
|
|
|
txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
Slice key_slice("AAAZZZ");
|
|
|
|
Slice value_slices[2] = {Slice("bar"), Slice("bar")};
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put(handles[2], SliceParts(&key_slice, 1),
|
|
|
|
SliceParts(value_slices, 2)));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2015-08-25 02:13:18 +00:00
|
|
|
ASSERT_EQ(3, txn->GetNumKeys());
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Txn should commit
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "AAA", &value);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value);
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(s);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "barbar");
|
|
|
|
|
|
|
|
Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
|
|
|
|
Slice value_slice("barbarbar");
|
|
|
|
// This write will cause a conflict with the earlier batch write
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put(handles[1], SliceParts(key_slices, 3),
|
|
|
|
SliceParts(&value_slice, 1)));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Delete(handles[2], "XXX"));
|
|
|
|
ASSERT_OK(txn2->Delete(handles[1], "XXX"));
|
2015-05-29 21:36:35 +00:00
|
|
|
s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Verify txn did not commit
|
|
|
|
s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
// ** MultiGet **
|
2015-05-29 21:36:35 +00:00
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
snapshot_read_options.snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
|
|
|
|
handles[0], handles[2]};
|
|
|
|
std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
|
|
|
|
std::vector<std::string> values(4);
|
|
|
|
|
|
|
|
std::vector<Status> results = txn->MultiGetForUpdate(
|
|
|
|
snapshot_read_options, multiget_cfh, multiget_keys, &values);
|
|
|
|
ASSERT_OK(results[0]);
|
|
|
|
ASSERT_OK(results[1]);
|
|
|
|
ASSERT_OK(results[2]);
|
|
|
|
ASSERT_TRUE(results[3].IsNotFound());
|
|
|
|
ASSERT_EQ(values[0], "bar");
|
|
|
|
ASSERT_EQ(values[1], "barbar");
|
|
|
|
ASSERT_EQ(values[2], "foo");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
|
|
|
|
ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYY"));
|
|
|
|
ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYYY"));
|
|
|
|
ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
|
|
|
|
ASSERT_OK(txn->Put(handles[2], "AAAZZZ", "barbarbar"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2015-08-25 02:13:18 +00:00
|
|
|
ASSERT_EQ(5, txn->GetNumKeys());
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Txn should commit
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, handles[2], "ZZZ", &value);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// Put a key which will conflict with the next txn using the previous snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, handles[2], "foo", "000"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
|
|
|
|
multiget_keys, &values);
|
|
|
|
ASSERT_OK(results[0]);
|
|
|
|
ASSERT_OK(results[1]);
|
|
|
|
ASSERT_OK(results[2]);
|
|
|
|
ASSERT_TRUE(results[3].IsNotFound());
|
|
|
|
ASSERT_EQ(values[0], "bar");
|
|
|
|
ASSERT_EQ(values[1], "barbar");
|
|
|
|
ASSERT_EQ(values[2], "foo");
|
|
|
|
|
|
|
|
// Verify Txn Did not Commit
|
|
|
|
s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// ** Test independence and/or sharing of lock buckets across CFs and DBs **
|
|
|
|
if (my_occ_opts.validate_policy == OccValidationPolicy::kValidateParallel) {
|
|
|
|
struct SeenStat {
|
|
|
|
uint64_t rolling_hash = 0;
|
|
|
|
uintptr_t min = 0;
|
|
|
|
uintptr_t max = 0;
|
|
|
|
};
|
|
|
|
SeenStat cur_seen;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"OptimisticTransaction::CommitWithParallelValidate::lock_bucket_ptr",
|
|
|
|
[&](void* arg) {
|
|
|
|
// Hash the pointer
|
|
|
|
cur_seen.rolling_hash = Hash64(reinterpret_cast<char*>(&arg),
|
|
|
|
sizeof(arg), cur_seen.rolling_hash);
|
|
|
|
uintptr_t val = reinterpret_cast<uintptr_t>(arg);
|
|
|
|
if (cur_seen.min == 0 || val < cur_seen.min) {
|
|
|
|
cur_seen.min = val;
|
|
|
|
}
|
|
|
|
if (cur_seen.max == 0 || val > cur_seen.max) {
|
|
|
|
cur_seen.max = val;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Another db sharing lock buckets
|
|
|
|
auto shared_dbname =
|
|
|
|
test::PerThreadDBPath("optimistic_transaction_testdb_shared");
|
|
|
|
std::unique_ptr<OptimisticTransactionDB> shared_txn_db = nullptr;
|
|
|
|
OpenImpl(options, my_occ_opts, shared_dbname, &shared_txn_db);
|
|
|
|
|
|
|
|
// Another db not sharing lock buckets
|
|
|
|
auto nonshared_dbname =
|
|
|
|
test::PerThreadDBPath("optimistic_transaction_testdb_nonshared");
|
|
|
|
std::unique_ptr<OptimisticTransactionDB> nonshared_txn_db = nullptr;
|
|
|
|
my_occ_opts.occ_lock_buckets = bucket_count;
|
|
|
|
my_occ_opts.shared_lock_buckets = nullptr;
|
|
|
|
OpenImpl(options, my_occ_opts, nonshared_dbname, &nonshared_txn_db);
|
|
|
|
|
|
|
|
// Plenty of keys to avoid randomly hitting the same hash sequence
|
|
|
|
std::array<std::string, 30> keys;
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
keys[i] = std::to_string(i);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get a baseline pattern of bucket accesses
|
|
|
|
cur_seen = {};
|
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(handles[0], key, "blah"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
// Sufficiently large hash coverage of the space
|
|
|
|
const uintptr_t min_span_bytes = sizeof(port::Mutex) * bucket_count / 2;
|
|
|
|
ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes);
|
|
|
|
// Save
|
|
|
|
SeenStat base_seen = cur_seen;
|
|
|
|
|
|
|
|
// Verify it is repeatable
|
|
|
|
cur_seen = {};
|
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options, txn);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(handles[0], key, "moo"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
ASSERT_EQ(cur_seen.rolling_hash, base_seen.rolling_hash);
|
|
|
|
ASSERT_EQ(cur_seen.min, base_seen.min);
|
|
|
|
ASSERT_EQ(cur_seen.max, base_seen.max);
|
|
|
|
|
|
|
|
// Try another CF
|
|
|
|
cur_seen = {};
|
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options, txn);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(handles[1], key, "blah"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
// Different access pattern (different hash seed)
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash);
|
|
|
|
// Same pointer space
|
|
|
|
ASSERT_LT(cur_seen.min, base_seen.max);
|
|
|
|
ASSERT_GT(cur_seen.max, base_seen.min);
|
|
|
|
// Sufficiently large hash coverage of the space
|
|
|
|
ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes);
|
|
|
|
// Save
|
|
|
|
SeenStat cf1_seen = cur_seen;
|
|
|
|
|
|
|
|
// And another CF
|
|
|
|
cur_seen = {};
|
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options, txn);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(handles[2], key, "blah"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
// Different access pattern (different hash seed)
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash);
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash);
|
|
|
|
// Same pointer space
|
|
|
|
ASSERT_LT(cur_seen.min, base_seen.max);
|
|
|
|
ASSERT_GT(cur_seen.max, base_seen.min);
|
|
|
|
// Sufficiently large hash coverage of the space
|
|
|
|
ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes);
|
|
|
|
|
|
|
|
// And DB with shared lock buckets
|
|
|
|
cur_seen = {};
|
|
|
|
delete txn;
|
|
|
|
txn = shared_txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(key, "blah"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
// Different access pattern (different hash seed)
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash);
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash);
|
|
|
|
// Same pointer space
|
|
|
|
ASSERT_LT(cur_seen.min, base_seen.max);
|
|
|
|
ASSERT_GT(cur_seen.max, base_seen.min);
|
|
|
|
// Sufficiently large hash coverage of the space
|
|
|
|
ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes);
|
|
|
|
|
|
|
|
// And DB with distinct lock buckets
|
|
|
|
cur_seen = {};
|
|
|
|
delete txn;
|
|
|
|
txn = nonshared_txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
for (const auto& key : keys) {
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn->Put(key, "blah"));
|
2023-05-24 18:57:15 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(txn->Commit());
|
|
|
|
// Different access pattern (different hash seed)
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash);
|
|
|
|
ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash);
|
|
|
|
// Different pointer space
|
|
|
|
ASSERT_TRUE(cur_seen.min > base_seen.max || cur_seen.max < base_seen.min);
|
|
|
|
// Sufficiently large hash coverage of the space
|
|
|
|
ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
// ** Test dropping column family before committing, or even creating txn **
|
|
|
|
txn = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn->Delete(handles[1], "AAA"));
|
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->DropColumnFamily(handles[1]);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_OK(s);
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->DropColumnFamily(handles[2]);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
ASSERT_NOK(txn->Commit());
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
ASSERT_OK(txn2->Delete(handles[2], "AAA"));
|
|
|
|
ASSERT_NOK(txn2->Commit());
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
for (auto handle : handles) {
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, EmptyTest) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Rollback());
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "aaa");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
|
|
|
txn->SetSnapshot();
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "aaa");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "aaa", "xxx"));
|
|
|
|
Status s = txn->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, PredicateManyPreceders) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options1, read_options2;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options);
|
2015-05-29 21:36:35 +00:00
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options);
|
2015-05-29 21:36:35 +00:00
|
|
|
txn2->SetSnapshot();
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
|
|
|
std::vector<Slice> multiget_keys = {"1", "2", "3"};
|
|
|
|
std::vector<std::string> multiget_values;
|
|
|
|
|
|
|
|
std::vector<Status> results =
|
|
|
|
txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_TRUE(results[0].IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(results[1].IsNotFound());
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_TRUE(results[2].IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("2", "x"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
multiget_values.clear();
|
|
|
|
results =
|
|
|
|
txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_TRUE(results[0].IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(results[1].IsNotFound());
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_TRUE(results[2].IsNotFound());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// should not commit since txn2 wrote a key txn has read
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn1->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("4", "x"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Delete("4"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// txn1 can commit since txn2's delete hasn't happened yet (it's just batched)
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
s = txn2->GetForUpdate(read_options2, "4", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
// txn2 cannot commit since txn1 changed "4"
|
|
|
|
s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, LostUpdate) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, read_options1, read_options2;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Test 2 transactions writing to the same key in multiple orders and
|
|
|
|
// with/without snapshots
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("1", "1"));
|
|
|
|
ASSERT_OK(txn2->Put("1", "2"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("1", "3"));
|
|
|
|
ASSERT_OK(txn2->Put("1", "4"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("1", "5"));
|
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("1", "6"));
|
2015-05-29 21:36:35 +00:00
|
|
|
s = txn2->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options1.snapshot = txn1->GetSnapshot();
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options, txn_options);
|
|
|
|
read_options2.snapshot = txn2->GetSnapshot();
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("1", "5"));
|
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
txn2->SetSnapshot();
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("1", "6"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("1", "7"));
|
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("1", "8"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
delete txn1;
|
|
|
|
delete txn2;
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "1", &value));
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_EQ(value, "8");
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, UntrackedWrites) {
|
2015-05-29 21:36:35 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-05-29 21:36:35 +00:00
|
|
|
Status s;
|
|
|
|
|
|
|
|
// Verify transaction rollback works for untracked keys.
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->PutUntracked("untracked", "0"));
|
|
|
|
ASSERT_OK(txn->Rollback());
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "untracked", &value);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("tracked", "1"));
|
|
|
|
ASSERT_OK(txn->PutUntracked("untracked", "1"));
|
|
|
|
ASSERT_OK(txn->MergeUntracked("untracked", "2"));
|
|
|
|
ASSERT_OK(txn->DeleteUntracked("untracked"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Write to the untracked key outside of the transaction and verify
|
|
|
|
// it doesn't prevent the transaction from committing.
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "untracked", "x"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "untracked", &value);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("tracked", "10"));
|
|
|
|
ASSERT_OK(txn->PutUntracked("untracked", "A"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
// Write to tracked key outside of the transaction and verify that the
|
|
|
|
// untracked keys are not written when the commit fails.
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Delete(write_options, "tracked"));
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
s = txn->Commit();
|
2015-07-31 03:31:36 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "untracked", &value);
|
2015-05-29 21:36:35 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, IteratorTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
// Write some keys to the db
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "A", "a"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "G", "g"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "F", "f"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "C", "c"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "D", "d"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
// Write some keys in a txn
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("B", "b"));
|
|
|
|
ASSERT_OK(txn->Put("H", "h"));
|
|
|
|
ASSERT_OK(txn->Delete("D"));
|
|
|
|
ASSERT_OK(txn->Put("E", "e"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
txn->SetSnapshot();
|
|
|
|
const Snapshot* snapshot = txn->GetSnapshot();
|
|
|
|
|
|
|
|
// Write some keys to the db after the snapshot
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "BB", "xx"));
|
|
|
|
ASSERT_OK(txn_db->Put(write_options, "C", "xx"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
read_options.snapshot = snapshot;
|
|
|
|
Iterator* iter = txn->GetIterator(read_options);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
|
|
|
|
// Read all keys via iter and lock them all
|
|
|
|
std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
|
|
|
|
for (int i = 0; i < 7; i++) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(results[i], iter->value().ToString());
|
|
|
|
|
JNI get_helper code sharing / multiGet() use efficient batch C++ support (#12344)
Summary:
Implement RAII-based helpers for JNIGet() and multiGet()
Replace JNI C++ helpers `rocksdb_get_helper, rocksdb_get_helper_direct`, `multi_get_helper`, `multi_get_helper_direct`, `multi_get_helper_release_keys`, `txn_get_helper`, and `txn_multi_get_helper`.
The model is to entirely do away with a single helper, instead a number of utility methods allow each separate
JNI `Get()` and `MultiGet()` method to organise their parameters efficiently, then call the underlying C++ `db->Get()`,
`db->MultiGet()`, `txn->Get()`, or `txn->MultiGet()` method itself, and use further utilities to retrieve results.
Roughly speaking:
* get keys into C++ form
* Call C++ Get()
* get results and status into Java form
We achieve a useful performance gain as part of this work; by using the updated C++ multiGet we immediately pick up its performance gains (batch improvements to multiGet C++ were previously implemented, but not until now used by Java/JNI). multiGetBB already uses the batched C++ multiGet(), and all other benchmarks show consistent improvement after the changes:
## Before:
```
Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units
MultiGetNewBenchmarks.multiGetBB200 no_column_family 10000 1024 100 256 thrpt 25 5315.459 ± 20.465 ops/s
MultiGetNewBenchmarks.multiGetBB200 no_column_family 10000 1024 100 1024 thrpt 25 5673.115 ± 78.299 ops/s
MultiGetNewBenchmarks.multiGetBB200 no_column_family 10000 1024 100 4096 thrpt 25 2616.860 ± 46.994 ops/s
MultiGetNewBenchmarks.multiGetBB200 no_column_family 10000 1024 100 16384 thrpt 25 1700.058 ± 24.034 ops/s
MultiGetNewBenchmarks.multiGetBB200 no_column_family 10000 1024 100 65536 thrpt 25 791.171 ± 13.955 ops/s
MultiGetNewBenchmarks.multiGetList10 no_column_family 10000 1024 100 256 thrpt 25 6129.929 ± 94.200 ops/s
MultiGetNewBenchmarks.multiGetList10 no_column_family 10000 1024 100 1024 thrpt 25 7012.405 ± 97.886 ops/s
MultiGetNewBenchmarks.multiGetList10 no_column_family 10000 1024 100 4096 thrpt 25 2799.014 ± 39.352 ops/s
MultiGetNewBenchmarks.multiGetList10 no_column_family 10000 1024 100 16384 thrpt 25 1417.205 ± 22.272 ops/s
MultiGetNewBenchmarks.multiGetList10 no_column_family 10000 1024 100 65536 thrpt 25 655.594 ± 13.050 ops/s
MultiGetNewBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 256 thrpt 25 6147.247 ± 82.711 ops/s
MultiGetNewBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 1024 thrpt 25 7004.213 ± 79.251 ops/s
MultiGetNewBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 4096 thrpt 25 2715.154 ± 110.017 ops/s
MultiGetNewBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 16384 thrpt 25 1408.070 ± 31.714 ops/s
MultiGetNewBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 65536 thrpt 25 623.829 ± 57.374 ops/s
MultiGetNewBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 256 thrpt 25 6119.243 ± 116.313 ops/s
MultiGetNewBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 1024 thrpt 25 6931.873 ± 128.094 ops/s
MultiGetNewBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 4096 thrpt 25 2678.253 ± 39.113 ops/s
MultiGetNewBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 16384 thrpt 25 1337.384 ± 19.500 ops/s
MultiGetNewBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 65536 thrpt 25 625.596 ± 14.525 ops/s
```
## After:
```
Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units
MultiGetBenchmarks.multiGetBB200 no_column_family 10000 1024 100 256 thrpt 25 5191.074 ± 78.250 ops/s
MultiGetBenchmarks.multiGetBB200 no_column_family 10000 1024 100 1024 thrpt 25 5378.692 ± 260.682 ops/s
MultiGetBenchmarks.multiGetBB200 no_column_family 10000 1024 100 4096 thrpt 25 2590.183 ± 34.844 ops/s
MultiGetBenchmarks.multiGetBB200 no_column_family 10000 1024 100 16384 thrpt 25 1634.793 ± 34.022 ops/s
MultiGetBenchmarks.multiGetBB200 no_column_family 10000 1024 100 65536 thrpt 25 786.455 ± 8.462 ops/s
MultiGetBenchmarks.multiGetBB200 1_column_family 10000 1024 100 256 thrpt 25 5285.055 ± 11.676 ops/s
MultiGetBenchmarks.multiGetBB200 1_column_family 10000 1024 100 1024 thrpt 25 5586.758 ± 213.008 ops/s
MultiGetBenchmarks.multiGetBB200 1_column_family 10000 1024 100 4096 thrpt 25 2527.172 ± 17.106 ops/s
MultiGetBenchmarks.multiGetBB200 1_column_family 10000 1024 100 16384 thrpt 25 1819.547 ± 12.958 ops/s
MultiGetBenchmarks.multiGetBB200 1_column_family 10000 1024 100 65536 thrpt 25 803.861 ± 9.963 ops/s
MultiGetBenchmarks.multiGetBB200 20_column_families 10000 1024 100 256 thrpt 25 5253.793 ± 28.020 ops/s
MultiGetBenchmarks.multiGetBB200 20_column_families 10000 1024 100 1024 thrpt 25 5705.591 ± 20.556 ops/s
MultiGetBenchmarks.multiGetBB200 20_column_families 10000 1024 100 4096 thrpt 25 2523.377 ± 15.415 ops/s
MultiGetBenchmarks.multiGetBB200 20_column_families 10000 1024 100 16384 thrpt 25 1815.344 ± 11.309 ops/s
MultiGetBenchmarks.multiGetBB200 20_column_families 10000 1024 100 65536 thrpt 25 820.792 ± 3.192 ops/s
MultiGetBenchmarks.multiGetBB200 100_column_families 10000 1024 100 256 thrpt 25 5262.184 ± 20.477 ops/s
MultiGetBenchmarks.multiGetBB200 100_column_families 10000 1024 100 1024 thrpt 25 5706.959 ± 23.123 ops/s
MultiGetBenchmarks.multiGetBB200 100_column_families 10000 1024 100 4096 thrpt 25 2520.362 ± 9.170 ops/s
MultiGetBenchmarks.multiGetBB200 100_column_families 10000 1024 100 16384 thrpt 25 1789.185 ± 14.239 ops/s
MultiGetBenchmarks.multiGetBB200 100_column_families 10000 1024 100 65536 thrpt 25 818.401 ± 12.132 ops/s
MultiGetBenchmarks.multiGetList10 no_column_family 10000 1024 100 256 thrpt 25 6978.310 ± 14.084 ops/s
MultiGetBenchmarks.multiGetList10 no_column_family 10000 1024 100 1024 thrpt 25 7664.242 ± 22.304 ops/s
MultiGetBenchmarks.multiGetList10 no_column_family 10000 1024 100 4096 thrpt 25 2881.778 ± 81.054 ops/s
MultiGetBenchmarks.multiGetList10 no_column_family 10000 1024 100 16384 thrpt 25 1599.826 ± 7.190 ops/s
MultiGetBenchmarks.multiGetList10 no_column_family 10000 1024 100 65536 thrpt 25 737.520 ± 6.809 ops/s
MultiGetBenchmarks.multiGetList10 1_column_family 10000 1024 100 256 thrpt 25 6974.376 ± 10.716 ops/s
MultiGetBenchmarks.multiGetList10 1_column_family 10000 1024 100 1024 thrpt 25 7637.440 ± 45.877 ops/s
MultiGetBenchmarks.multiGetList10 1_column_family 10000 1024 100 4096 thrpt 25 2820.472 ± 42.231 ops/s
MultiGetBenchmarks.multiGetList10 1_column_family 10000 1024 100 16384 thrpt 25 1716.663 ± 8.527 ops/s
MultiGetBenchmarks.multiGetList10 1_column_family 10000 1024 100 65536 thrpt 25 755.848 ± 7.514 ops/s
MultiGetBenchmarks.multiGetList10 20_column_families 10000 1024 100 256 thrpt 25 6943.651 ± 20.040 ops/s
MultiGetBenchmarks.multiGetList10 20_column_families 10000 1024 100 1024 thrpt 25 7679.415 ± 9.114 ops/s
MultiGetBenchmarks.multiGetList10 20_column_families 10000 1024 100 4096 thrpt 25 2844.564 ± 13.388 ops/s
MultiGetBenchmarks.multiGetList10 20_column_families 10000 1024 100 16384 thrpt 25 1729.545 ± 5.983 ops/s
MultiGetBenchmarks.multiGetList10 20_column_families 10000 1024 100 65536 thrpt 25 783.218 ± 1.530 ops/s
MultiGetBenchmarks.multiGetList10 100_column_families 10000 1024 100 256 thrpt 25 6944.276 ± 29.995 ops/s
MultiGetBenchmarks.multiGetList10 100_column_families 10000 1024 100 1024 thrpt 25 7670.301 ± 8.986 ops/s
MultiGetBenchmarks.multiGetList10 100_column_families 10000 1024 100 4096 thrpt 25 2839.828 ± 12.421 ops/s
MultiGetBenchmarks.multiGetList10 100_column_families 10000 1024 100 16384 thrpt 25 1730.005 ± 9.209 ops/s
MultiGetBenchmarks.multiGetList10 100_column_families 10000 1024 100 65536 thrpt 25 787.096 ± 1.977 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 256 thrpt 25 6896.944 ± 21.530 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 1024 thrpt 25 7622.407 ± 12.824 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 4096 thrpt 25 2927.538 ± 19.792 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 16384 thrpt 25 1598.041 ± 4.312 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 no_column_family 10000 1024 100 65536 thrpt 25 744.564 ± 9.236 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 1_column_family 10000 1024 100 256 thrpt 25 6853.760 ± 78.041 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 1_column_family 10000 1024 100 1024 thrpt 25 7360.917 ± 355.365 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 1_column_family 10000 1024 100 4096 thrpt 25 2848.774 ± 13.409 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 1_column_family 10000 1024 100 16384 thrpt 25 1727.688 ± 3.329 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 1_column_family 10000 1024 100 65536 thrpt 25 776.088 ± 7.517 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 20_column_families 10000 1024 100 256 thrpt 25 6910.339 ± 14.366 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 20_column_families 10000 1024 100 1024 thrpt 25 7633.660 ± 10.830 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 20_column_families 10000 1024 100 4096 thrpt 25 2787.799 ± 81.775 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 20_column_families 10000 1024 100 16384 thrpt 25 1726.517 ± 6.830 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 20_column_families 10000 1024 100 65536 thrpt 25 787.597 ± 3.362 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 100_column_families 10000 1024 100 256 thrpt 25 6922.445 ± 10.493 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 100_column_families 10000 1024 100 1024 thrpt 25 7604.710 ± 48.043 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 100_column_families 10000 1024 100 4096 thrpt 25 2848.788 ± 15.783 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 100_column_families 10000 1024 100 16384 thrpt 25 1730.837 ± 6.497 ops/s
MultiGetBenchmarks.multiGetListExplicitCF20 100_column_families 10000 1024 100 65536 thrpt 25 794.557 ± 1.869 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 256 thrpt 25 6918.716 ± 15.766 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 1024 thrpt 25 7626.692 ± 9.394 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 4096 thrpt 25 2871.382 ± 72.155 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 16384 thrpt 25 1598.786 ± 4.819 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 no_column_family 10000 1024 100 65536 thrpt 25 748.469 ± 7.234 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 1_column_family 10000 1024 100 256 thrpt 25 6922.666 ± 17.131 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 1_column_family 10000 1024 100 1024 thrpt 25 7623.890 ± 8.805 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 1_column_family 10000 1024 100 4096 thrpt 25 2850.698 ± 18.004 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 1_column_family 10000 1024 100 16384 thrpt 25 1727.623 ± 4.868 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 1_column_family 10000 1024 100 65536 thrpt 25 774.534 ± 10.025 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 20_column_families 10000 1024 100 256 thrpt 25 5486.251 ± 13.582 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 20_column_families 10000 1024 100 1024 thrpt 25 4920.656 ± 44.557 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 20_column_families 10000 1024 100 4096 thrpt 25 3922.913 ± 25.686 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 20_column_families 10000 1024 100 16384 thrpt 25 2873.106 ± 4.336 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 20_column_families 10000 1024 100 65536 thrpt 25 802.404 ± 8.967 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 100_column_families 10000 1024 100 256 thrpt 25 4817.996 ± 18.042 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 100_column_families 10000 1024 100 1024 thrpt 25 4243.922 ± 13.929 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 100_column_families 10000 1024 100 4096 thrpt 25 3175.998 ± 7.773 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 100_column_families 10000 1024 100 16384 thrpt 25 2321.990 ± 12.501 ops/s
MultiGetBenchmarks.multiGetListRandomCF30 100_column_families 10000 1024 100 65536 thrpt 25 1753.028 ± 7.130 ops/s
```
Closes https://github.com/facebook/rocksdb/issues/11518
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12344
Reviewed By: cbi42
Differential Revision: D54809714
Pulled By: pdillinger
fbshipit-source-id: bee3b949720abac073bce043b59ce976a11e99eb
2024-03-12 19:42:08 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
txn->GetForUpdate(read_options, iter->key(), (std::string*)nullptr));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->Seek("G");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("g", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("f", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("D");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("e", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("C");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("c", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("e", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", iter->value().ToString());
|
|
|
|
|
|
|
|
iter->Seek("X");
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("h", iter->value().ToString());
|
|
|
|
|
|
|
|
// key "C" was modified in the db after txn's snapshot. txn will not commit.
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->Commit();
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2021-02-05 23:55:34 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) {
|
|
|
|
// `OptimisticTransactionDB` does not allow range deletion in any API.
|
|
|
|
ASSERT_TRUE(
|
|
|
|
txn_db
|
|
|
|
->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b")
|
|
|
|
.IsNotSupported());
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.DeleteRange("a", "b"));
|
|
|
|
ASSERT_NOK(txn_db->Write(WriteOptions(), &wb));
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, SavepointTest) {
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
Transaction* txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn->RollbackToSavePoint();
|
2015-07-31 22:18:27 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
txn->SetSavePoint(); // 1
|
|
|
|
|
2015-07-31 22:18:27 +00:00
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn
|
|
|
|
s = txn->RollbackToSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("B", "b"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "B", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
txn = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_NE(txn, nullptr);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("A", "a"));
|
|
|
|
ASSERT_OK(txn->Put("B", "bb"));
|
|
|
|
ASSERT_OK(txn->Put("C", "c"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
txn->SetSavePoint(); // 2
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Delete("B"));
|
|
|
|
ASSERT_OK(txn->Put("C", "cc"));
|
|
|
|
ASSERT_OK(txn->Put("D", "d"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2015-07-31 22:18:27 +00:00
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "A", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("a", value);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "B", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("bb", value);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "C", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("c", value);
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("A", "a"));
|
|
|
|
ASSERT_OK(txn->Put("E", "e"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2015-07-31 22:18:27 +00:00
|
|
|
// Rollback to beginning of txn
|
|
|
|
s = txn->RollbackToSavePoint();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Rollback());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
s = txn->Get(read_options, "A", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "B", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn->Get(read_options, "D", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
s = txn->Get(read_options, "E", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("A", "aa"));
|
|
|
|
ASSERT_OK(txn->Put("F", "f"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
|
|
|
txn->SetSavePoint(); // 3
|
|
|
|
txn->SetSavePoint(); // 4
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Put("G", "g"));
|
|
|
|
ASSERT_OK(txn->Delete("F"));
|
|
|
|
ASSERT_OK(txn->Delete("B"));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "A", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("aa", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "F", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "B", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2015-07-31 22:18:27 +00:00
|
|
|
ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Get(read_options, "F", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("f", value);
|
|
|
|
|
|
|
|
s = txn->Get(read_options, "G", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn->Commit());
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "F", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("f", value);
|
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "G", &value);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "A", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("aa", value);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Get(read_options, "B", &value));
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_EQ("b", value);
|
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "C", &value);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "D", &value);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
2018-04-03 22:24:23 +00:00
|
|
|
s = txn_db->Get(read_options, "E", &value);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
delete txn;
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) {
|
2015-09-15 00:11:52 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options, snapshot_read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
std::string value;
|
2015-09-15 00:11:52 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn_db->Put(write_options, "A", ""));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
Transaction* txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
ASSERT_TRUE(txn1);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
Transaction* txn2 = txn_db->BeginTransaction(write_options);
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 can commit since A isn't conflict checked
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Put("A", "a"));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 cannot commit since A will still be conflict checked
|
2020-12-10 05:19:55 +00:00
|
|
|
Status s = txn1->Commit();
|
2015-09-15 00:11:52 +00:00
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 cannot commit since A will still be conflict checked
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 can commit since A isn't conflict checked
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->SetSavePoint();
|
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 cannot commit since A will still be conflict checked
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->SetSavePoint();
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 cannot commit since A will still be conflict checked
|
|
|
|
s = txn1->Commit();
|
|
|
|
ASSERT_TRUE(s.IsBusy());
|
|
|
|
delete txn1;
|
|
|
|
|
|
|
|
txn1 = txn_db->BeginTransaction(write_options);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
txn1->SetSavePoint();
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
|
2015-09-15 00:11:52 +00:00
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->RollbackToSavePoint());
|
2015-09-15 00:11:52 +00:00
|
|
|
txn1->UndoGetForUpdate("A");
|
|
|
|
|
|
|
|
txn2 = txn_db->BeginTransaction(write_options);
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn2->Put("A", "x"));
|
|
|
|
ASSERT_OK(txn2->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn2;
|
|
|
|
|
|
|
|
// Verify that txn1 can commit since A isn't conflict checked
|
2020-12-10 05:19:55 +00:00
|
|
|
ASSERT_OK(txn1->Commit());
|
2015-09-15 00:11:52 +00:00
|
|
|
delete txn1;
|
|
|
|
}
|
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
namespace {
|
|
|
|
Status OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db,
|
|
|
|
const size_t num_transactions,
|
|
|
|
const size_t num_sets,
|
|
|
|
const size_t num_keys_per_set) {
|
|
|
|
size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id());
|
|
|
|
Random64 _rand(seed);
|
2016-03-04 00:33:26 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
ReadOptions read_options;
|
|
|
|
OptimisticTransactionOptions txn_options;
|
|
|
|
txn_options.set_snapshot = true;
|
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
RandomTransactionInserter inserter(&_rand, write_options, read_options,
|
2016-03-15 17:57:33 +00:00
|
|
|
num_keys_per_set,
|
|
|
|
static_cast<uint16_t>(num_sets));
|
2016-03-04 00:33:26 +00:00
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
for (size_t t = 0; t < num_transactions; t++) {
|
|
|
|
bool success = inserter.OptimisticTransactionDBInsert(db, txn_options);
|
|
|
|
if (!success) {
|
|
|
|
// unexpected failure
|
|
|
|
return inserter.GetLastStatus();
|
|
|
|
}
|
|
|
|
}
|
2016-03-04 00:33:26 +00:00
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
inserter.GetLastStatus().PermitUncheckedError();
|
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
// Make sure at least some of the transactions succeeded. It's ok if
|
|
|
|
// some failed due to write-conflicts.
|
|
|
|
if (inserter.GetFailureCount() > num_transactions / 2) {
|
|
|
|
return Status::TryAgain("Too many transactions failed! " +
|
|
|
|
std::to_string(inserter.GetFailureCount()) + " / " +
|
|
|
|
std::to_string(num_transactions));
|
|
|
|
}
|
2016-03-04 00:33:26 +00:00
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) {
|
2016-03-03 19:20:25 +00:00
|
|
|
const size_t num_threads = 4;
|
|
|
|
const size_t num_transactions_per_thread = 10000;
|
|
|
|
const size_t num_sets = 3;
|
|
|
|
const size_t num_keys_per_set = 100;
|
|
|
|
// Setting the key-space to be 100 keys should cause enough write-conflicts
|
|
|
|
// to make this test interesting.
|
|
|
|
|
2017-02-06 22:43:55 +00:00
|
|
|
std::vector<port::Thread> threads;
|
2016-03-03 19:20:25 +00:00
|
|
|
|
|
|
|
std::function<void()> call_inserter = [&] {
|
|
|
|
ASSERT_OK(OptimisticTransactionStressTestInserter(
|
2023-05-24 18:57:15 +00:00
|
|
|
txn_db.get(), num_transactions_per_thread, num_sets, num_keys_per_set));
|
2016-03-03 19:20:25 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Create N threads that use RandomTransactionInserter to write
|
|
|
|
// many transactions.
|
|
|
|
for (uint32_t i = 0; i < num_threads; i++) {
|
|
|
|
threads.emplace_back(call_inserter);
|
|
|
|
}
|
2016-03-04 00:33:26 +00:00
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
// Wait for all threads to run
|
|
|
|
for (auto& t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
2016-03-04 00:33:26 +00:00
|
|
|
|
2016-03-03 19:20:25 +00:00
|
|
|
// Verify that data is consistent
|
2023-05-24 18:57:15 +00:00
|
|
|
Status s = RandomTransactionInserter::Verify(txn_db.get(), num_sets);
|
2016-03-04 00:33:26 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
2020-01-07 22:19:06 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) {
|
2016-11-09 20:12:40 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
OptimisticTransactionOptions transaction_options;
|
|
|
|
|
2022-10-25 21:15:22 +00:00
|
|
|
Transaction* transaction(
|
|
|
|
txn_db->BeginTransaction(write_options, transaction_options));
|
2016-11-09 20:12:40 +00:00
|
|
|
Status s = transaction->Put("foo", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Put("foo2", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Put("foo3", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete transaction;
|
|
|
|
|
|
|
|
Reopen();
|
|
|
|
transaction = txn_db->BeginTransaction(write_options, transaction_options);
|
|
|
|
s = transaction->Put("bar", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Put("bar2", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete transaction;
|
|
|
|
}
|
|
|
|
|
2023-09-21 20:52:01 +00:00
|
|
|
#ifdef __SANITIZE_THREAD__
|
|
|
|
// Skip OptimisticTransactionTest.SequenceNumberAfterRecoverLargeTest under TSAN
|
|
|
|
// to avoid false positive because of TSAN lock limit of 64.
|
|
|
|
#else
|
|
|
|
TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverLargeTest) {
|
|
|
|
WriteOptions write_options;
|
|
|
|
OptimisticTransactionOptions transaction_options;
|
|
|
|
|
|
|
|
Transaction* transaction(
|
|
|
|
txn_db->BeginTransaction(write_options, transaction_options));
|
|
|
|
|
|
|
|
std::string value(1024 * 1024, 'X');
|
|
|
|
const size_t n_zero = 2;
|
|
|
|
std::string s_i;
|
|
|
|
Status s;
|
|
|
|
for (int i = 1; i <= 64; i++) {
|
|
|
|
s_i = std::to_string(i);
|
|
|
|
auto key = std::string(n_zero - std::min(n_zero, s_i.length()), '0') + s_i;
|
|
|
|
s = transaction->Put(key, value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
s = transaction->Commit();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
delete transaction;
|
|
|
|
|
|
|
|
Reopen();
|
|
|
|
transaction = txn_db->BeginTransaction(write_options, transaction_options);
|
|
|
|
s = transaction->Put("bar", "val");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
s = transaction->Commit();
|
|
|
|
if (!s.ok()) {
|
|
|
|
std::cerr << "Failed to commit records. Error: " << s.ToString()
|
|
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
delete transaction;
|
|
|
|
}
|
|
|
|
#endif // __SANITIZE_THREAD__
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) {
|
|
|
|
std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
|
|
|
|
ASSERT_OK(txn->Put("a", "v"));
|
|
|
|
Status s = txn->CommitAndTryCreateSnapshot();
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) {
|
|
|
|
std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
|
|
|
|
ASSERT_OK(txn->Put("a", "v"));
|
|
|
|
std::shared_ptr<const Snapshot> snapshot;
|
|
|
|
Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot);
|
|
|
|
ASSERT_TRUE(s.IsNotSupported());
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2020-01-07 22:19:06 +00:00
|
|
|
InstanceOccGroup, OptimisticTransactionTest,
|
|
|
|
testing::Values(OccValidationPolicy::kValidateSerial,
|
|
|
|
OccValidationPolicy::kValidateParallel));
|
|
|
|
|
2023-05-24 18:57:15 +00:00
|
|
|
TEST(OccLockBucketsTest, CacheAligned) {
|
|
|
|
// Typical x86_64 is 40 byte mutex, 64 byte cache line
|
|
|
|
if (sizeof(port::Mutex) >= sizeof(CacheAlignedWrapper<port::Mutex>)) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test requires mutex smaller than cache line");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
auto buckets_unaligned = MakeSharedOccLockBuckets(100, false);
|
|
|
|
auto buckets_aligned = MakeSharedOccLockBuckets(100, true);
|
|
|
|
// Save at least one byte per bucket
|
|
|
|
ASSERT_LE(buckets_unaligned->ApproximateMemoryUsage() + 100,
|
|
|
|
buckets_aligned->ApproximateMemoryUsage());
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2015-05-29 21:36:35 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2015-05-29 21:36:35 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|