2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-08-21 22:47:21 +00:00
|
|
|
|
|
|
|
#include "utilities/transactions/transaction_base.h"
|
|
|
|
|
2019-06-06 20:52:39 +00:00
|
|
|
#include <cinttypes>
|
2019-03-07 15:26:36 +00:00
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
#include "db/column_family.h"
|
2019-05-31 22:21:36 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "logging/logging.h"
|
2015-08-21 22:47:21 +00:00
|
|
|
#include "rocksdb/comparator.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
2019-08-27 17:57:28 +00:00
|
|
|
#include "util/cast_util.h"
|
2015-08-21 22:47:21 +00:00
|
|
|
#include "util/string_util.h"
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
#include "utilities/transactions/lock/lock_tracker.h"
|
2015-08-21 22:47:21 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2015-08-21 22:47:21 +00:00
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot corresponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Otherwise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from being dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
// Commits this transaction and, on success, hands back the timestamped
// snapshot obtained from GetTimestampedSnapshot().
//
// `notifier` is forwarded to SetSnapshotOnNextOperation() before committing.
// `ts` is the requested commit timestamp; kMaxTxnTimestamp means "not given".
// `snapshot` is optional. When non-null it is cleared on entry and filled in
// only if the commit succeeds.
//
// Returns InvalidArgument when no commit timestamp is available at all, or
// when `ts` disagrees with a commit timestamp that was already set; otherwise
// returns any failure from SetCommitTimestamp()/Commit(), or OK.
Status Transaction::CommitAndTryCreateSnapshot(
    std::shared_ptr<TransactionNotifier> notifier, TxnTimestamp ts,
    std::shared_ptr<const Snapshot>* snapshot) {
  const bool caller_wants_snapshot = (snapshot != nullptr);
  if (caller_wants_snapshot) {
    snapshot->reset();
  }

  const TxnTimestamp existing_ts = GetCommitTimestamp();
  const bool has_existing_ts = (existing_ts != kMaxTxnTimestamp);
  const bool has_requested_ts = (ts != kMaxTxnTimestamp);

  if (!has_existing_ts) {
    if (!has_requested_ts) {
      return Status::InvalidArgument("Commit timestamp unset");
    }
    // Nothing was set before, so adopt the caller's timestamp.
    const Status set_ts_status = SetCommitTimestamp(ts);
    if (!set_ts_status.ok()) {
      return set_ts_status;
    }
  } else if (has_requested_ts && ts != existing_ts) {
    // A conflicting timestamp is currently rejected rather than reconciled.
    return Status::InvalidArgument("Different commit ts specified");
  }

  SetSnapshotOnNextOperation(notifier);

  Status commit_status = Commit();
  if (!commit_status.ok()) {
    return commit_status;
  }
  assert(commit_status.ok());

  // From this point on the function always reports success.
  std::shared_ptr<const Snapshot> created = GetTimestampedSnapshot();
  if (caller_wants_snapshot) {
    *snapshot = created;
  }
  return Status::OK();
}
|
|
|
|
|
2020-10-19 17:12:53 +00:00
|
|
|
// Builds the shared transaction state on top of `db`.
//
// `db` must actually be a DBImpl (checked by the cast helper and by the
// assert in the body). `write_options` is copied into the transaction and
// also supplies protection_bytes_per_key for both internal batches.
// `lock_tracker_factory` is kept by the transaction and used here to create
// the initial lock tracker.
TransactionBaseImpl::TransactionBaseImpl(
    DB* db, const WriteOptions& write_options,
    const LockTrackerFactory& lock_tracker_factory)
    : db_(db),
      dbimpl_(static_cast_with_check<DBImpl>(db)),
      write_options_(write_options),
      cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())),
      lock_tracker_factory_(lock_tracker_factory),
      start_time_(dbimpl_->GetSystemClock()->NowMicros()),
      write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key),
      tracked_locks_(lock_tracker_factory_.Create()),
      commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */,
                         write_options.protection_bytes_per_key,
                         0 /* default_cf_ts_sz */),
      indexing_enabled_(true) {
  log_number_ = 0;
  assert(dynamic_cast<DBImpl*>(db_) != nullptr);

  // The write batch only gets its extra initialization when the DB allows
  // two-phase commit.
  if (dbimpl_->allow_2pc()) {
    InitWriteBatch();
  }
}
|
2015-08-21 22:47:21 +00:00
|
|
|
|
2016-02-03 03:19:17 +00:00
|
|
|
TransactionBaseImpl::~TransactionBaseImpl() {
  // Installing a null snapshot drops this transaction's reference to any
  // snapshot it was still holding.
  const Snapshot* const no_snapshot = nullptr;
  SetSnapshotInternal(no_snapshot);
}
|
2015-08-21 22:47:21 +00:00
|
|
|
|
2015-08-25 02:13:18 +00:00
|
|
|
void TransactionBaseImpl::Clear() {
|
|
|
|
save_points_.reset(nullptr);
|
2016-01-28 01:11:44 +00:00
|
|
|
write_batch_.Clear();
|
2016-04-18 18:15:50 +00:00
|
|
|
commit_time_batch_.Clear();
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
tracked_locks_->Clear();
|
2015-08-25 02:13:18 +00:00
|
|
|
num_puts_ = 0;
|
2024-05-06 21:41:00 +00:00
|
|
|
num_put_entities_ = 0;
|
2015-08-25 02:13:18 +00:00
|
|
|
num_deletes_ = 0;
|
|
|
|
num_merges_ = 0;
|
2016-04-18 18:15:50 +00:00
|
|
|
|
|
|
|
if (dbimpl_->allow_2pc()) {
|
2019-07-31 20:36:22 +00:00
|
|
|
InitWriteBatch();
|
2016-04-18 18:15:50 +00:00
|
|
|
}
|
2015-08-25 02:13:18 +00:00
|
|
|
}
|
|
|
|
|
2016-03-03 23:36:26 +00:00
|
|
|
// Resets this object so it can be reused as a new transaction on `db` with
// `write_options`.
//
// NOTE(review): db_ is replaced here but dbimpl_ is not reassigned;
// presumably callers always reinitialize against the same DB -- confirm.
void TransactionBaseImpl::Reinitialize(DB* db,
                                       const WriteOptions& write_options) {
  // Drop everything left over from the previous use first.
  Clear();
  ClearSnapshot();

  // Identity and bookkeeping.
  id_ = 0;
  name_.clear();
  log_number_ = 0;
  indexing_enabled_ = true;

  // New target DB and options.
  db_ = db;
  write_options_ = write_options;
  start_time_ = dbimpl_->GetSystemClock()->NowMicros();
  cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily());

  // Bring the write batch in line with the (possibly different) comparator's
  // timestamp size.
  WriteBatchInternal::SetDefaultColumnFamilyTimestampSize(
      write_batch_.GetWriteBatch(), cmp_->timestamp_size());

  // Apply the new per-key protection setting to both batches; the returned
  // statuses are intentionally not inspected.
  const auto protection_bytes = write_options_.protection_bytes_per_key;
  WriteBatchInternal::UpdateProtectionInfo(write_batch_.GetWriteBatch(),
                                           protection_bytes)
      .PermitUncheckedError();
  WriteBatchInternal::UpdateProtectionInfo(&commit_time_batch_,
                                           protection_bytes)
      .PermitUncheckedError();
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
void TransactionBaseImpl::SetSnapshot() {
|
2016-04-18 18:15:50 +00:00
|
|
|
const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary();
|
2016-02-03 03:19:17 +00:00
|
|
|
SetSnapshotInternal(snapshot);
|
|
|
|
}
|
|
|
|
|
|
|
|
void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) {
|
2016-01-28 01:11:44 +00:00
|
|
|
// Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to
|
|
|
|
// be released, not deleted when it is no longer referenced.
|
|
|
|
snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot,
|
|
|
|
this, std::placeholders::_1, db_));
|
2015-09-28 19:12:17 +00:00
|
|
|
snapshot_needed_ = false;
|
2015-12-04 18:12:27 +00:00
|
|
|
snapshot_notifier_ = nullptr;
|
2015-09-28 19:12:17 +00:00
|
|
|
}
|
|
|
|
|
2015-12-04 18:12:27 +00:00
|
|
|
void TransactionBaseImpl::SetSnapshotOnNextOperation(
|
|
|
|
std::shared_ptr<TransactionNotifier> notifier) {
|
2015-09-28 19:12:17 +00:00
|
|
|
snapshot_needed_ = true;
|
2015-12-04 18:12:27 +00:00
|
|
|
snapshot_notifier_ = notifier;
|
2015-09-28 19:12:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void TransactionBaseImpl::SetSnapshotIfNeeded() {
|
|
|
|
if (snapshot_needed_) {
|
2015-12-04 18:12:27 +00:00
|
|
|
std::shared_ptr<TransactionNotifier> notifier = snapshot_notifier_;
|
2015-09-28 19:12:17 +00:00
|
|
|
SetSnapshot();
|
2015-12-04 18:12:27 +00:00
|
|
|
if (notifier != nullptr) {
|
|
|
|
notifier->SnapshotCreated(GetSnapshot());
|
|
|
|
}
|
2015-09-28 19:12:17 +00:00
|
|
|
}
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// SliceParts convenience overload: concatenates all parts of `key` into one
// contiguous buffer and forwards to the single-key TryLock() with the
// remaining arguments unchanged.
Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family,
                                    const SliceParts& key, bool read_only,
                                    bool exclusive, const bool do_validate,
                                    const bool assume_tracked) {
  const int part_count = key.num_parts;

  // First pass: total length, so the buffer is sized once up front.
  size_t total_len = 0;
  for (int idx = 0; idx < part_count; ++idx) {
    total_len += key.parts[idx].size();
  }

  // Second pass: stitch the parts together in order.
  std::string joined_key;
  joined_key.reserve(total_len);
  for (int idx = 0; idx < part_count; ++idx) {
    const auto& part = key.parts[idx];
    joined_key.append(part.data(), part.size());
  }

  return TryLock(column_family, joined_key, read_only, exclusive, do_validate,
                 assume_tracked);
}
|
|
|
|
|
|
|
|
void TransactionBaseImpl::SetSavePoint() {
|
|
|
|
if (save_points_ == nullptr) {
|
2022-02-22 22:19:02 +00:00
|
|
|
save_points_.reset(
|
|
|
|
new std::stack<TransactionBaseImpl::SavePoint,
|
|
|
|
autovector<TransactionBaseImpl::SavePoint>>());
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
2015-12-04 18:12:27 +00:00
|
|
|
save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_,
|
2024-05-06 21:41:00 +00:00
|
|
|
num_puts_, num_put_entities_, num_deletes_, num_merges_,
|
2020-10-19 17:12:53 +00:00
|
|
|
lock_tracker_factory_);
|
2016-01-28 01:11:44 +00:00
|
|
|
write_batch_.SetSavePoint();
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status TransactionBaseImpl::RollbackToSavePoint() {
|
|
|
|
if (save_points_ != nullptr && save_points_->size() > 0) {
|
2015-08-25 02:13:18 +00:00
|
|
|
// Restore saved SavePoint
|
|
|
|
TransactionBaseImpl::SavePoint& save_point = save_points_->top();
|
|
|
|
snapshot_ = save_point.snapshot_;
|
2015-09-28 19:12:17 +00:00
|
|
|
snapshot_needed_ = save_point.snapshot_needed_;
|
2015-12-04 18:12:27 +00:00
|
|
|
snapshot_notifier_ = save_point.snapshot_notifier_;
|
2015-08-25 02:13:18 +00:00
|
|
|
num_puts_ = save_point.num_puts_;
|
2024-05-06 21:41:00 +00:00
|
|
|
num_put_entities_ = save_point.num_put_entities_;
|
2015-08-25 02:13:18 +00:00
|
|
|
num_deletes_ = save_point.num_deletes_;
|
|
|
|
num_merges_ = save_point.num_merges_;
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
// Rollback batch
|
2016-01-28 01:11:44 +00:00
|
|
|
Status s = write_batch_.RollbackToSavePoint();
|
2015-08-21 22:47:21 +00:00
|
|
|
assert(s.ok());
|
|
|
|
|
2015-09-12 01:10:50 +00:00
|
|
|
// Rollback any keys that were tracked since the last savepoint
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
tracked_locks_->Subtract(*save_point.new_locks_);
|
2015-09-12 01:10:50 +00:00
|
|
|
|
|
|
|
save_points_->pop();
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
return s;
|
|
|
|
} else {
|
2016-01-28 01:11:44 +00:00
|
|
|
assert(write_batch_.RollbackToSavePoint().IsNotFound());
|
2015-08-21 22:47:21 +00:00
|
|
|
return Status::NotFound();
|
|
|
|
}
|
|
|
|
}
|
2018-12-13 22:12:02 +00:00
|
|
|
|
2018-08-17 18:53:33 +00:00
|
|
|
Status TransactionBaseImpl::PopSavePoint() {
|
2022-10-25 21:15:22 +00:00
|
|
|
if (save_points_ == nullptr || save_points_->empty()) {
|
2018-08-17 18:53:33 +00:00
|
|
|
// No SavePoint yet.
|
|
|
|
assert(write_batch_.PopSavePoint().IsNotFound());
|
|
|
|
return Status::NotFound();
|
|
|
|
}
|
|
|
|
|
2018-12-13 22:12:02 +00:00
|
|
|
assert(!save_points_->empty());
|
2019-07-26 18:31:46 +00:00
|
|
|
// If there is another savepoint A below the current savepoint B, then A needs
|
|
|
|
// to inherit tracked_keys in B so that if we rollback to savepoint A, we
|
|
|
|
// remember to unlock keys in B. If there is no other savepoint below, then we
|
|
|
|
// can safely discard savepoint info.
|
|
|
|
if (save_points_->size() == 1) {
|
|
|
|
save_points_->pop();
|
|
|
|
} else {
|
2020-10-19 17:12:53 +00:00
|
|
|
TransactionBaseImpl::SavePoint top(lock_tracker_factory_);
|
2019-07-26 18:31:46 +00:00
|
|
|
std::swap(top, save_points_->top());
|
|
|
|
save_points_->pop();
|
|
|
|
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
save_points_->top().new_locks_->Merge(*top.new_locks_);
|
2019-07-26 18:31:46 +00:00
|
|
|
}
|
|
|
|
|
2018-08-17 18:53:33 +00:00
|
|
|
return write_batch_.PopSavePoint();
|
|
|
|
}
|
2015-08-21 22:47:21 +00:00
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
Status TransactionBaseImpl::Get(const ReadOptions& _read_options,
|
2015-08-21 22:47:21 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, std::string* value) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kGet) {
|
2023-04-21 16:07:18 +00:00
|
|
|
return Status::InvalidArgument(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
"Can only call Get with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`");
|
2023-04-21 16:07:18 +00:00
|
|
|
}
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kGet;
|
|
|
|
}
|
|
|
|
auto s = GetImpl(read_options, column_family, key, value);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status TransactionBaseImpl::GetImpl(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, std::string* value) {
|
2017-08-23 17:01:17 +00:00
|
|
|
assert(value != nullptr);
|
|
|
|
PinnableSlice pinnable_val(value);
|
|
|
|
assert(!pinnable_val.IsPinned());
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
auto s = GetImpl(read_options, column_family, key, &pinnable_val);
|
2017-08-23 17:01:17 +00:00
|
|
|
if (s.ok() && pinnable_val.IsPinned()) {
|
|
|
|
value->assign(pinnable_val.data(), pinnable_val.size());
|
|
|
|
} // else value is already assigned
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
Status TransactionBaseImpl::Get(const ReadOptions& _read_options,
|
2017-08-23 17:01:17 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, PinnableSlice* pinnable_val) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kGet) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Can only call Get with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`");
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kGet;
|
|
|
|
}
|
|
|
|
return GetImpl(read_options, column_family, key, pinnable_val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// PinnableSlice flavour of GetImpl(): hands the lookup to
// write_batch_.GetFromBatchAndDB() together with db_ and returns its Status
// unchanged. No validation of `read_options` is done here.
Status TransactionBaseImpl::GetImpl(const ReadOptions& read_options,
                                    ColumnFamilyHandle* column_family,
                                    const Slice& key,
                                    PinnableSlice* pinnable_val) {
  Status lookup_status = write_batch_.GetFromBatchAndDB(
      db_, read_options, column_family, key, pinnable_val);
  return lookup_status;
}
|
|
|
|
|
|
|
|
Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
2016-12-06 01:18:14 +00:00
|
|
|
const Slice& key, std::string* value,
|
2018-12-07 01:46:57 +00:00
|
|
|
bool exclusive,
|
|
|
|
const bool do_validate) {
|
|
|
|
if (!do_validate && read_options.snapshot != nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"If do_validate is false then GetForUpdate with snapshot is not "
|
|
|
|
"defined.");
|
|
|
|
}
|
2023-04-21 16:07:18 +00:00
|
|
|
if (read_options.io_activity != Env::IOActivity::kUnknown) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call GetForUpdate with `ReadOptions::io_activity` != "
|
|
|
|
"`Env::IOActivity::kUnknown`");
|
|
|
|
}
|
2018-12-07 01:46:57 +00:00
|
|
|
Status s =
|
|
|
|
TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
|
2015-08-21 22:47:21 +00:00
|
|
|
|
|
|
|
if (s.ok() && value != nullptr) {
|
2017-08-23 17:01:17 +00:00
|
|
|
assert(value != nullptr);
|
|
|
|
PinnableSlice pinnable_val(value);
|
|
|
|
assert(!pinnable_val.IsPinned());
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
s = GetImpl(read_options, column_family, key, &pinnable_val);
|
2017-08-23 17:01:17 +00:00
|
|
|
if (s.ok() && pinnable_val.IsPinned()) {
|
|
|
|
value->assign(pinnable_val.data(), pinnable_val.size());
|
|
|
|
} // else value is already assigned
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key,
|
|
|
|
PinnableSlice* pinnable_val,
|
2018-12-07 01:46:57 +00:00
|
|
|
bool exclusive,
|
|
|
|
const bool do_validate) {
|
|
|
|
if (!do_validate && read_options.snapshot != nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"If do_validate is false then GetForUpdate with snapshot is not "
|
|
|
|
"defined.");
|
|
|
|
}
|
2023-04-21 16:07:18 +00:00
|
|
|
if (read_options.io_activity != Env::IOActivity::kUnknown) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call GetForUpdate with `ReadOptions::io_activity` != "
|
|
|
|
"`Env::IOActivity::kUnknown`");
|
|
|
|
}
|
2018-12-07 01:46:57 +00:00
|
|
|
Status s =
|
|
|
|
TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
|
2017-08-23 17:01:17 +00:00
|
|
|
|
|
|
|
if (s.ok() && pinnable_val != nullptr) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
s = GetImpl(read_options, column_family, key, pinnable_val);
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Status> TransactionBaseImpl::MultiGet(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
const ReadOptions& _read_options,
|
2015-08-21 22:47:21 +00:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
size_t num_keys = keys.size();
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdowns are guarded by `kExceptDetailedTimers` since measurements show they cause a 4-5% regression relative to upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kMultiGet) {
|
2023-04-21 16:07:18 +00:00
|
|
|
Status s = Status::InvalidArgument(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdowns are guarded by `kExceptDetailedTimers` since measurements show they cause a 4-5% regression relative to upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
"Can only call MultiGet with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`");
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
stat_list[i] = s;
|
|
|
|
}
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kMultiGet;
|
2023-04-21 16:07:18 +00:00
|
|
|
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
values->resize(num_keys);
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdowns are guarded by `kExceptDetailedTimers` since measurements show they cause a 4-5% regression relative to upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
stat_list[i] =
|
|
|
|
GetImpl(read_options, column_family[i], keys[i], &(*values)[i]);
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdowns are guarded by `kExceptDetailedTimers` since measurements show they cause a 4-5% regression relative to upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
void TransactionBaseImpl::MultiGet(const ReadOptions& _read_options,
|
2019-04-23 21:08:24 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const size_t num_keys, const Slice* keys,
|
|
|
|
PinnableSlice* values, Status* statuses,
|
2019-11-27 00:55:46 +00:00
|
|
|
const bool sorted_input) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kMultiGet) {
|
|
|
|
Status s = Status::InvalidArgument(
|
|
|
|
"Can only call MultiGet with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`");
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
if (statuses[i].ok()) {
|
|
|
|
statuses[i] = s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kMultiGet;
|
|
|
|
}
|
2019-04-23 21:08:24 +00:00
|
|
|
write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family,
|
|
|
|
num_keys, keys, values, statuses,
|
|
|
|
sorted_input);
|
|
|
|
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
std::vector<Status> TransactionBaseImpl::MultiGetForUpdate(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
size_t num_keys = keys.size();
|
2023-04-21 16:07:18 +00:00
|
|
|
if (read_options.io_activity != Env::IOActivity::kUnknown) {
|
|
|
|
Status s = Status::InvalidArgument(
|
|
|
|
"Cannot call MultiGetForUpdate with `ReadOptions::io_activity` != "
|
|
|
|
"`Env::IOActivity::kUnknown`");
|
|
|
|
return std::vector<Status>(num_keys, s);
|
|
|
|
}
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
// Regardless of whether the MultiGet succeeded, track these keys.
|
2015-08-21 22:47:21 +00:00
|
|
|
values->resize(num_keys);
|
|
|
|
|
|
|
|
// Lock all keys
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
2016-12-06 01:18:14 +00:00
|
|
|
Status s = TryLock(column_family[i], keys[i], true /* read_only */,
|
|
|
|
true /* exclusive */);
|
2015-08-21 22:47:21 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
// Fail entire multiget if we cannot lock all keys
|
|
|
|
return std::vector<Status>(num_keys, s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(agiardullo): optimize multiget?
|
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
stat_list[i] =
|
|
|
|
GetImpl(read_options, column_family[i], keys[i], &(*values)[i]);
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
|
|
|
|
Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) {
|
|
|
|
Iterator* db_iter = db_->NewIterator(read_options);
|
|
|
|
assert(db_iter);
|
|
|
|
|
Add Merge Operator support to WriteBatchWithIndex (#8135)
Summary:
The WBWI has two differing modes of operation dependent on the value
of the constructor parameter `overwrite_key`.
Currently, regardless of the parameter, neither mode performs as
expected when using Merge. This PR remedies this by correctly invoking
the appropriate Merge Operator before returning results from the WBWI.
Examples of issues that exist which are solved by this PR:
## Example 1 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `v2`, that is to say that the Merge behaves like a Put.
## Example 2 with o`verwrite_key=true`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
Get('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 3 with `overwrite_key=false`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 4 with `overwrite_key=true`
Currently, with a database containing `('k1' -> 'v1')`, the following sequence:
```
Merge('k1', 'v1')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
## Example 5 with `overwrite_key=false`
Currently, from an empty database, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `v1,v2`
## Example 6 with `overwrite_key=true`
Currently, from an empty database, `('k1' -> 'v1')`, the following sequence:
```
Put('k1', 'v1')
Merge('k1', 'v2')
GetFromBatchAndDB('k1')
```
Incorrectly yields `ERROR: kMergeInProgress`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8135
Reviewed By: pdillinger
Differential Revision: D27657938
Pulled By: mrambacher
fbshipit-source-id: 0fbda6bbc66bedeba96a84786d90141d776297df
2021-05-10 19:49:25 +00:00
|
|
|
return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter,
|
|
|
|
&read_options);
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns an iterator for `column_family`, built by handing a fresh DB
// iterator for that column family to write_batch_.NewIteratorWithBase().
Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options,
                                           ColumnFamilyHandle* column_family) {
  Iterator* const base_iter = db_->NewIterator(read_options, column_family);
  assert(base_iter);

  return write_batch_.NewIteratorWithBase(column_family, base_iter,
                                          &read_options);
}
|
|
|
|
|
2024-05-06 21:41:00 +00:00
|
|
|
// Calls TryLock() on `key` (read_only=false, exclusive=true) and, if that
// succeeds, appends a PutEntity record for `columns` to the batch returned
// by GetBatchForWrite(). num_put_entities_ is incremented only when both
// steps succeed; otherwise the first failing Status is returned.
Status TransactionBaseImpl::PutEntityImpl(ColumnFamilyHandle* column_family,
                                          const Slice& key,
                                          const WideColumns& columns,
                                          bool do_validate,
                                          bool assume_tracked) {
  const Status lock_status =
      TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true,
              do_validate, assume_tracked);
  if (!lock_status.ok()) {
    return lock_status;
  }

  const Status write_status =
      GetBatchForWrite()->PutEntity(column_family, key, columns);
  if (!write_status.ok()) {
    return write_status;
  }

  ++num_put_entities_;
  return Status::OK();
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
// Calls TryLock() on `key` (read_only=false, exclusive=true, with
// do_validate = !assume_tracked) and, on success, appends the Put to the
// batch returned by GetBatchForWrite(). num_puts_ is incremented only when
// the batch write succeeds. Returns the first failing Status, or the batch
// write's Status.
Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
                                const Slice& key, const Slice& value,
                                const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Put(column_family, key, value);
  if (result.ok()) {
    ++num_puts_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// SliceParts overload of Put(): calls TryLock() on `key` (read_only=false,
// exclusive=true, with do_validate = !assume_tracked) and, on success,
// appends the Put to the batch returned by GetBatchForWrite(). num_puts_ is
// incremented only when the batch write succeeds.
Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
                                const SliceParts& key, const SliceParts& value,
                                const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Put(column_family, key, value);
  if (result.ok()) {
    ++num_puts_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// Calls TryLock() on `key` (read_only=false, exclusive=true, with
// do_validate = !assume_tracked) and, on success, appends the Merge to the
// batch returned by GetBatchForWrite(). num_merges_ is incremented only when
// the batch write succeeds.
Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family,
                                  const Slice& key, const Slice& value,
                                  const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Merge(column_family, key, value);
  if (result.ok()) {
    ++num_merges_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// Calls TryLock() on `key` (read_only=false, exclusive=true, with
// do_validate = !assume_tracked) and, on success, appends the Delete to the
// batch returned by GetBatchForWrite(). num_deletes_ is incremented only
// when the batch write succeeds.
Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
                                   const Slice& key,
                                   const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Delete(column_family, key);
  if (result.ok()) {
    ++num_deletes_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// SliceParts overload of Delete(): calls TryLock() on `key`
// (read_only=false, exclusive=true, with do_validate = !assume_tracked) and,
// on success, appends the Delete to the batch returned by GetBatchForWrite().
// num_deletes_ is incremented only when the batch write succeeds.
Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
                                   const SliceParts& key,
                                   const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Delete(column_family, key);
  if (result.ok()) {
    ++num_deletes_;
  }
  return result;
}
|
|
|
|
|
2015-09-25 01:31:32 +00:00
|
|
|
// Calls TryLock() on `key` (read_only=false, exclusive=true, with
// do_validate = !assume_tracked) and, on success, appends the SingleDelete
// to the batch returned by GetBatchForWrite(). A successful write is counted
// in num_deletes_.
Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
                                         const Slice& key,
                                         const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->SingleDelete(column_family, key);
  if (result.ok()) {
    ++num_deletes_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// SliceParts overload of SingleDelete(): calls TryLock() on `key`
// (read_only=false, exclusive=true, with do_validate = !assume_tracked) and,
// on success, appends the SingleDelete to the batch returned by
// GetBatchForWrite(). A successful write is counted in num_deletes_.
Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
                                         const SliceParts& key,
                                         const bool assume_tracked) {
  Status result =
      TryLock(column_family, key, false /* read_only */, true /* exclusive */,
              !assume_tracked /* do_validate */, assume_tracked);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->SingleDelete(column_family, key);
  if (result.ok()) {
    ++num_deletes_;
  }
  return result;
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
// Calls TryLock() on `key` with read_only=false, exclusive=true and
// do_validate=false, then appends the Put to the batch returned by
// GetBatchForWrite(). num_puts_ is incremented only when the batch write
// succeeds.
Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                         const Slice& key, const Slice& value) {
  Status result = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!result.ok()) {
    return result;
  }

  result = GetBatchForWrite()->Put(column_family, key, value);
  if (result.ok()) {
    ++num_puts_;
  }
  return result;
}
|
|
|
|
|
|
|
|
// SliceParts overload of PutUntracked(): writes `key` -> `value` into
// `column_family` through this transaction.
//
// TryLock() is called with read_only = false, exclusive = true and
// do_validate = false. If it fails, its status is returned and nothing is
// written. Otherwise the Put is added to the batch returned by
// GetBatchForWrite(), and `num_puts_` is incremented only when that write
// succeeds.
Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
                                         const SliceParts& key,
                                         const SliceParts& value) {
  Status status = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!status.ok()) {
    return status;
  }

  status = GetBatchForWrite()->Put(column_family, key, value);
  if (status.ok()) {
    ++num_puts_;
  }
  return status;
}
|
|
|
|
|
|
|
|
// Adds a Merge of `value` into `key` in `column_family` through this
// transaction.
//
// TryLock() is called with read_only = false, exclusive = true and
// do_validate = false. If it fails, its status is returned and nothing is
// written. Otherwise the Merge is added to the batch returned by
// GetBatchForWrite(), and `num_merges_` is incremented only when that write
// succeeds.
Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family,
                                           const Slice& key,
                                           const Slice& value) {
  Status status = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!status.ok()) {
    return status;
  }

  status = GetBatchForWrite()->Merge(column_family, key, value);
  if (status.ok()) {
    ++num_merges_;
  }
  return status;
}
|
|
|
|
|
|
|
|
// Deletes `key` from `column_family` through this transaction.
//
// TryLock() is called with read_only = false, exclusive = true and
// do_validate = false. If it fails, its status is returned and nothing is
// written. Otherwise the Delete is added to the batch returned by
// GetBatchForWrite(), and `num_deletes_` is incremented only when that write
// succeeds.
Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                            const Slice& key) {
  Status status = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!status.ok()) {
    return status;
  }

  status = GetBatchForWrite()->Delete(column_family, key);
  if (status.ok()) {
    ++num_deletes_;
  }
  return status;
}
|
|
|
|
|
|
|
|
// SliceParts overload of DeleteUntracked(): deletes `key` from
// `column_family` through this transaction.
//
// TryLock() is called with read_only = false, exclusive = true and
// do_validate = false. If it fails, its status is returned and nothing is
// written. Otherwise the Delete is added to the batch returned by
// GetBatchForWrite(), and `num_deletes_` is incremented only when that write
// succeeds.
Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
                                            const SliceParts& key) {
  Status status = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!status.ok()) {
    return status;
  }

  status = GetBatchForWrite()->Delete(column_family, key);
  if (status.ok()) {
    ++num_deletes_;
  }
  return status;
}
|
|
|
|
|
2017-09-27 17:24:42 +00:00
|
|
|
// Issues a SingleDelete for `key` in `column_family` through this
// transaction.
//
// TryLock() is called with read_only = false, exclusive = true and
// do_validate = false. If it fails, its status is returned and nothing is
// written. Otherwise the SingleDelete is added to the batch returned by
// GetBatchForWrite(), and `num_deletes_` is incremented only when that write
// succeeds.
Status TransactionBaseImpl::SingleDeleteUntracked(
    ColumnFamilyHandle* column_family, const Slice& key) {
  Status status = TryLock(column_family, key, false /* read_only */,
                          true /* exclusive */, false /* do_validate */);
  if (!status.ok()) {
    return status;
  }

  status = GetBatchForWrite()->SingleDelete(column_family, key);
  if (status.ok()) {
    ++num_deletes_;
  }
  return status;
}
|
|
|
|
|
2015-08-21 22:47:21 +00:00
|
|
|
void TransactionBaseImpl::PutLogData(const Slice& blob) {
|
2020-10-21 21:02:00 +00:00
|
|
|
auto s = write_batch_.PutLogData(blob);
|
|
|
|
(void)s;
|
|
|
|
assert(s.ok());
|
2015-08-21 22:47:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns a pointer to this transaction's indexed write batch
// (`write_batch_`). The pointer refers to a member of this object.
WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() {
  WriteBatchWithIndex* const indexed_batch = &write_batch_;
  return indexed_batch;
}
|
|
|
|
|
2015-08-25 02:13:18 +00:00
|
|
|
uint64_t TransactionBaseImpl::GetElapsedTime() const {
|
2021-03-15 11:32:24 +00:00
|
|
|
return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000;
|
2015-08-25 02:13:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the current value of the `num_puts_` counter, which the Put paths
// increment after a successful write to the batch.
uint64_t TransactionBaseImpl::GetNumPuts() const {
  return num_puts_;
}
|
|
|
|
|
2024-05-06 21:41:00 +00:00
|
|
|
uint64_t TransactionBaseImpl::GetNumPutEntities() const {
|
|
|
|
return num_put_entities_;
|
|
|
|
}
|
|
|
|
|
2015-08-25 02:13:18 +00:00
|
|
|
// Returns the current value of the `num_deletes_` counter, which the Delete
// and SingleDelete paths increment after a successful write to the batch.
uint64_t TransactionBaseImpl::GetNumDeletes() const {
  return num_deletes_;
}
|
|
|
|
|
|
|
|
// Returns the current value of the `num_merges_` counter, which the Merge
// paths increment after a successful write to the batch.
uint64_t TransactionBaseImpl::GetNumMerges() const {
  return num_merges_;
}
|
|
|
|
|
2015-09-12 01:10:50 +00:00
|
|
|
uint64_t TransactionBaseImpl::GetNumKeys() const {
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
return tracked_locks_->GetNumPointLocks();
|
2015-09-12 01:10:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key,
|
2017-04-10 22:47:20 +00:00
|
|
|
SequenceNumber seq, bool read_only,
|
|
|
|
bool exclusive) {
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
PointLockRequest r;
|
|
|
|
r.column_family_id = cfh_id;
|
|
|
|
r.key = key;
|
|
|
|
r.seq = seq;
|
|
|
|
r.read_only = read_only;
|
|
|
|
r.exclusive = exclusive;
|
|
|
|
|
2015-09-15 00:11:52 +00:00
|
|
|
// Update map of all tracked keys for this transaction
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
tracked_locks_->Track(r);
|
2015-09-15 00:11:52 +00:00
|
|
|
|
|
|
|
if (save_points_ != nullptr && !save_points_->empty()) {
|
|
|
|
// Update map of tracked keys in this SavePoint
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
save_points_->top().new_locks_->Track(r);
|
2015-09-12 01:10:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-06 21:41:00 +00:00
|
|
|
// Gets the write batch that should be used for Put/PutEntity/Merge/Delete
|
|
|
|
// operations.
|
2015-10-09 20:31:10 +00:00
|
|
|
//
|
|
|
|
// Returns either a WriteBatch or WriteBatchWithIndex depending on whether
|
|
|
|
// DisableIndexing() has been called.
|
|
|
|
WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() {
|
|
|
|
if (indexing_enabled_) {
|
|
|
|
// Use WriteBatchWithIndex
|
2016-01-28 01:11:44 +00:00
|
|
|
return &write_batch_;
|
2015-10-09 20:31:10 +00:00
|
|
|
} else {
|
|
|
|
// Don't use WriteBatchWithIndex. Return base WriteBatch.
|
2016-01-28 01:11:44 +00:00
|
|
|
return write_batch_.GetWriteBatch();
|
2015-10-09 20:31:10 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-01-28 01:11:44 +00:00
|
|
|
void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) {
|
2016-02-03 03:19:17 +00:00
|
|
|
if (snapshot != nullptr) {
|
2019-02-20 00:52:50 +00:00
|
|
|
ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log,
|
|
|
|
"ReleaseSnapshot %" PRIu64 " Set",
|
|
|
|
snapshot->GetSequenceNumber());
|
2016-02-03 03:19:17 +00:00
|
|
|
db->ReleaseSnapshot(snapshot);
|
|
|
|
}
|
2016-01-28 01:11:44 +00:00
|
|
|
}
|
|
|
|
|
2015-09-15 00:11:52 +00:00
|
|
|
void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
PointLockRequest r;
|
|
|
|
r.column_family_id = GetColumnFamilyID(column_family);
|
|
|
|
r.key = key.ToString();
|
|
|
|
r.read_only = true;
|
2015-09-15 00:11:52 +00:00
|
|
|
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
bool can_untrack = false;
|
2015-09-15 00:11:52 +00:00
|
|
|
if (save_points_ != nullptr && !save_points_->empty()) {
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
// If there is no GetForUpdate of the key in this save point,
|
|
|
|
// then cannot untrack from the global lock tracker.
|
|
|
|
UntrackStatus s = save_points_->top().new_locks_->Untrack(r);
|
|
|
|
can_untrack = (s != UntrackStatus::NOT_TRACKED);
|
2015-09-15 00:11:52 +00:00
|
|
|
} else {
|
Replace tracked_keys with a new LockTracker interface in TransactionDB (#7013)
Summary:
We're going to support more locking protocols such as range lock in transaction.
However, in current design, `TransactionBase` has a member `tracked_keys` which assumes that point lock (lock a single key) is used, and is used in snapshot checking (isolation protocol). When using range lock, we may use read committed instead of snapshot checking as the isolation protocol.
The most significant usage scenarios of `tracked_keys` are:
1. pessimistic transaction uses it to track the locked keys, and unlock these keys when commit or rollback.
2. optimistic transaction does not lock keys upfront, it only tracks the lock intentions in tracked_keys, and do write conflict checking when commit.
3. each `SavePoint` tracks the keys that are locked since the `SavePoint`, `RollbackToSavePoint` or `PopSavePoint` relies on both the tracked keys in `SavePoint`s and `tracked_keys`.
Based on these scenarios, if we can abstract out a `LockTracker` interface to hold a set of tracked locks (can be keys or key ranges), and have methods that can be composed together to implement the scenarios, then `tracked_keys` can be an internal data structure of one implementation of `LockTracker`. See `utilities/transactions/lock/lock_tracker.h` for the detailed interface design, and `utilities/transactions/lock/point_lock_tracker.cc` for the implementation.
In the future, a `RangeLockTracker` can be implemented to track range locks without affecting other components.
After this PR, a clean interface for lock manager should be possible, and then ideally, we can have pluggable locking protocols.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7013
Test Plan: Run `transaction_test` and `optimistic_transaction_test`.
Reviewed By: ajkr
Differential Revision: D22163706
Pulled By: cheng-chang
fbshipit-source-id: f2860577b5334e31dd2994f5bc6d7c40d502b1b4
2020-08-06 19:36:48 +00:00
|
|
|
// No save point, so can untrack from the global lock tracker.
|
|
|
|
can_untrack = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (can_untrack) {
|
|
|
|
// If erased from the global tracker, then can unlock the key.
|
|
|
|
UntrackStatus s = tracked_locks_->Untrack(r);
|
|
|
|
bool can_unlock = (s == UntrackStatus::REMOVED);
|
|
|
|
if (can_unlock) {
|
|
|
|
UnlockGetForUpdate(column_family, key);
|
2015-09-15 00:11:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:15:50 +00:00
|
|
|
// Replays every entry of `src_batch` through this transaction's own
// Put/PutEntity/Delete/SingleDelete/Merge entry points.
//
// Returns the status produced by WriteBatch::Iterate(), i.e. the result of
// running the handler below over the batch. The handler rejects any
// prepare/commit/rollback marker with InvalidArgument.
Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) {
  // Handler that forwards each batch entry to the owning transaction,
  // resolving the column family id to a handle through the DB.
  struct IndexedWriteBatchBuilder : public WriteBatch::Handler {
    Transaction* txn_;
    DBImpl* db_;
    IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db)
        : txn_(txn), db_(db) {
      assert(dynamic_cast<TransactionBaseImpl*>(txn_) != nullptr);
    }

    Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
      return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val);
    }

    Status PutEntityCF(uint32_t cf, const Slice& key,
                       const Slice& entity) override {
      // Deserialize() takes the slice by non-const reference, so work on a
      // copy rather than on the handler's const input.
      Slice entity_copy = entity;
      WideColumns columns;
      const Status s =
          WideColumnSerialization::Deserialize(entity_copy, columns);
      if (!s.ok()) {
        return s;
      }

      return txn_->PutEntity(db_->GetColumnFamilyHandle(cf), key, columns);
    }

    Status DeleteCF(uint32_t cf, const Slice& key) override {
      return txn_->Delete(db_->GetColumnFamilyHandle(cf), key);
    }

    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
      return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key);
    }

    Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
      return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val);
    }

    // This is used for reconstructing prepared transactions upon
    // recovery. There should not be any meta markers in the batches
    // we are processing.
    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }

    Status MarkEndPrepare(const Slice&) override {
      return Status::InvalidArgument();
    }

    Status MarkCommit(const Slice&) override {
      return Status::InvalidArgument();
    }

    Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
      return Status::InvalidArgument();
    }

    Status MarkRollback(const Slice&) override {
      return Status::InvalidArgument();
    }
  };

  IndexedWriteBatchBuilder rebuilder(this, dbimpl_);
  return src_batch->Iterate(&rebuilder);
}
|
|
|
|
|
|
|
|
// Returns a pointer to this transaction's `commit_time_batch_` member.
WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() {
  WriteBatch* const commit_batch = &commit_time_batch_;
  return commit_batch;
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|