2017-08-03 15:46:47 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
#include "utilities/transactions/write_prepared_txn.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2019-06-06 20:52:39 +00:00
|
|
|
#include <cinttypes>
|
2017-08-03 15:46:47 +00:00
|
|
|
#include <map>
|
2018-02-06 02:32:54 +00:00
|
|
|
#include <set>
|
2017-08-03 15:46:47 +00:00
|
|
|
|
|
|
|
#include "db/column_family.h"
|
2019-05-31 18:52:59 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/utilities/transaction_db.h"
|
2018-04-03 03:19:21 +00:00
|
|
|
#include "util/cast_util.h"
|
2017-08-07 23:07:40 +00:00
|
|
|
#include "utilities/transactions/pessimistic_transaction.h"
|
2017-11-02 18:05:55 +00:00
|
|
|
#include "utilities/transactions/write_prepared_txn_db.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2017-08-03 15:46:47 +00:00
|
|
|
|
|
|
|
struct WriteOptions;
|
|
|
|
|
2017-08-16 23:49:11 +00:00
|
|
|
WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
const TransactionOptions& txn_options)
|
2019-01-16 02:07:50 +00:00
|
|
|
: PessimisticTransaction(txn_db, write_options, txn_options, false),
|
|
|
|
wpt_db_(txn_db) {
|
|
|
|
// Call Initialize outside PessimisticTransaction constructor otherwise it
|
|
|
|
// would skip overridden functions in WritePreparedTxn since they are not
|
|
|
|
// defined yet in the constructor of PessimisticTransaction
|
|
|
|
Initialize(txn_options);
|
|
|
|
}
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2018-07-24 07:09:18 +00:00
|
|
|
// Runs the PessimisticTransaction initialization with `txn_options` and then
// resets the cached prepare sub-batch count to zero.
void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) {
  // Base-class state first, then the state owned by this class.
  PessimisticTransaction::Initialize(txn_options);
  prepare_batch_cnt_ = 0;
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
void WritePreparedTxn::MultiGet(const ReadOptions& _read_options,
|
2019-07-30 00:51:30 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const size_t num_keys, const Slice* keys,
|
|
|
|
PinnableSlice* values, Status* statuses,
|
2019-11-27 00:55:46 +00:00
|
|
|
const bool sorted_input) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kMultiGet) {
|
|
|
|
Status s = Status::InvalidArgument(
|
|
|
|
"Can only call MultiGet with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`");
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
if (statuses[i].ok()) {
|
|
|
|
statuses[i] = s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kMultiGet;
|
|
|
|
}
|
|
|
|
|
2019-07-30 00:51:30 +00:00
|
|
|
SequenceNumber min_uncommitted, snap_seq;
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(
|
|
|
|
read_options.snapshot, &min_uncommitted, &snap_seq);
|
2019-08-05 20:30:56 +00:00
|
|
|
WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
|
|
|
|
backed_by_snapshot);
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family,
|
|
|
|
num_keys, keys, values, statuses,
|
|
|
|
sorted_input, &callback);
|
2019-08-05 20:30:56 +00:00
|
|
|
if (UNLIKELY(!callback.valid() ||
|
|
|
|
!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
|
|
|
|
wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
|
2019-07-30 00:51:30 +00:00
|
|
|
for (size_t i = 0; i < num_keys; i++) {
|
|
|
|
statuses[i] = Status::TryAgain();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
Status WritePreparedTxn::Get(const ReadOptions& _read_options,
|
2017-09-11 15:58:52 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, PinnableSlice* pinnable_val) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kGet) {
|
2023-04-21 16:07:18 +00:00
|
|
|
return Status::InvalidArgument(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
"Can only call Get with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`");
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kGet;
|
2023-04-21 16:07:18 +00:00
|
|
|
}
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
|
|
|
|
return GetImpl(read_options, column_family, key, pinnable_val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Point lookup through this transaction's write batch and the DB, using a
// WritePreparedTxnReadCallback built from the sequence numbers derived from
// `options.snapshot`.
//
// Returns the status of the underlying read. An OK result is downgraded to
// TryAgain when the callback is invalid or the snapshot fails validation.
Status WritePreparedTxn::GetImpl(const ReadOptions& options,
                                 ColumnFamilyHandle* column_family,
                                 const Slice& key,
                                 PinnableSlice* pinnable_val) {
  // Both sequence numbers are handed to AssignMinMaxSeqs by address.
  SequenceNumber min_uncommitted;
  SequenceNumber snap_seq;
  const SnapshotBackup backed_by_snapshot =
      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
                                        backed_by_snapshot);

  Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
                                              pinnable_val, &callback);

  // NOTE: validity of callback must always be checked before it is
  // destructed, so query it unconditionally, even when the read failed.
  const bool callback_valid = callback.valid();

  if (!res.ok()) {
    return res;
  }

  // ValidateSnapshot is consulted only when the callback itself is valid.
  const bool snapshot_ok =
      callback_valid && wpt_db_->ValidateSnapshot(callback.max_visible_seq(),
                                                  backed_by_snapshot);
  if (LIKELY(snapshot_ok)) {
    return res;
  }

  wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
  return Status::TryAgain();
}
|
|
|
|
|
2017-10-10 00:05:34 +00:00
|
|
|
// Convenience overload: iterates over the default column family of the
// owning WritePreparedTxnDB.
Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) {
  ColumnFamilyHandle* const default_cf = wpt_db_->DefaultColumnFamily();
  return GetIterator(options, default_cf);
}
|
|
|
|
|
|
|
|
// Returns an iterator over `column_family` that layers this transaction's
// write batch on top of a DB iterator.
Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
                                        ColumnFamilyHandle* column_family) {
  // The base iterator must come from the WritePreparedTxnDB, not the root db.
  Iterator* base_iter = wpt_db_->NewIterator(options, column_family);
  assert(base_iter != nullptr);

  return write_batch_.NewIteratorWithBase(column_family, base_iter, &options);
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Writes the prepare section of this transaction.
//
// The write batch is terminated with an end-prepare marker and passed to
// DBImpl::WriteImpl() with the WAL forced on. The sequence number reported by
// the write is recorded as the transaction id through SetId(). Returns the
// status of the write.
Status WritePreparedTxn::PrepareInternal() {
  // Operate on a copy of the transaction's write options with WAL enabled.
  WriteOptions prepare_write_options = write_options_;
  prepare_write_options.disableWAL = false;

  const bool kWriteAfterCommit = true;
  const bool kFirstPrepareBatch = true;
  const bool kDisableMemtable = true;

  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
                                              name_, !kWriteAfterCommit);
  assert(s.ok());

  // Every duplicate key is accounted for as an additional sub-batch.
  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();

  // Doing AddPrepared from the PreReleaseCallback lets prepared entries be
  // added to PreparedHeap in order, which enables an optimization. See
  // SmallestUnCommittedSeq for more details.
  AddPreparedCallback add_prepared_callback(
      wpt_db_, db_impl_, prepare_batch_cnt_,
      db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch);

  // Starts at the sentinel so the assert below can tell that a successful
  // write reported a sequence number.
  uint64_t seq_used = kMaxSequenceNumber;
  s = db_impl_->WriteImpl(prepare_write_options,
                          GetWriteBatch()->GetWriteBatch(),
                          /*callback*/ nullptr, &log_number_, /*log ref*/ 0,
                          !kDisableMemtable, &seq_used, prepare_batch_cnt_,
                          &add_prepared_callback);
  assert(!s.ok() || seq_used != kMaxSequenceNumber);

  // The id is set regardless of the write's outcome.
  SetId(seq_used);
  return s;
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Commits a transaction that was never prepared by handing its write batch,
// together with its sub-batch count, to CommitBatchInternal().
Status WritePreparedTxn::CommitWithoutPrepareInternal() {
  // Every duplicate key is accounted for as an additional sub-batch.
  const size_t sub_batch_count = GetWriteBatch()->SubBatchCnt();
  WriteBatch* const raw_batch = GetWriteBatch()->GetWriteBatch();
  return CommitBatchInternal(raw_batch, sub_batch_count);
}
|
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
// Writes `batch`, which is made of `batch_cnt` sub-batches, through the
// write-prepared DB using this transaction's write options. The transaction
// itself is passed along as the last argument of WriteInternal().
Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
                                             size_t batch_cnt) {
  Status write_status =
      wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this);
  return write_status;
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Commits a transaction that was previously prepared.
//
// Appends the Commit marker to the commit-time write batch and writes it with
// DBImpl::WriteImpl. When a single write suffices (`do_one_write`), the commit
// map is updated from that write's pre-release callback. Otherwise the first
// write only registers the commit-time batch as prepared and a second write of
// an empty batch publishes the commit.
//
// Returns:
//   - Status::InvalidArgument if the commit-time batch is non-empty while
//     use_only_the_last_commit_time_batch_for_recovery_ is false;
//   - otherwise the status of the last WriteImpl call that was issued.
Status WritePreparedTxn::CommitInternal() {
  // The prepare sequence number is stored with SetId() in PrepareInternal()
  // and read back with GetId() below, so log that same value here.
  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
                    "CommitInternal prepare_seq: %" PRIu64, GetId());
  // We take the commit-time batch and append the Commit marker.
  // The Memtable will ignore the Commit marker in non-recovery mode
  WriteBatch* working_batch = GetCommitTimeWriteBatch();
  // Must be sampled before MarkCommit() appends to the batch.
  const bool empty = working_batch->Count() == 0;
  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
  assert(s.ok());

  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
  if (!empty) {
    // When not writing to memtable, we can still cache the latest write batch.
    // The cached batch will be written to memtable in WriteRecoverableState
    // during FlushMemTable
    if (for_recovery) {
      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
    } else {
      return Status::InvalidArgument(
          "Commit-time-batch can only be used if "
          "use_only_the_last_commit_time_batch_for_recovery is true");
    }
  }

  auto prepare_seq = GetId();
  // NOTE(review): because of the early return above, execution only reaches
  // this point when the batch is empty or for_recovery is true, so
  // includes_data evaluates to false here. The branches that depend on it are
  // intentionally left untouched.
  const bool includes_data = !empty && !for_recovery;
  assert(prepare_batch_cnt_);
  size_t commit_batch_cnt = 0;
  if (UNLIKELY(includes_data)) {
    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
                   "Duplicate key overhead");
    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
    s = working_batch->Iterate(&counter);
    assert(s.ok());
    commit_batch_cnt = counter.BatchCount();
  }
  const bool disable_memtable = !includes_data;
  const bool do_one_write =
      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
  // This is to call AddPrepared on CommitTimeWriteBatch
  const bool kFirstPrepareBatch = true;
  AddPreparedCallback add_prepared_callback(
      wpt_db_, db_impl_, commit_batch_cnt,
      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
  PreReleaseCallback* pre_release_callback;
  if (do_one_write) {
    pre_release_callback = &update_commit_map;
  } else {
    pre_release_callback = &add_prepared_callback;
  }
  uint64_t seq_used = kMaxSequenceNumber;
  // Since the prepared batch is directly written to memtable, there is already
  // a connection between the memtable and its WAL, so there is no need to
  // redundantly reference the log that contains the prepared data.
  const uint64_t zero_log_number = 0ull;
  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
  // If `two_write_queues && includes_data`, then `do_one_write` is false. The
  // following `WriteImpl` will insert the data of the commit-time-batch into
  // the database before updating the commit cache. Therefore, the data of the
  // commit-time-batch is considered uncommitted. Furthermore, since data of
  // the commit-time-batch are not locked, it is possible for two uncommitted
  // versions of the same key to co-exist for a (short) period of time until
  // the commit cache is updated by the second write. If the two uncommitted
  // keys are compacted to the bottommost level in the meantime, it is possible
  // that compaction iterator will zero out the sequence numbers of both, thus
  // violating the invariant that an SST does not have two identical internal
  // keys. To prevent this situation, we should allow the usage of
  // commit-time-batch only if the user sets
  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to
  // true. See the comments about GetCommitTimeWriteBatch() in
  // include/rocksdb/utilities/transaction.h.
  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
                          zero_log_number, disable_memtable, &seq_used,
                          batch_cnt, pre_release_callback);
  assert(!s.ok() || seq_used != kMaxSequenceNumber);
  const SequenceNumber commit_batch_seq = seq_used;
  if (LIKELY(do_one_write || !s.ok())) {
    if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues &&
                 s.ok())) {
      // Note: RemovePrepared should be called after WriteImpl that published
      // the seq. Otherwise SmallestUnCommittedSeq optimization breaks.
      wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_);
    }  // else RemovePrepared is called from within PreReleaseCallback
    if (UNLIKELY(!do_one_write)) {
      assert(!s.ok());
      // Cleanup the prepared entry we added with add_prepared_callback
      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
    }
    return s;
  }  // else do the 2nd write to publish seq
  // Note: the 2nd write comes with a performance penalty. So if we have too
  // many of commits accompanied with CommitTimeWriteBatch and yet we cannot
  // enable use_only_the_last_commit_time_batch_for_recovery_ optimization,
  // two_write_queues should be disabled to avoid many additional writes here.
  const size_t kZeroData = 0;
  // Update commit map only from the 2nd queue
  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch(
      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData,
      commit_batch_seq, commit_batch_cnt);
  WriteBatch empty_batch;
  s = empty_batch.PutLogData(Slice());
  assert(s.ok());
  // In the absence of Prepare markers, use Noop as a batch separator
  s = WriteBatchInternal::InsertNoop(&empty_batch);
  assert(s.ok());
  const bool DISABLE_MEMTABLE = true;
  const size_t ONE_BATCH = 1;
  const uint64_t NO_REF_LOG = 0;
  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
                          &update_commit_map_with_aux_batch);
  assert(!s.ok() || seq_used != kMaxSequenceNumber);
  return s;
}
|
|
|
|
|
2017-10-03 02:46:42 +00:00
|
|
|
Status WritePreparedTxn::RollbackInternal() {
|
2018-01-09 16:47:46 +00:00
|
|
|
ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
|
|
|
|
"RollbackInternal prepare_seq: %" PRIu64, GetId());
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
|
|
|
|
assert(db_impl_);
|
|
|
|
assert(wpt_db_);
|
|
|
|
|
2022-06-17 06:10:07 +00:00
|
|
|
WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
write_options_.protection_bytes_per_key,
|
|
|
|
0 /* default_cf_ts_sz */);
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(GetId() != kMaxSequenceNumber);
|
|
|
|
assert(GetId() > 0);
|
2018-05-03 01:09:55 +00:00
|
|
|
auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap();
|
|
|
|
auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap();
|
2019-01-07 22:53:26 +00:00
|
|
|
auto read_at_seq = kMaxSequenceNumber;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
2019-08-05 20:30:56 +00:00
|
|
|
ReadOptions roptions;
|
|
|
|
// to prevent the callback's seq from being overridden inside DBImpl::Get
|
|
|
|
roptions.snapshot = wpt_db_->GetMaxSnapshot();
|
2017-10-03 02:46:42 +00:00
|
|
|
struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
DBImpl* const db_;
|
|
|
|
WritePreparedTxnDB* const wpt_db_;
|
|
|
|
WritePreparedTxnReadCallback callback_;
|
2017-10-03 02:46:42 +00:00
|
|
|
WriteBatch* rollback_batch_;
|
2018-02-06 02:32:54 +00:00
|
|
|
std::map<uint32_t, const Comparator*>& comparators_;
|
2018-05-03 01:09:55 +00:00
|
|
|
std::map<uint32_t, ColumnFamilyHandle*>& handles_;
|
2018-02-06 02:32:54 +00:00
|
|
|
using CFKeys = std::set<Slice, SetComparator>;
|
|
|
|
std::map<uint32_t, CFKeys> keys_;
|
2018-04-12 18:52:15 +00:00
|
|
|
bool rollback_merge_operands_;
|
2019-08-05 20:30:56 +00:00
|
|
|
ReadOptions roptions_;
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
RollbackWriteBatchBuilder(
|
|
|
|
DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
|
|
|
|
WriteBatch* dst_batch,
|
2018-04-12 18:52:15 +00:00
|
|
|
std::map<uint32_t, const Comparator*>& comparators,
|
2018-05-03 01:09:55 +00:00
|
|
|
std::map<uint32_t, ColumnFamilyHandle*>& handles,
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
bool rollback_merge_operands, const ReadOptions& _roptions)
|
2018-02-06 02:32:54 +00:00
|
|
|
: db_(db),
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
wpt_db_(wpt_db),
|
|
|
|
callback_(wpt_db, snap_seq), // disable min_uncommitted optimization
|
2018-02-06 02:32:54 +00:00
|
|
|
rollback_batch_(dst_batch),
|
2018-04-12 18:52:15 +00:00
|
|
|
comparators_(comparators),
|
2018-05-03 01:09:55 +00:00
|
|
|
handles_(handles),
|
2019-08-05 20:30:56 +00:00
|
|
|
rollback_merge_operands_(rollback_merge_operands),
|
|
|
|
roptions_(_roptions) {}
|
2017-10-03 02:46:42 +00:00
|
|
|
|
|
|
|
Status Rollback(uint32_t cf, const Slice& key) {
|
2018-02-06 02:32:54 +00:00
|
|
|
Status s;
|
|
|
|
CFKeys& cf_keys = keys_[cf];
|
|
|
|
if (cf_keys.size() == 0) { // just inserted
|
|
|
|
auto cmp = comparators_[cf];
|
|
|
|
keys_[cf] = CFKeys(SetComparator(cmp));
|
|
|
|
}
|
|
|
|
auto it = cf_keys.insert(key);
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
// second is false if an element already existed.
|
|
|
|
if (it.second == false) {
|
2018-02-06 02:32:54 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-10-03 02:46:42 +00:00
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
bool not_used;
|
2018-05-03 01:09:55 +00:00
|
|
|
auto cf_handle = handles_[cf];
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
DBImpl::GetImplOptions get_impl_options;
|
|
|
|
get_impl_options.column_family = cf_handle;
|
|
|
|
get_impl_options.value = &pinnable_val;
|
|
|
|
get_impl_options.value_found = &not_used;
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
get_impl_options.callback = &callback_;
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
s = db_->GetImpl(roptions_, key, get_impl_options);
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(s.ok() || s.IsNotFound());
|
|
|
|
if (s.ok()) {
|
|
|
|
s = rollback_batch_->Put(cf_handle, key, pinnable_val);
|
|
|
|
assert(s.ok());
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// There has been no readable value before txn. By adding a delete we
|
|
|
|
// make sure that there will be none afterwards either.
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
|
|
|
|
s = rollback_batch_->SingleDelete(cf_handle, key);
|
|
|
|
} else {
|
|
|
|
s = rollback_batch_->Delete(cf_handle, key);
|
|
|
|
}
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(s.ok());
|
|
|
|
} else {
|
|
|
|
// Unexpected status. Return it to the user.
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
// A Put in the prepared batch is undone via Rollback(), which only needs the
// column family id and the key; the value that was written is ignored.
Status PutCF(uint32_t cf, const Slice& key,
             const Slice& /*val*/) override {
  return Rollback(cf, key);
}
|
|
|
|
|
|
|
|
// A Delete in the prepared batch is undone the same way as any other write to
// the key: by handing the column family id and key to Rollback().
Status DeleteCF(uint32_t cf,
                const Slice& key) override {
  return Rollback(cf, key);
}
|
|
|
|
|
|
|
|
// A SingleDelete in the prepared batch is undone through Rollback(), exactly
// like PutCF and DeleteCF.
Status SingleDeleteCF(uint32_t cf,
                      const Slice& key) override {
  return Rollback(cf, key);
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
// Merge operands are rolled back only when rollback_merge_operands_ is set;
// otherwise the entry is skipped and OK is returned. The operand value itself
// is never inspected.
Status MergeCF(uint32_t cf, const Slice& key,
               const Slice& /*val*/) override {
  if (!rollback_merge_operands_) {
    return Status::OK();
  }
  return Rollback(cf, key);
}
|
|
|
|
|
|
|
|
// Noop markers need no rollback action.
Status MarkNoop(bool) override {
  return Status::OK();
}
|
2018-07-07 00:17:36 +00:00
|
|
|
// Begin-prepare markers need no rollback action.
Status MarkBeginPrepare(bool) override {
  return Status::OK();
}
|
2017-10-03 02:46:42 +00:00
|
|
|
// End-prepare markers need no rollback action.
Status MarkEndPrepare(const Slice&) override {
  return Status::OK();
}
|
|
|
|
// Commit markers need no rollback action.
Status MarkCommit(const Slice&) override {
  return Status::OK();
}
|
|
|
|
// Unlike the other markers, a rollback marker is rejected: iterating over one
// makes this handler return InvalidArgument.
Status MarkRollback(const Slice&) override {
  return Status::InvalidArgument();
}
|
2017-11-11 19:23:43 +00:00
|
|
|
|
|
|
|
protected:
|
2022-04-28 21:42:00 +00:00
|
|
|
// Reports the write-after-commit option as disabled for this handler.
Handler::OptionState WriteAfterCommit() const override {
  return Handler::OptionState::kDisabled;
}
|
2019-01-07 22:53:26 +00:00
|
|
|
} rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
|
2018-05-03 01:09:55 +00:00
|
|
|
*cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
|
2019-08-05 20:30:56 +00:00
|
|
|
wpt_db_->txn_db_options_.rollback_merge_operands,
|
|
|
|
roptions);
|
2017-10-03 02:46:42 +00:00
|
|
|
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2017-11-15 16:19:57 +00:00
|
|
|
// The Rollback marker will be used as a batch separator
|
2020-10-21 21:02:00 +00:00
|
|
|
s = WriteBatchInternal::MarkRollback(&rollback_batch, name_);
|
|
|
|
assert(s.ok());
|
2017-12-18 16:03:18 +00:00
|
|
|
bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
|
2017-12-01 07:39:56 +00:00
|
|
|
const bool DISABLE_MEMTABLE = true;
|
2018-03-22 21:27:44 +00:00
|
|
|
const uint64_t NO_REF_LOG = 0;
|
2017-10-03 02:46:42 +00:00
|
|
|
uint64_t seq_used = kMaxSequenceNumber;
|
2018-02-06 02:32:54 +00:00
|
|
|
const size_t ONE_BATCH = 1;
|
2019-04-02 22:14:41 +00:00
|
|
|
const bool kFirstPrepareBatch = true;
|
2019-01-17 20:03:08 +00:00
|
|
|
// We commit the rolled back prepared batches. Although this is
|
2018-04-20 22:25:12 +00:00
|
|
|
// counter-intuitive, i) it is safe to do so, since the prepared batches are
|
|
|
|
// already canceled out by the rollback batch, ii) adding the commit entry to
|
|
|
|
// CommitCache will allow us to benefit from the existing mechanism in
|
|
|
|
// CommitCache that keeps an entry evicted due to max advance and yet overlaps
|
|
|
|
// with a live snapshot around so that the live snapshot properly skips the
|
|
|
|
// entry even if its prepare seq is lower than max_evicted_seq_.
|
2019-03-07 15:26:36 +00:00
|
|
|
AddPreparedCallback add_prepared_callback(
|
2019-04-02 22:14:41 +00:00
|
|
|
wpt_db_, db_impl_, ONE_BATCH,
|
|
|
|
db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
|
2017-12-18 16:03:18 +00:00
|
|
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH);
|
2019-03-07 15:26:36 +00:00
|
|
|
PreReleaseCallback* pre_release_callback;
|
|
|
|
if (do_one_write) {
|
|
|
|
pre_release_callback = &update_commit_map;
|
|
|
|
} else {
|
|
|
|
pre_release_callback = &add_prepared_callback;
|
|
|
|
}
|
2018-04-03 03:19:21 +00:00
|
|
|
// Note: the rollback batch does not need AddPrepared since it is written to
|
|
|
|
// DB in one shot. min_uncommitted still works since it requires capturing
|
|
|
|
// data that is written to DB but not yet committed, while
|
2019-01-17 20:03:08 +00:00
|
|
|
// the rollback batch commits with PreReleaseCallback.
|
2017-10-03 02:46:42 +00:00
|
|
|
s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
|
2018-03-22 21:27:44 +00:00
|
|
|
NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
2019-03-07 15:26:36 +00:00
|
|
|
pre_release_callback);
|
2018-01-09 16:47:46 +00:00
|
|
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
2017-12-01 07:39:56 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2017-12-18 16:03:18 +00:00
|
|
|
if (do_one_write) {
|
2019-06-10 18:47:16 +00:00
|
|
|
assert(!db_impl_->immutable_db_options().two_write_queues);
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
|
2017-12-18 16:03:18 +00:00
|
|
|
return s;
|
|
|
|
} // else do the 2nd write for commit
|
2019-03-07 15:26:36 +00:00
|
|
|
uint64_t rollback_seq = seq_used;
|
2018-01-09 16:47:46 +00:00
|
|
|
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
2019-03-07 15:26:36 +00:00
|
|
|
"RollbackInternal 2nd write rollback_seq: %" PRIu64,
|
|
|
|
rollback_seq);
|
2017-12-01 07:39:56 +00:00
|
|
|
// Commit the batch by writing an empty batch to the queue that will release
|
|
|
|
// the commit sequence number to readers.
|
WritePrepared: fix two versions in compaction see different status for released snapshots (#4890)
Summary:
Fix how CompactionIterator::findEarliestVisibleSnapshots handles released snapshot. It fixing the two scenarios:
Scenario 1:
key1 has two values v1 and v2. There're two snapshots s1 and s2 taken after v1 and v2 are committed. Right after compaction output v2, s1 is released. Now findEarliestVisibleSnapshot may see s1 being released, and return the next snapshot, which is s2. That's larger than v2's earliest visible snapshot, which was s1.
The fix: the only place we check against last snapshot and current key snapshot is when we decide whether to compact out a value if it is hidden by a later value. In the check if we see current snapshot is even larger than last snapshot, we know last snapshot is released, and we are safe to compact out current key.
Scenario 2:
key1 has two values v1 and v2. there are two snapshots s1 and s2 taken after v1 and v2 are committed. During compaction before we process the key, s1 is released. When compaction process v2, snapshot checker may return kSnapshotReleased, and the earliest visible snapshot for v2 become s2. When compaction process v1, snapshot checker may return kIsInSnapshot (for WritePrepared transaction, it could be because v1 is still in commit cache). The result will become inconsistent here.
The fix: remember the set of released snapshots ever reported by snapshot checker, and ignore them when finding result for findEarliestVisibleSnapshot.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4890
Differential Revision: D13705538
Pulled By: maysamyabandeh
fbshipit-source-id: e577f0d9ee1ff5a6035f26859e56902ecc85a5a4
2019-01-19 01:20:13 +00:00
|
|
|
WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare(
|
2019-03-07 15:26:36 +00:00
|
|
|
wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_);
|
2017-12-01 07:39:56 +00:00
|
|
|
WriteBatch empty_batch;
|
2020-10-21 21:02:00 +00:00
|
|
|
s = empty_batch.PutLogData(Slice());
|
|
|
|
assert(s.ok());
|
2017-12-01 07:39:56 +00:00
|
|
|
// In the absence of Prepare markers, use Noop as a batch separator
|
2020-10-21 21:02:00 +00:00
|
|
|
s = WriteBatchInternal::InsertNoop(&empty_batch);
|
|
|
|
assert(s.ok());
|
2017-12-01 07:39:56 +00:00
|
|
|
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
2018-03-22 21:27:44 +00:00
|
|
|
NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
2017-12-18 16:03:18 +00:00
|
|
|
&update_commit_map_with_prepare);
|
2018-01-09 16:47:46 +00:00
|
|
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
WritePrepared: fix two versions in compaction see different status for released snapshots (#4890)
Summary:
Fix how CompactionIterator::findEarliestVisibleSnapshots handles released snapshot. It fixing the two scenarios:
Scenario 1:
key1 has two values v1 and v2. There're two snapshots s1 and s2 taken after v1 and v2 are committed. Right after compaction output v2, s1 is released. Now findEarliestVisibleSnapshot may see s1 being released, and return the next snapshot, which is s2. That's larger than v2's earliest visible snapshot, which was s1.
The fix: the only place we check against last snapshot and current key snapshot is when we decide whether to compact out a value if it is hidden by a later value. In the check if we see current snapshot is even larger than last snapshot, we know last snapshot is released, and we are safe to compact out current key.
Scenario 2:
key1 has two values v1 and v2. there are two snapshots s1 and s2 taken after v1 and v2 are committed. During compaction before we process the key, s1 is released. When compaction process v2, snapshot checker may return kSnapshotReleased, and the earliest visible snapshot for v2 become s2. When compaction process v1, snapshot checker may return kIsInSnapshot (for WritePrepared transaction, it could be because v1 is still in commit cache). The result will become inconsistent here.
The fix: remember the set of released snapshots ever reported by snapshot checker, and ignore them when finding result for findEarliestVisibleSnapshot.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4890
Differential Revision: D13705538
Pulled By: maysamyabandeh
fbshipit-source-id: e577f0d9ee1ff5a6035f26859e56902ecc85a5a4
2019-01-19 01:20:13 +00:00
|
|
|
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
|
|
|
"RollbackInternal (status=%s) commit: %" PRIu64,
|
|
|
|
s.ToString().c_str(), GetId());
|
2019-06-10 18:47:16 +00:00
|
|
|
// TODO(lth): For WriteUnPrepared that rollback is called frequently,
|
|
|
|
// RemovePrepared could be moved to the callback to reduce lock contention.
|
2018-01-09 16:47:46 +00:00
|
|
|
if (s.ok()) {
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
|
2018-01-09 16:47:46 +00:00
|
|
|
}
|
2019-06-10 18:47:16 +00:00
|
|
|
// Note: RemovePrepared for prepared batch is called from within
|
|
|
|
// PreReleaseCallback
|
2019-03-07 15:26:36 +00:00
|
|
|
wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH);
|
2017-10-03 02:46:42 +00:00
|
|
|
|
|
|
|
return s;
|
2017-08-03 15:46:47 +00:00
|
|
|
}
|
|
|
|
|
2017-11-02 01:56:25 +00:00
|
|
|
// Checks `key` in `column_family` for conflicts relative to this transaction's
// snapshot.
//
// `column_family` may be null, in which case the DB's default column family is
// used. `tracked_at_seq` is an in/out parameter: on entry it is either the max
// sequence number or the sequence number of the last snapshot with which the
// key was tracked; when a check is actually performed it is overwritten with
// the current snapshot's sequence number before the check runs.
//
// Returns OK immediately if the key was already validated at or before the
// current snapshot; otherwise returns the result of
// TransactionUtil::CheckKeyForConflicts.
Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
                                          const Slice& key,
                                          SequenceNumber* tracked_at_seq) {
  assert(snapshot_);

  const SnapshotImpl* snapshot_impl =
      static_cast_with_check<const SnapshotImpl>(snapshot_.get());
  SequenceNumber min_uncommitted = snapshot_impl->min_uncommitted_;
  SequenceNumber snapshot_seq = snapshot_->GetSequenceNumber();

  // tracked_at_seq is either max or the last snapshot with which this key was
  // tracked, so there is no need to apply IsInSnapshot to this comparison:
  // tracked_at_seq is not a prepare seq.
  const bool already_validated = *tracked_at_seq <= snapshot_seq;
  if (already_validated) {
    // The key has previously been validated at a sequence number no later
    // than the current snapshot's sequence number, so we already know it has
    // not been modified.
    return Status::OK();
  }

  *tracked_at_seq = snapshot_seq;

  // Fall back to the default column family when none was supplied.
  ColumnFamilyHandle* target_cf = column_family;
  if (target_cf == nullptr) {
    target_cf = db_impl_->DefaultColumnFamily();
  }

  WritePreparedTxnReadCallback snapshot_callback(
      wpt_db_, snapshot_seq, min_uncommitted, kBackedByDBSnapshot);
  // TODO(yanqin): support user-defined timestamp
  return TransactionUtil::CheckKeyForConflicts(
      db_impl_, target_cf, key.ToString(), snapshot_seq, /*ts=*/nullptr,
      false /* cache_only */, &snapshot_callback, min_uncommitted);
}
|
|
|
|
|
2018-04-03 03:19:21 +00:00
|
|
|
void WritePreparedTxn::SetSnapshot() {
|
2019-01-16 02:07:50 +00:00
|
|
|
const bool kForWWConflictCheck = true;
|
|
|
|
SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck);
|
2018-04-03 03:19:21 +00:00
|
|
|
SetSnapshotInternal(snapshot);
|
|
|
|
}
|
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
// Rebuilds the transaction from `src_batch` using the base-class
// implementation, then refreshes prepare_batch_cnt_ from the rebuilt write
// batch's sub-batch count. The count is refreshed regardless of the status
// returned by the base class, and that status is returned unchanged.
Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) {
  Status rebuild_status =
      PessimisticTransaction::RebuildFromWriteBatch(src_batch);
  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
  return rebuild_status;
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|