2017-08-03 15:46:47 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
#include "utilities/transactions/write_prepared_txn.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2019-06-06 20:52:39 +00:00
|
|
|
#include <cinttypes>
|
2017-08-03 15:46:47 +00:00
|
|
|
#include <map>
|
2018-02-06 02:32:54 +00:00
|
|
|
#include <set>
|
2017-08-03 15:46:47 +00:00
|
|
|
|
|
|
|
#include "db/column_family.h"
|
2019-05-31 18:52:59 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/utilities/transaction_db.h"
|
2018-04-03 03:19:21 +00:00
|
|
|
#include "util/cast_util.h"
|
2017-08-07 23:07:40 +00:00
|
|
|
#include "utilities/transactions/pessimistic_transaction.h"
|
2017-11-02 18:05:55 +00:00
|
|
|
#include "utilities/transactions/write_prepared_txn_db.h"
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2017-08-03 15:46:47 +00:00
|
|
|
|
|
|
|
struct WriteOptions;
|
|
|
|
|
2017-08-16 23:49:11 +00:00
|
|
|
// Builds a write-prepared transaction on top of PessimisticTransaction and
// remembers the owning WritePreparedTxnDB in wpt_db_.
WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
                                   const WriteOptions& write_options,
                                   const TransactionOptions& txn_options)
    : PessimisticTransaction(txn_db, write_options, txn_options, false),
      wpt_db_(txn_db) {
  // Initialize() is invoked from this constructor body rather than from the
  // PessimisticTransaction constructor: while the base class is still being
  // constructed, the functions overridden by WritePreparedTxn are not yet
  // defined and would therefore be skipped.
  Initialize(txn_options);
}
|
2017-08-03 15:46:47 +00:00
|
|
|
|
2018-07-24 07:09:18 +00:00
|
|
|
// (Re)initializes the transaction: delegates to the base-class Initialize()
// and then resets the count of sub-batches in the prepare batch to zero.
void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) {
  PessimisticTransaction::Initialize(txn_options);
  // No prepare batch has been written yet for this (re)initialized txn.
  prepare_batch_cnt_ = 0;
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
void WritePreparedTxn::MultiGet(const ReadOptions& _read_options,
|
2019-07-30 00:51:30 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const size_t num_keys, const Slice* keys,
|
|
|
|
PinnableSlice* values, Status* statuses,
|
2019-11-27 00:55:46 +00:00
|
|
|
const bool sorted_input) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kMultiGet) {
|
|
|
|
Status s = Status::InvalidArgument(
|
|
|
|
"Can only call MultiGet with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`");
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
if (statuses[i].ok()) {
|
|
|
|
statuses[i] = s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kMultiGet;
|
|
|
|
}
|
|
|
|
|
2019-07-30 00:51:30 +00:00
|
|
|
SequenceNumber min_uncommitted, snap_seq;
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(
|
|
|
|
read_options.snapshot, &min_uncommitted, &snap_seq);
|
2019-08-05 20:30:56 +00:00
|
|
|
WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
|
|
|
|
backed_by_snapshot);
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family,
|
|
|
|
num_keys, keys, values, statuses,
|
|
|
|
sorted_input, &callback);
|
2019-08-05 20:30:56 +00:00
|
|
|
if (UNLIKELY(!callback.valid() ||
|
|
|
|
!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
|
|
|
|
wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
|
2019-07-30 00:51:30 +00:00
|
|
|
for (size_t i = 0; i < num_keys; i++) {
|
|
|
|
statuses[i] = Status::TryAgain();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
Status WritePreparedTxn::Get(const ReadOptions& _read_options,
|
2017-09-11 15:58:52 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, PinnableSlice* pinnable_val) {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
|
|
_read_options.io_activity != Env::IOActivity::kGet) {
|
2023-04-21 16:07:18 +00:00
|
|
|
return Status::InvalidArgument(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
"Can only call Get with `ReadOptions::io_activity` is "
|
|
|
|
"`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`");
|
|
|
|
}
|
|
|
|
ReadOptions read_options(_read_options);
|
|
|
|
if (read_options.io_activity == Env::IOActivity::kUnknown) {
|
|
|
|
read_options.io_activity = Env::IOActivity::kGet;
|
2023-04-21 16:07:18 +00:00
|
|
|
}
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
|
|
|
|
return GetImpl(read_options, column_family, key, pinnable_val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Point lookup through this transaction's write batch and the underlying DB,
// without any io_activity validation.
//
// Returns the status of the read as is when it is not OK. When the read is OK
// but the read callback is invalid or the snapshot fails validation,
// TXN_GET_TRY_AGAIN is ticked and TryAgain is returned instead.
Status WritePreparedTxn::GetImpl(const ReadOptions& options,
                                 ColumnFamilyHandle* column_family,
                                 const Slice& key,
                                 PinnableSlice* pinnable_val) {
  SequenceNumber min_uncommitted;
  SequenceNumber snap_seq;
  const SnapshotBackup backed_by_snapshot =
      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
                                        backed_by_snapshot);
  Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
                                              pinnable_val, &callback);
  // NOTE: validity of callback must always be checked before it is
  // destructed, so read it unconditionally right after the lookup.
  const bool callback_valid = callback.valid();

  if (!res.ok()) {
    return res;
  }
  if (LIKELY(callback_valid &&
             wpt_db_->ValidateSnapshot(callback.max_visible_seq(),
                                       backed_by_snapshot))) {
    return res;
  }

  wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
  res = Status::TryAgain();
  return res;
}
|
|
|
|
|
2017-10-10 00:05:34 +00:00
|
|
|
// Returns an iterator that layers this transaction's write batch over an
// iterator obtained from the WritePreparedTxnDB.
Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) {
  // The base iterator must come from WritePrepareTxnDB, not the root db.
  Iterator* const base_iter = wpt_db_->NewIterator(options);
  assert(base_iter);
  return write_batch_.NewIteratorWithBase(base_iter);
}
|
|
|
|
|
|
|
|
// Column-family variant: returns an iterator that layers this transaction's
// write batch for `column_family` over an iterator obtained from the
// WritePreparedTxnDB for the same column family.
Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
                                        ColumnFamilyHandle* column_family) {
  // The base iterator must come from WritePrepareTxnDB, not the root db.
  Iterator* const base_iter = wpt_db_->NewIterator(options, column_family);
  assert(base_iter);
  return write_batch_.NewIteratorWithBase(column_family, base_iter);
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Writes this transaction's batch as a prepare batch.
//
// Steps: append the end-prepare marker to the write batch, record the number
// of sub-batches in prepare_batch_cnt_, write the batch through
// DBImpl::WriteImpl() with an AddPreparedCallback attached, and set the
// transaction id to the sequence number reported by the write.
//
// Returns the status of marking the batch if that fails (in which case
// nothing is written and the id is left untouched), otherwise the status of
// the write.
Status WritePreparedTxn::PrepareInternal() {
  WriteOptions write_options = write_options_;
  // The prepare batch is always written with disableWAL turned off,
  // regardless of what the transaction's write options say.
  write_options.disableWAL = false;
  const bool WRITE_AFTER_COMMIT = true;
  const bool kFirstPrepareBatch = true;
  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
                                              name_, !WRITE_AFTER_COMMIT);
  assert(s.ok());
  // The assert above vanishes in NDEBUG builds; without this check a failure
  // to mark the batch would be silently dropped and then overwritten by the
  // result of WriteImpl() below.
  if (!s.ok()) {
    return s;
  }
  // For each duplicate key we account for a new sub-batch
  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
  // Having AddPrepared in the PreReleaseCallback allows in-order addition of
  // prepared entries to PreparedHeap and hence enables an optimization. Refer
  // to SmallestUnCommittedSeq for more details.
  AddPreparedCallback add_prepared_callback(
      wpt_db_, db_impl_, prepare_batch_cnt_,
      db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch);
  const bool DISABLE_MEMTABLE = true;
  uint64_t seq_used = kMaxSequenceNumber;
  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
                          /*callback*/ nullptr, &log_number_, /*log ref*/ 0,
                          !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_,
                          &add_prepared_callback);
  // A successful write must have reported the sequence number it used.
  assert(!s.ok() || seq_used != kMaxSequenceNumber);
  auto prepare_seq = seq_used;
  // The id is set even when the write failed, matching prior behavior.
  SetId(prepare_seq);
  return s;
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Commits the transaction's write batch directly, without a prior prepare
// step, by handing it to CommitBatchInternal().
Status WritePreparedTxn::CommitWithoutPrepareInternal() {
  // Every duplicate key is accounted for as a new sub-batch, so the batch
  // count is taken from SubBatchCnt().
  const size_t sub_batch_count = GetWriteBatch()->SubBatchCnt();
  WriteBatch* const batch_to_commit = GetWriteBatch()->GetWriteBatch();
  return CommitBatchInternal(batch_to_commit, sub_batch_count);
}
|
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
// Forwards `batch` (made of `batch_cnt` sub-batches) to
// WritePreparedTxnDB::WriteInternal using this transaction's write options,
// passing `this` as the owning transaction. Returns that call's status.
Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
                                             size_t batch_cnt) {
  Status write_status =
      wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this);
  return write_status;
}
|
|
|
|
|
2017-08-07 23:07:40 +00:00
|
|
|
// Commits a previously prepared transaction.
//
// Appends a Commit marker to the commit-time write batch and writes it via
// DBImpl::WriteImpl. The commit map is updated from a pre-release callback of
// that write (single-write path) or from a second, data-less write
// (two-write path).
//
// Returns InvalidArgument if the commit-time batch is non-empty while
// use_only_the_last_commit_time_batch_for_recovery_ is false; otherwise
// returns the status of the last WriteImpl call performed.
Status WritePreparedTxn::CommitInternal() {
  // Log the prepare sequence number. GetId() is the accessor used below to
  // obtain prepare_seq, so the logged value matches its "prepare_seq" label.
  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
                    "CommitInternal prepare_seq: %" PRIu64, GetId());
  // We take the commit-time batch and append the Commit marker.
  // The Memtable will ignore the Commit marker in non-recovery mode
  WriteBatch* working_batch = GetCommitTimeWriteBatch();
  // Must be evaluated before the Commit marker is appended below.
  const bool empty = working_batch->Count() == 0;
  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
  assert(s.ok());

  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
  if (!empty) {
    // When not writing to memtable, we can still cache the latest write batch.
    // The cached batch will be written to memtable in WriteRecoverableState
    // during FlushMemTable
    if (for_recovery) {
      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
    } else {
      return Status::InvalidArgument(
          "Commit-time-batch can only be used if "
          "use_only_the_last_commit_time_batch_for_recovery is true");
    }
  }

  auto prepare_seq = GetId();
  // NOTE: because of the early return above, execution only reaches this
  // point when the batch is empty or for_recovery is true, so includes_data
  // evaluates to false here. Consequently disable_memtable and do_one_write
  // are true and the two-write path further down is not taken. That code is
  // intentionally kept unchanged.
  const bool includes_data = !empty && !for_recovery;
  assert(prepare_batch_cnt_);
  size_t commit_batch_cnt = 0;
  if (UNLIKELY(includes_data)) {
    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
                   "Duplicate key overhead");
    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
    s = working_batch->Iterate(&counter);
    assert(s.ok());
    commit_batch_cnt = counter.BatchCount();
  }
  const bool disable_memtable = !includes_data;
  const bool do_one_write =
      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
  // This is to call AddPrepared on CommitTimeWriteBatch
  const bool kFirstPrepareBatch = true;
  AddPreparedCallback add_prepared_callback(
      wpt_db_, db_impl_, commit_batch_cnt,
      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
  // Single write: the commit map is updated from this write's pre-release
  // callback. Two writes: the first write only registers the commit-time
  // batch as prepared; the commit map is updated by the second write.
  PreReleaseCallback* pre_release_callback;
  if (do_one_write) {
    pre_release_callback = &update_commit_map;
  } else {
    pre_release_callback = &add_prepared_callback;
  }
  uint64_t seq_used = kMaxSequenceNumber;
  // Since the prepared batch is directly written to memtable, there is already
  // a connection between the memtable and its WAL, so there is no need to
  // redundantly reference the log that contains the prepared data.
  const uint64_t zero_log_number = 0ull;
  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
  // If `two_write_queues && includes_data`, then `do_one_write` is false. The
  // following `WriteImpl` will insert the data of the commit-time-batch into
  // the database before updating the commit cache. Therefore, the data of the
  // commit-time-batch is considered uncommitted. Furthermore, since data of
  // the commit-time-batch are not locked, it is possible for two uncommitted
  // versions of the same key to co-exist for a (short) period of time until
  // the commit cache is updated by the second write. If the two uncommitted
  // keys are compacted to the bottommost level in the meantime, it is possible
  // that compaction iterator will zero out the sequence numbers of both, thus
  // violating the invariant that an SST does not have two identical internal
  // keys. To prevent this situation, we should allow the usage of
  // commit-time-batch only if the user sets
  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to
  // true. See the comments about GetCommitTimeWriteBatch() in
  // include/rocksdb/utilities/transaction.h.
  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
                          zero_log_number, disable_memtable, &seq_used,
                          batch_cnt, pre_release_callback);
  assert(!s.ok() || seq_used != kMaxSequenceNumber);
  const SequenceNumber commit_batch_seq = seq_used;
  if (LIKELY(do_one_write || !s.ok())) {
    if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues &&
                 s.ok())) {
      // Note: RemovePrepared should be called after WriteImpl that published
      // the seq. Otherwise SmallestUnCommittedSeq optimization breaks.
      wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_);
    }  // else RemovePrepared is called from within PreReleaseCallback
    if (UNLIKELY(!do_one_write)) {
      assert(!s.ok());
      // Cleanup the prepared entry we added with add_prepared_callback
      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
    }
    return s;
  }  // else do the 2nd write to publish seq
  // Note: the 2nd write comes with a performance penalty. So if we have too
  // many of commits accompanied with CommitTimeWriteBatch and yet we cannot
  // enable use_only_the_last_commit_time_batch_for_recovery_ optimization,
  // two_write_queues should be disabled to avoid many additional writes here.
  const size_t kZeroData = 0;
  // Update commit map only from the 2nd queue
  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch(
      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData,
      commit_batch_seq, commit_batch_cnt);
  WriteBatch empty_batch;
  s = empty_batch.PutLogData(Slice());
  assert(s.ok());
  // In the absence of Prepare markers, use Noop as a batch separator
  s = WriteBatchInternal::InsertNoop(&empty_batch);
  assert(s.ok());
  const bool DISABLE_MEMTABLE = true;
  const size_t ONE_BATCH = 1;
  const uint64_t NO_REF_LOG = 0;
  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
                          &update_commit_map_with_aux_batch);
  assert(!s.ok() || seq_used != kMaxSequenceNumber);
  return s;
}
|
|
|
|
|
2017-10-03 02:46:42 +00:00
|
|
|
Status WritePreparedTxn::RollbackInternal() {
|
2018-01-09 16:47:46 +00:00
|
|
|
ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
|
|
|
|
"RollbackInternal prepare_seq: %" PRIu64, GetId());
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
|
|
|
|
assert(db_impl_);
|
|
|
|
assert(wpt_db_);
|
|
|
|
|
2022-06-17 06:10:07 +00:00
|
|
|
WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
write_options_.protection_bytes_per_key,
|
|
|
|
0 /* default_cf_ts_sz */);
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(GetId() != kMaxSequenceNumber);
|
|
|
|
assert(GetId() > 0);
|
2018-05-03 01:09:55 +00:00
|
|
|
auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap();
|
|
|
|
auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap();
|
2019-01-07 22:53:26 +00:00
|
|
|
auto read_at_seq = kMaxSequenceNumber;
|
2019-08-05 20:30:56 +00:00
|
|
|
ReadOptions roptions;
|
|
|
|
// to prevent the callback's seq from being overridden inside DBImpl::Get
|
|
|
|
roptions.snapshot = wpt_db_->GetMaxSnapshot();
|
2017-10-03 02:46:42 +00:00
|
|
|
struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
DBImpl* const db_;
|
|
|
|
WritePreparedTxnDB* const wpt_db_;
|
|
|
|
WritePreparedTxnReadCallback callback_;
|
2017-10-03 02:46:42 +00:00
|
|
|
WriteBatch* rollback_batch_;
|
2018-02-06 02:32:54 +00:00
|
|
|
std::map<uint32_t, const Comparator*>& comparators_;
|
2018-05-03 01:09:55 +00:00
|
|
|
std::map<uint32_t, ColumnFamilyHandle*>& handles_;
|
2018-02-06 02:32:54 +00:00
|
|
|
using CFKeys = std::set<Slice, SetComparator>;
|
|
|
|
std::map<uint32_t, CFKeys> keys_;
|
2018-04-12 18:52:15 +00:00
|
|
|
bool rollback_merge_operands_;
|
2019-08-05 20:30:56 +00:00
|
|
|
ReadOptions roptions_;
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
RollbackWriteBatchBuilder(
|
|
|
|
DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
|
|
|
|
WriteBatch* dst_batch,
|
2018-04-12 18:52:15 +00:00
|
|
|
std::map<uint32_t, const Comparator*>& comparators,
|
2018-05-03 01:09:55 +00:00
|
|
|
std::map<uint32_t, ColumnFamilyHandle*>& handles,
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
bool rollback_merge_operands, const ReadOptions& _roptions)
|
2018-02-06 02:32:54 +00:00
|
|
|
: db_(db),
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
wpt_db_(wpt_db),
|
|
|
|
callback_(wpt_db, snap_seq), // disable min_uncommitted optimization
|
2018-02-06 02:32:54 +00:00
|
|
|
rollback_batch_(dst_batch),
|
2018-04-12 18:52:15 +00:00
|
|
|
comparators_(comparators),
|
2018-05-03 01:09:55 +00:00
|
|
|
handles_(handles),
|
2019-08-05 20:30:56 +00:00
|
|
|
rollback_merge_operands_(rollback_merge_operands),
|
|
|
|
roptions_(_roptions) {}
|
2017-10-03 02:46:42 +00:00
|
|
|
|
|
|
|
Status Rollback(uint32_t cf, const Slice& key) {
|
2018-02-06 02:32:54 +00:00
|
|
|
Status s;
|
|
|
|
CFKeys& cf_keys = keys_[cf];
|
|
|
|
if (cf_keys.size() == 0) { // just inserted
|
|
|
|
auto cmp = comparators_[cf];
|
|
|
|
keys_[cf] = CFKeys(SetComparator(cmp));
|
|
|
|
}
|
|
|
|
auto it = cf_keys.insert(key);
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
// second is false if a element already existed.
|
|
|
|
if (it.second == false) {
|
2018-02-06 02:32:54 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-10-03 02:46:42 +00:00
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
bool not_used;
|
2018-05-03 01:09:55 +00:00
|
|
|
auto cf_handle = handles_[cf];
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
DBImpl::GetImplOptions get_impl_options;
|
|
|
|
get_impl_options.column_family = cf_handle;
|
|
|
|
get_impl_options.value = &pinnable_val;
|
|
|
|
get_impl_options.value_found = ¬_used;
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
get_impl_options.callback = &callback_;
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
s = db_->GetImpl(roptions_, key, get_impl_options);
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(s.ok() || s.IsNotFound());
|
|
|
|
if (s.ok()) {
|
|
|
|
s = rollback_batch_->Put(cf_handle, key, pinnable_val);
|
|
|
|
assert(s.ok());
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// There has been no readable value before txn. By adding a delete we
|
|
|
|
// make sure that there will be none afterwards either.
|
Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary:
This PR does not affect write-committed.
Add a member, `rollback_deletion_type_callback` to TransactionDBOptions
so that a write-prepared transaction, when rolling back, can call this
callback to decide if a `Delete` or `SingleDelete` should be used to
cancel a prior `Put` written to the database during prepare phase.
The purpose of this PR is to prevent mixing `Delete` and `SingleDelete`
for the same key, causing undefined behaviors. Without this PR, the
following can happen:
```
// The application always issues SingleDelete when deleting keys.
txn1->Put('a');
txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax
txn1->Rollback(); // inserts DELETE('a')
txn2->Put('a');
txn2->Commit(); // writes to memtable and potentially gets flushed/compacted
```
In the database, we may have
```
L0: [PUT('a', s=100)]
L1: [DELETE('a', s=90)]
Lmax: [PUT('a', s=0)]
```
If a compaction compacts L0 and L1, then we have
```
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
If a future transaction issues a SingleDelete, we have
```
L0: [SD('a', s=110)]
L1: [PUT('a', s=100)]
Lmax: [PUT('a', s=0)]
```
Then, a compaction including L0, L1 and Lmax leads to
```
Lmax: [PUT('a', s=0)]
```
which is incorrect.
Similar bugs reported and addressed in
https://github.com/cockroachdb/pebble/issues/1255. Based on our team's
current priority, we have decided to take this approach for now. We may
come back and revisit in the future.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D35762170
Pulled By: riversand963
fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
2022-04-21 01:57:32 +00:00
|
|
|
if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
|
|
|
|
s = rollback_batch_->SingleDelete(cf_handle, key);
|
|
|
|
} else {
|
|
|
|
s = rollback_batch_->Delete(cf_handle, key);
|
|
|
|
}
|
2017-10-03 02:46:42 +00:00
|
|
|
assert(s.ok());
|
|
|
|
} else {
|
|
|
|
// Unexpected status. Return it to the user.
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override {
|
2017-10-03 02:46:42 +00:00
|
|
|
return Rollback(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return Rollback(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return Rollback(cf, key);
|
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
// WriteBatch handler callback for a Merge record. The merge operand itself
// is ignored. When rollback_merge_operands_ is false the record is skipped
// and OK is returned; otherwise the key is handed to Rollback() like any
// other write.
Status MergeCF(uint32_t cf, const Slice& key,
               const Slice& /*val*/) override {
  if (!rollback_merge_operands_) {
    return Status::OK();
  }
  return Rollback(cf, key);
}
|
|
|
|
|
|
|
|
// Noop markers require no rollback work; accept them and continue iterating.
Status MarkNoop(bool) override {
  return Status::OK();
}
|
2018-07-07 00:17:36 +00:00
|
|
|
// Begin-prepare markers require no rollback work; accept them and continue
// iterating.
Status MarkBeginPrepare(bool) override {
  return Status::OK();
}
|
2017-10-03 02:46:42 +00:00
|
|
|
// End-prepare markers require no rollback work; the marker payload is
// ignored and OK is returned.
Status MarkEndPrepare(const Slice&) override {
  return Status::OK();
}
|
|
|
|
// Commit markers require no rollback work; the marker payload is ignored and
// OK is returned.
Status MarkCommit(const Slice&) override {
  return Status::OK();
}
|
|
|
|
// A rollback marker is rejected with InvalidArgument, which stops the batch
// iteration with an error.
// NOTE(review): presumably a batch that is being rolled back is never
// expected to contain a rollback marker itself -- confirm.
Status MarkRollback(const Slice&) override {
  return Status::InvalidArgument();
}
|
2017-11-11 19:23:43 +00:00
|
|
|
|
|
|
|
protected:
|
2022-04-28 21:42:00 +00:00
|
|
|
// Reports the write-after-commit option as disabled for this handler.
Handler::OptionState WriteAfterCommit() const override {
  return Handler::OptionState::kDisabled;
}
|
2019-01-07 22:53:26 +00:00
|
|
|
} rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
|
2018-05-03 01:09:55 +00:00
|
|
|
*cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
|
2019-08-05 20:30:56 +00:00
|
|
|
wpt_db_->txn_db_options_.rollback_merge_operands,
|
|
|
|
roptions);
|
2017-10-03 02:46:42 +00:00
|
|
|
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2017-11-15 16:19:57 +00:00
|
|
|
// The Rollback marker will be used as a batch separator
|
2020-10-21 21:02:00 +00:00
|
|
|
s = WriteBatchInternal::MarkRollback(&rollback_batch, name_);
|
|
|
|
assert(s.ok());
|
2017-12-18 16:03:18 +00:00
|
|
|
bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
|
2017-12-01 07:39:56 +00:00
|
|
|
const bool DISABLE_MEMTABLE = true;
|
2018-03-22 21:27:44 +00:00
|
|
|
const uint64_t NO_REF_LOG = 0;
|
2017-10-03 02:46:42 +00:00
|
|
|
uint64_t seq_used = kMaxSequenceNumber;
|
2018-02-06 02:32:54 +00:00
|
|
|
const size_t ONE_BATCH = 1;
|
2019-04-02 22:14:41 +00:00
|
|
|
const bool kFirstPrepareBatch = true;
|
2019-01-17 20:03:08 +00:00
|
|
|
// We commit the rolled back prepared batches. Although this is
|
2018-04-20 22:25:12 +00:00
|
|
|
// counter-intuitive, i) it is safe to do so, since the prepared batches are
|
|
|
|
// already canceled out by the rollback batch, ii) adding the commit entry to
|
|
|
|
// CommitCache will allow us to benefit from the existing mechanism in
|
|
|
|
// CommitCache that keeps an entry evicted due to max advance and yet overlaps
|
|
|
|
// with a live snapshot around so that the live snapshot properly skips the
|
|
|
|
// entry even if its prepare seq is lower than max_evicted_seq_.
|
2019-03-07 15:26:36 +00:00
|
|
|
AddPreparedCallback add_prepared_callback(
|
2019-04-02 22:14:41 +00:00
|
|
|
wpt_db_, db_impl_, ONE_BATCH,
|
|
|
|
db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
|
2017-12-18 16:03:18 +00:00
|
|
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH);
|
2019-03-07 15:26:36 +00:00
|
|
|
PreReleaseCallback* pre_release_callback;
|
|
|
|
if (do_one_write) {
|
|
|
|
pre_release_callback = &update_commit_map;
|
|
|
|
} else {
|
|
|
|
pre_release_callback = &add_prepared_callback;
|
|
|
|
}
|
2018-04-03 03:19:21 +00:00
|
|
|
// Note: the rollback batch does not need AddPrepared since it is written to
|
|
|
|
// DB in one shot. min_uncommitted still works since it requires capturing
|
|
|
|
// data that is written to DB but not yet committed, while
|
2019-01-17 20:03:08 +00:00
|
|
|
// the rollback batch commits with PreReleaseCallback.
|
2017-10-03 02:46:42 +00:00
|
|
|
s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
|
2018-03-22 21:27:44 +00:00
|
|
|
NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
2019-03-07 15:26:36 +00:00
|
|
|
pre_release_callback);
|
2018-01-09 16:47:46 +00:00
|
|
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
2017-12-01 07:39:56 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2017-12-18 16:03:18 +00:00
|
|
|
if (do_one_write) {
|
2019-06-10 18:47:16 +00:00
|
|
|
assert(!db_impl_->immutable_db_options().two_write_queues);
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
|
2017-12-18 16:03:18 +00:00
|
|
|
return s;
|
|
|
|
} // else do the 2nd write for commit
|
2019-03-07 15:26:36 +00:00
|
|
|
uint64_t rollback_seq = seq_used;
|
2018-01-09 16:47:46 +00:00
|
|
|
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
2019-03-07 15:26:36 +00:00
|
|
|
"RollbackInternal 2nd write rollback_seq: %" PRIu64,
|
|
|
|
rollback_seq);
|
2017-12-01 07:39:56 +00:00
|
|
|
// Commit the batch by writing an empty batch to the queue that will release
|
|
|
|
// the commit sequence number to readers.
|
WritePrepared: fix two versions in compaction see different status for released snapshots (#4890)
Summary:
Fix how CompactionIterator::findEarliestVisibleSnapshots handles released snapshots. It fixes the following two scenarios:
Scenario 1:
key1 has two values v1 and v2. There're two snapshots s1 and s2 taken after v1 and v2 are committed. Right after compaction output v2, s1 is released. Now findEarliestVisibleSnapshot may see s1 being released, and return the next snapshot, which is s2. That's larger than v2's earliest visible snapshot, which was s1.
The fix: the only place we check against last snapshot and current key snapshot is when we decide whether to compact out a value if it is hidden by a later value. In the check if we see current snapshot is even larger than last snapshot, we know last snapshot is released, and we are safe to compact out current key.
Scenario 2:
key1 has two values v1 and v2. There are two snapshots s1 and s2 taken after v1 and v2 are committed. During compaction, before we process the key, s1 is released. When compaction processes v2, the snapshot checker may return kSnapshotReleased, and the earliest visible snapshot for v2 becomes s2. When compaction processes v1, the snapshot checker may return kIsInSnapshot (for a WritePrepared transaction, this could be because v1 is still in the commit cache). The results are then inconsistent.
The fix: remember the set of released snapshots ever reported by snapshot checker, and ignore them when finding result for findEarliestVisibleSnapshot.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4890
Differential Revision: D13705538
Pulled By: maysamyabandeh
fbshipit-source-id: e577f0d9ee1ff5a6035f26859e56902ecc85a5a4
2019-01-19 01:20:13 +00:00
|
|
|
WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare(
|
2019-03-07 15:26:36 +00:00
|
|
|
wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_);
|
2017-12-01 07:39:56 +00:00
|
|
|
WriteBatch empty_batch;
|
2020-10-21 21:02:00 +00:00
|
|
|
s = empty_batch.PutLogData(Slice());
|
|
|
|
assert(s.ok());
|
2017-12-01 07:39:56 +00:00
|
|
|
// In the absence of Prepare markers, use Noop as a batch separator
|
2020-10-21 21:02:00 +00:00
|
|
|
s = WriteBatchInternal::InsertNoop(&empty_batch);
|
|
|
|
assert(s.ok());
|
2017-12-01 07:39:56 +00:00
|
|
|
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
2018-03-22 21:27:44 +00:00
|
|
|
NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
2017-12-18 16:03:18 +00:00
|
|
|
&update_commit_map_with_prepare);
|
2018-01-09 16:47:46 +00:00
|
|
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
WritePrepared: fix two versions in compaction see different status for released snapshots (#4890)
Summary:
Fix how CompactionIterator::findEarliestVisibleSnapshots handles released snapshots. It fixes the following two scenarios:
Scenario 1:
key1 has two values v1 and v2. There're two snapshots s1 and s2 taken after v1 and v2 are committed. Right after compaction output v2, s1 is released. Now findEarliestVisibleSnapshot may see s1 being released, and return the next snapshot, which is s2. That's larger than v2's earliest visible snapshot, which was s1.
The fix: the only place we check against last snapshot and current key snapshot is when we decide whether to compact out a value if it is hidden by a later value. In the check if we see current snapshot is even larger than last snapshot, we know last snapshot is released, and we are safe to compact out current key.
Scenario 2:
key1 has two values v1 and v2. There are two snapshots s1 and s2 taken after v1 and v2 are committed. During compaction, before we process the key, s1 is released. When compaction processes v2, the snapshot checker may return kSnapshotReleased, and the earliest visible snapshot for v2 becomes s2. When compaction processes v1, the snapshot checker may return kIsInSnapshot (for a WritePrepared transaction, this could be because v1 is still in the commit cache). The results are then inconsistent.
The fix: remember the set of released snapshots ever reported by snapshot checker, and ignore them when finding result for findEarliestVisibleSnapshot.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4890
Differential Revision: D13705538
Pulled By: maysamyabandeh
fbshipit-source-id: e577f0d9ee1ff5a6035f26859e56902ecc85a5a4
2019-01-19 01:20:13 +00:00
|
|
|
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
|
|
|
"RollbackInternal (status=%s) commit: %" PRIu64,
|
|
|
|
s.ToString().c_str(), GetId());
|
2019-06-10 18:47:16 +00:00
|
|
|
// TODO(lth): For WriteUnPrepared that rollback is called frequently,
|
|
|
|
// RemovePrepared could be moved to the callback to reduce lock contention.
|
2018-01-09 16:47:46 +00:00
|
|
|
if (s.ok()) {
|
2018-04-20 22:25:12 +00:00
|
|
|
wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
|
2018-01-09 16:47:46 +00:00
|
|
|
}
|
2019-06-10 18:47:16 +00:00
|
|
|
// Note: RemovePrepared for prepared batch is called from within
|
|
|
|
// PreReleaseCallback
|
2019-03-07 15:26:36 +00:00
|
|
|
wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH);
|
2017-10-03 02:46:42 +00:00
|
|
|
|
|
|
|
return s;
|
2017-08-03 15:46:47 +00:00
|
|
|
}
|
|
|
|
|
2017-11-02 01:56:25 +00:00
|
|
|
// Validates `key` against the transaction's current snapshot.
//
// column_family:  column family of the key; when null, the DB's default
//                 column family is used.
// key:            the key to validate.
// tracked_at_seq: in/out. On entry, the sequence number at which the key was
//                 last tracked. If it is not larger than the snapshot's
//                 sequence number, OK is returned immediately; otherwise it
//                 is lowered to the snapshot's sequence number before the
//                 conflict check runs.
//
// Returns OK on the fast path, or else whatever
// TransactionUtil::CheckKeyForConflicts reports.
// Requires a snapshot to be set (asserted).
Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
                                          const Slice& key,
                                          SequenceNumber* tracked_at_seq) {
  assert(snapshot_);

  const auto* snapshot_impl =
      static_cast_with_check<const SnapshotImpl>(snapshot_.get());
  const SequenceNumber min_uncommitted = snapshot_impl->min_uncommitted_;
  const SequenceNumber snapshot_seq = snapshot_->GetSequenceNumber();

  // tracked_at_seq is either the max sequence number or the sequence number
  // of the last snapshot with which this key was tracked. It is never a
  // prepare seq, so a plain comparison is enough here and IsInSnapshot does
  // not need to be applied.
  if (*tracked_at_seq <= snapshot_seq) {
    // The key was previously validated at a sequence number no later than the
    // current snapshot's, so it is already known to be unmodified.
    return Status::OK();
  }

  *tracked_at_seq = snapshot_seq;

  // Fall back to the default column family when none was supplied.
  ColumnFamilyHandle* target_cf = column_family;
  if (target_cf == nullptr) {
    target_cf = db_impl_->DefaultColumnFamily();
  }

  WritePreparedTxnReadCallback snapshot_checker(
      wpt_db_, snapshot_seq, min_uncommitted, kBackedByDBSnapshot);
  // TODO(yanqin): support user-defined timestamp
  return TransactionUtil::CheckKeyForConflicts(
      db_impl_, target_cf, key.ToString(), snapshot_seq, /*ts=*/nullptr,
      false /* cache_only */, &snapshot_checker, min_uncommitted);
}
|
|
|
|
|
2018-04-03 03:19:21 +00:00
|
|
|
void WritePreparedTxn::SetSnapshot() {
|
2019-01-16 02:07:50 +00:00
|
|
|
const bool kForWWConflictCheck = true;
|
|
|
|
SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck);
|
2018-04-03 03:19:21 +00:00
|
|
|
SetSnapshotInternal(snapshot);
|
|
|
|
}
|
|
|
|
|
2018-02-06 02:32:54 +00:00
|
|
|
// Rebuilds the transaction from `src_batch` using the base-class
// implementation, then refreshes prepare_batch_cnt_ from the sub-batch count
// of the transaction's write batch. The count is refreshed regardless of
// whether the base-class rebuild succeeded; the base-class status is
// returned unchanged.
Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) {
  Status rebuild_status =
      PessimisticTransaction::RebuildFromWriteBatch(src_batch);
  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
  return rebuild_status;
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|