rocksdb/utilities/transactions/write_prepared_txn.h

119 lines
4.4 KiB
C
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <algorithm>
#include <atomic>
#include <mutex>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#include "db/write_callback.h"
#include "rocksdb/db.h"
#include "rocksdb/slice.h"
#include "rocksdb/snapshot.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/autovector.h"
#include "utilities/transactions/pessimistic_transaction.h"
#include "utilities/transactions/pessimistic_transaction_db.h"
#include "utilities/transactions/transaction_base.h"
#include "utilities/transactions/transaction_util.h"
namespace ROCKSDB_NAMESPACE {
class WritePreparedTxnDB;
// This impl could write to DB also uncommitted data and then later tell apart
// committed data from uncommitted data. Uncommitted data could be after the
// Prepare phase in 2PC (WritePreparedTxn) or before that
// (WriteUnpreparedTxnImpl).
class WritePreparedTxn : public PessimisticTransaction {
public:
WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,
const TransactionOptions& txn_options);
// No copying allowed
WritePreparedTxn(const WritePreparedTxn&) = delete;
void operator=(const WritePreparedTxn&) = delete;
virtual ~WritePreparedTxn() {}
// To make WAL commit markers visible, the snapshot will be based on the last
// seq in the WAL that is also published, LastPublishedSequence, as opposed to
// the last seq in the memtable.
using Transaction::Get;
Status Get(const ReadOptions& _read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) override;
using Transaction::MultiGet;
void MultiGet(const ReadOptions& _read_options,
ColumnFamilyHandle* column_family, const size_t num_keys,
const Slice* keys, PinnableSlice* values, Status* statuses,
const bool sorted_input = false) override;
WriteUnPrepared: less virtual in iterator callback (#5049) Summary: WriteUnPrepared adds a virtual function, MaxUnpreparedSequenceNumber, to ReadCallback, which returns 0 unless WriteUnPrepared is enabled and the transaction has uncommitted data written to the DB. Together with snapshot sequence number, this determines the last sequence that is visible to reads. The patch clarifies the guarantees of the GetIterator API in WriteUnPrepared transactions and make use of that to statically initialize the read callback and thus avoid the virtual call. Furthermore it increases the minimum value for min_uncommitted from 0 to 1 as seq 0 is used only for last level keys that are committed in all snapshots. The following benchmark shows +0.26% higher throughput in seekrandom benchmark. Benchmark: ./db_bench --benchmarks=fillrandom --use_existing_db=0 --num=1000000 --db=/dev/shm/dbbench ./db_bench --benchmarks=seekrandom[X10] --use_existing_db=1 --db=/dev/shm/dbbench --num=1000000 --duration=60 --seek_nexts=100 seekrandom [AVG 10 runs] : 20355 ops/sec; 225.2 MB/sec seekrandom [MEDIAN 10 runs] : 20425 ops/sec; 225.9 MB/sec ./db_bench_lessvirtual3 --benchmarks=seekrandom[X10] --use_existing_db=1 --db=/dev/shm/dbbench --num=1000000 --duration=60 --seek_nexts=100 seekrandom [AVG 10 runs] : 20409 ops/sec; 225.8 MB/sec seekrandom [MEDIAN 10 runs] : 20487 ops/sec; 226.6 MB/sec Pull Request resolved: https://github.com/facebook/rocksdb/pull/5049 Differential Revision: D14366459 Pulled By: maysamyabandeh fbshipit-source-id: ebaff8908332a5ae9af7defeadabcb624be660ef
2019-04-02 21:43:03 +00:00
// Note: The behavior is undefined in presence of interleaved writes to the
// same transaction.
// To make WAL commit markers visible, the snapshot will be
// based on the last seq in the WAL that is also published,
// LastPublishedSequence, as opposed to the last seq in the memtable.
using Transaction::GetIterator;
Iterator* GetIterator(const ReadOptions& options) override;
Iterator* GetIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) override;
void SetSnapshot() override;
protected:
void Initialize(const TransactionOptions& txn_options) override;
// Override the protected SetId to make it visible to the friend class
// WritePreparedTxnDB
inline void SetId(uint64_t id) override { Transaction::SetId(id); }
private:
friend class WritePreparedTransactionTest_BasicRecoveryTest_Test;
friend class WritePreparedTxnDB;
friend class WriteUnpreparedTxnDB;
friend class WriteUnpreparedTxn;
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
using Transaction::GetImpl;
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* value) override;
Status PrepareInternal() override;
Status CommitWithoutPrepareInternal() override;
Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
// Since the data is already written to memtables at the Prepare phase, the
// commit entails writing only a commit marker in the WAL. The sequence number
// of the commit marker is then the commit timestamp of the transaction. To
// make WAL commit markers visible, the snapshot will be based on the last seq
// in the WAL that is also published, LastPublishedSequence, as opposed to the
// last seq in the memtable.
Status CommitInternal() override;
Status RollbackInternal() override;
Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key,
SequenceNumber* tracked_at_seq) override;
Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
WritePreparedTxnDB* wpt_db_;
// Number of sub-batches in prepare
size_t prepare_batch_cnt_ = 0;
};
} // namespace ROCKSDB_NAMESPACE