mirror of https://github.com/facebook/rocksdb.git
323 lines
13 KiB
C++
323 lines
13 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include <set>
|
|
|
|
#include "utilities/transactions/write_prepared_txn.h"
|
|
#include "utilities/transactions/write_unprepared_txn_db.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
class WriteUnpreparedTxnDB;
|
|
class WriteUnpreparedTxn;
|
|
|
|
// WriteUnprepared transactions needs to be able to read their own uncommitted
|
|
// writes, and supporting this requires some careful consideration. Because
|
|
// writes in the current transaction may be flushed to DB already, we cannot
|
|
// rely on the contents of WriteBatchWithIndex to determine whether a key should
|
|
// be visible or not, so we have to remember to check the DB for any uncommitted
|
|
// keys that should be visible to us. First, we will need to change the seek to
|
|
// snapshot logic, to seek to max_visible_seq = max(snap_seq, max_unprep_seq).
|
|
// Any key greater than max_visible_seq should not be visible because they
|
|
// cannot be unprepared by the current transaction and they are not in its
|
|
// snapshot.
|
|
//
|
|
// When we seek to max_visible_seq, one of these cases will happen:
|
|
// 1. We hit a unprepared key from the current transaction.
|
|
// 2. We hit a unprepared key from the another transaction.
|
|
// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
|
|
// 4. We hit a committed key with seq <= snap_seq.
|
|
//
|
|
// IsVisibleFullCheck handles all cases correctly.
|
|
//
|
|
// Other notes:
|
|
// Note that max_visible_seq is only calculated once at iterator construction
|
|
// time, meaning if the same transaction is adding more unprep seqs through
|
|
// writes during iteration, these newer writes may not be visible. This is not a
|
|
// problem for MySQL though because it avoids modifying the index as it is
|
|
// scanning through it to avoid the Halloween Problem. Instead, it scans the
|
|
// index once up front, and modifies based on a temporary copy.
|
|
//
|
|
// In DBIter, there is a "reseek" optimization if the iterator skips over too
|
|
// many keys. However, this assumes that the reseek seeks exactly to the
|
|
// required key. In write unprepared, even after seeking directly to
|
|
// max_visible_seq, some iteration may be required before hitting a visible key,
|
|
// and special precautions must be taken to avoid performing another reseek,
|
|
// leading to an infinite loop.
|
|
//
|
|
class WriteUnpreparedTxnReadCallback : public ReadCallback {
|
|
public:
|
|
WriteUnpreparedTxnReadCallback(
|
|
WritePreparedTxnDB* db, SequenceNumber snapshot,
|
|
SequenceNumber min_uncommitted,
|
|
const std::map<SequenceNumber, size_t>& unprep_seqs,
|
|
SnapshotBackup backed_by_snapshot)
|
|
// Pass our last uncommitted seq as the snapshot to the parent class to
|
|
// ensure that the parent will not prematurely filter out own writes. We
|
|
// will do the exact comparison against snapshots in IsVisibleFullCheck
|
|
// override.
|
|
: ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted),
|
|
db_(db),
|
|
unprep_seqs_(unprep_seqs),
|
|
wup_snapshot_(snapshot),
|
|
backed_by_snapshot_(backed_by_snapshot) {
|
|
(void)backed_by_snapshot_; // to silence unused private field warning
|
|
}
|
|
|
|
virtual ~WriteUnpreparedTxnReadCallback() {
|
|
// If it is not backed by snapshot, the caller must check validity
|
|
assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
|
|
}
|
|
|
|
virtual bool IsVisibleFullCheck(SequenceNumber seq) override;
|
|
|
|
inline bool valid() {
|
|
valid_checked_ = true;
|
|
return snap_released_ == false;
|
|
}
|
|
|
|
void Refresh(SequenceNumber seq) override {
|
|
max_visible_seq_ = std::max(max_visible_seq_, seq);
|
|
wup_snapshot_ = seq;
|
|
}
|
|
|
|
static SequenceNumber CalcMaxVisibleSeq(
|
|
const std::map<SequenceNumber, size_t>& unprep_seqs,
|
|
SequenceNumber snapshot_seq) {
|
|
SequenceNumber max_unprepared = 0;
|
|
if (unprep_seqs.size()) {
|
|
max_unprepared =
|
|
unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
|
|
}
|
|
return std::max(max_unprepared, snapshot_seq);
|
|
}
|
|
|
|
private:
|
|
WritePreparedTxnDB* db_;
|
|
const std::map<SequenceNumber, size_t>& unprep_seqs_;
|
|
SequenceNumber wup_snapshot_;
|
|
// Whether max_visible_seq_ is backed by a snapshot
|
|
const SnapshotBackup backed_by_snapshot_;
|
|
bool snap_released_ = false;
|
|
// Safety check to ensure that the caller has checked invalid statuses
|
|
bool valid_checked_ = false;
|
|
};
|
|
|
|
class WriteUnpreparedTxn : public WritePreparedTxn {
|
|
public:
|
|
WriteUnpreparedTxn(WriteUnpreparedTxnDB* db,
|
|
const WriteOptions& write_options,
|
|
const TransactionOptions& txn_options);
|
|
|
|
virtual ~WriteUnpreparedTxn();
|
|
|
|
using TransactionBaseImpl::Put;
|
|
virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|
const Slice& value,
|
|
const bool assume_tracked = false) override;
|
|
virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
|
const SliceParts& value,
|
|
const bool assume_tracked = false) override;
|
|
|
|
using TransactionBaseImpl::Merge;
|
|
virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|
const Slice& value,
|
|
const bool assume_tracked = false) override;
|
|
|
|
using TransactionBaseImpl::Delete;
|
|
virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
|
const bool assume_tracked = false) override;
|
|
virtual Status Delete(ColumnFamilyHandle* column_family,
|
|
const SliceParts& key,
|
|
const bool assume_tracked = false) override;
|
|
|
|
using TransactionBaseImpl::SingleDelete;
|
|
virtual Status SingleDelete(ColumnFamilyHandle* column_family,
|
|
const Slice& key,
|
|
const bool assume_tracked = false) override;
|
|
virtual Status SingleDelete(ColumnFamilyHandle* column_family,
|
|
const SliceParts& key,
|
|
const bool assume_tracked = false) override;
|
|
|
|
// In WriteUnprepared, untracked writes will break snapshot validation logic.
|
|
// Snapshot validation will only check the largest sequence number of a key to
|
|
// see if it was committed or not. However, an untracked unprepared write will
|
|
// hide smaller committed sequence numbers.
|
|
//
|
|
// TODO(lth): Investigate whether it is worth having snapshot validation
|
|
// validate all values larger than snap_seq. Otherwise, we should return
|
|
// Status::NotSupported for untracked writes.
|
|
|
|
virtual Status RebuildFromWriteBatch(WriteBatch*) override;
|
|
|
|
virtual uint64_t GetLastLogNumber() const override {
|
|
return last_log_number_;
|
|
}
|
|
|
|
void RemoveActiveIterator(Iterator* iter) {
|
|
active_iterators_.erase(
|
|
std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
|
|
active_iterators_.end());
|
|
}
|
|
|
|
protected:
|
|
void Initialize(const TransactionOptions& txn_options) override;
|
|
|
|
Status PrepareInternal() override;
|
|
|
|
Status CommitWithoutPrepareInternal() override;
|
|
Status CommitInternal() override;
|
|
|
|
Status RollbackInternal() override;
|
|
|
|
void Clear() override;
|
|
|
|
void SetSavePoint() override;
|
|
Status RollbackToSavePoint() override;
|
|
Status PopSavePoint() override;
|
|
|
|
// Get and GetIterator needs to be overridden so that a ReadCallback to
|
|
// handle read-your-own-write is used.
|
|
using Transaction::Get;
|
|
virtual Status Get(const ReadOptions& options,
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
PinnableSlice* value) override;
|
|
|
|
using Transaction::MultiGet;
|
|
virtual void MultiGet(const ReadOptions& options,
|
|
ColumnFamilyHandle* column_family,
|
|
const size_t num_keys, const Slice* keys,
|
|
PinnableSlice* values, Status* statuses,
|
|
bool sorted_input = false) override;
|
|
|
|
using Transaction::GetIterator;
|
|
virtual Iterator* GetIterator(const ReadOptions& options) override;
|
|
virtual Iterator* GetIterator(const ReadOptions& options,
|
|
ColumnFamilyHandle* column_family) override;
|
|
|
|
virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
|
|
const Slice& key,
|
|
SequenceNumber* tracked_at_seq) override;
|
|
|
|
private:
|
|
friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
|
|
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
|
|
friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
|
|
friend class WriteUnpreparedTxnDB;
|
|
|
|
const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
|
|
|
|
Status MaybeFlushWriteBatchToDB();
|
|
Status FlushWriteBatchToDB(bool prepared);
|
|
Status FlushWriteBatchToDBInternal(bool prepared);
|
|
Status FlushWriteBatchWithSavePointToDB();
|
|
Status RollbackToSavePointInternal();
|
|
Status HandleWrite(std::function<Status()> do_write);
|
|
|
|
// For write unprepared, we check on every writebatch append to see if
|
|
// write_batch_flush_threshold_ has been exceeded, and then call
|
|
// FlushWriteBatchToDB if so. This logic is encapsulated in
|
|
// MaybeFlushWriteBatchToDB.
|
|
int64_t write_batch_flush_threshold_;
|
|
WriteUnpreparedTxnDB* wupt_db_;
|
|
|
|
// Ordered list of unprep_seq sequence numbers that we have already written
|
|
// to DB.
|
|
//
|
|
// This maps unprep_seq => prepare_batch_cnt for each unprepared batch
|
|
// written by this transaction.
|
|
//
|
|
// Note that this contains both prepared and unprepared batches, since they
|
|
// are treated similarily in prepare heap/commit map, so it simplifies the
|
|
// commit callbacks.
|
|
std::map<SequenceNumber, size_t> unprep_seqs_;
|
|
|
|
uint64_t last_log_number_;
|
|
|
|
// Recovered transactions have tracked_keys_ populated, but are not actually
|
|
// locked for efficiency reasons. For recovered transactions, skip unlocking
|
|
// keys when transaction ends.
|
|
bool recovered_txn_;
|
|
|
|
// Track the largest sequence number at which we performed snapshot
|
|
// validation. If snapshot validation was skipped because no snapshot was set,
|
|
// then this is set to GetLastPublishedSequence. This value is useful because
|
|
// it means that for keys that have unprepared seqnos, we can guarantee that
|
|
// no committed keys by other transactions can exist between
|
|
// largest_validated_seq_ and max_unprep_seq. See
|
|
// WriteUnpreparedTxnDB::NewIterator for an explanation for why this is
|
|
// necessary for iterator Prev().
|
|
//
|
|
// Currently this value only increases during the lifetime of a transaction,
|
|
// but in some cases, we should be able to restore the previously largest
|
|
// value when calling RollbackToSavepoint.
|
|
SequenceNumber largest_validated_seq_;
|
|
|
|
struct SavePoint {
|
|
// Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
|
|
// used during RollbackToSavepoint to determine visibility when restoring
|
|
// old values.
|
|
//
|
|
// TODO(lth): Since all unprep_seqs_ sets further down the stack must be
|
|
// subsets, this can potentially be deduplicated by just storing set
|
|
// difference. Investigate if this is worth it.
|
|
std::map<SequenceNumber, size_t> unprep_seqs_;
|
|
|
|
// This snapshot will be used to read keys at this savepoint if we call
|
|
// RollbackToSavePoint.
|
|
std::unique_ptr<ManagedSnapshot> snapshot_;
|
|
|
|
SavePoint(const std::map<SequenceNumber, size_t>& seqs,
|
|
ManagedSnapshot* snapshot)
|
|
: unprep_seqs_(seqs), snapshot_(snapshot){};
|
|
};
|
|
|
|
// We have 3 data structures holding savepoint information:
|
|
// 1. TransactionBaseImpl::save_points_
|
|
// 2. WriteUnpreparedTxn::flushed_save_points_
|
|
// 3. WriteUnpreparecTxn::unflushed_save_points_
|
|
//
|
|
// TransactionBaseImpl::save_points_ holds information about all write
|
|
// batches, including the current in-memory write_batch_, or unprepared
|
|
// batches that have been written out. Its responsibility is just to track
|
|
// which keys have been modified in every savepoint.
|
|
//
|
|
// WriteUnpreparedTxn::flushed_save_points_ holds information about savepoints
|
|
// set on unprepared batches that have already flushed. It holds the snapshot
|
|
// and unprep_seqs at that savepoint, so that the rollback process can
|
|
// determine which keys were visible at that point in time.
|
|
//
|
|
// WriteUnpreparecTxn::unflushed_save_points_ holds information about
|
|
// savepoints on the current in-memory write_batch_. It simply records the
|
|
// size of the write batch at every savepoint.
|
|
//
|
|
// TODO(lth): Remove the redundancy between save_point_boundaries_ and
|
|
// write_batch_.save_points_.
|
|
//
|
|
// Based on this information, here are some invariants:
|
|
// size(unflushed_save_points_) = size(write_batch_.save_points_)
|
|
// size(flushed_save_points_) + size(unflushed_save_points_)
|
|
// = size(save_points_)
|
|
//
|
|
std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
|
|
flushed_save_points_;
|
|
std::unique_ptr<autovector<size_t>> unflushed_save_points_;
|
|
|
|
// It is currently unsafe to flush a write batch if there are active iterators
|
|
// created from this transaction. This is because we use WriteBatchWithIndex
|
|
// to do merging reads from the DB and the write batch. If we flush the write
|
|
// batch, it is possible that the delta iterator on the iterator will point to
|
|
// invalid memory.
|
|
std::vector<Iterator*> active_iterators_;
|
|
};
|
|
|
|
} // namespace rocksdb
|
|
|
|
#endif // ROCKSDB_LITE
|