mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 11:43:49 +00:00
c3d5c4d38a
Summary: This patch refactors TransactionImpl by separating the logic for pessimistic concurrency control from the implementation of how to write the data to rocksdb. The existing implementation is named WriteCommittedTxnImpl as it writes committed data to the db. A template named WritePreparedTxnImpl is also added which will be later completed to provide a an alternative implementation. Closes https://github.com/facebook/rocksdb/pull/2676 Differential Revision: D5549998 Pulled By: maysamyabandeh fbshipit-source-id: 16298e86b43ca4849324c1f35c731913c6d17bec
466 lines
15 KiB
C++
466 lines
15 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include "utilities/transactions/transaction_db_impl.h"
|
|
|
|
#include <string>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include "db/db_impl.h"
|
|
#include "rocksdb/db.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/utilities/transaction_db.h"
|
|
#include "util/cast_util.h"
|
|
#include "utilities/transactions/transaction_db_mutex_impl.h"
|
|
#include "utilities/transactions/transaction_impl.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
TransactionDBImpl::TransactionDBImpl(DB* db,
|
|
const TransactionDBOptions& txn_db_options)
|
|
: TransactionDB(db),
|
|
db_impl_(static_cast_with_check<DBImpl, DB>(db)),
|
|
txn_db_options_(txn_db_options),
|
|
lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks,
|
|
txn_db_options_.custom_mutex_factory
|
|
? txn_db_options_.custom_mutex_factory
|
|
: std::shared_ptr<TransactionDBMutexFactory>(
|
|
new TransactionDBMutexFactoryImpl())) {
|
|
assert(db_impl_ != nullptr);
|
|
}
|
|
|
|
// Support initiliazing TransactionDBImpl from a stackable db
|
|
//
|
|
// TransactionDBImpl
|
|
// ^ ^
|
|
// | |
|
|
// | +
|
|
// | StackableDB
|
|
// | ^
|
|
// | |
|
|
// + +
|
|
// DBImpl
|
|
// ^
|
|
// |(inherit)
|
|
// +
|
|
// DB
|
|
//
|
|
TransactionDBImpl::TransactionDBImpl(StackableDB* db,
|
|
const TransactionDBOptions& txn_db_options)
|
|
: TransactionDB(db),
|
|
db_impl_(static_cast_with_check<DBImpl, DB>(db->GetRootDB())),
|
|
txn_db_options_(txn_db_options),
|
|
lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks,
|
|
txn_db_options_.custom_mutex_factory
|
|
? txn_db_options_.custom_mutex_factory
|
|
: std::shared_ptr<TransactionDBMutexFactory>(
|
|
new TransactionDBMutexFactoryImpl())) {
|
|
assert(db_impl_ != nullptr);
|
|
}
|
|
|
|
TransactionDBImpl::~TransactionDBImpl() {
|
|
while (!transactions_.empty()) {
|
|
delete transactions_.begin()->second;
|
|
}
|
|
}
|
|
|
|
Status TransactionDBImpl::Initialize(
|
|
const std::vector<size_t>& compaction_enabled_cf_indices,
|
|
const std::vector<ColumnFamilyHandle*>& handles) {
|
|
for (auto cf_ptr : handles) {
|
|
AddColumnFamily(cf_ptr);
|
|
}
|
|
// Re-enable compaction for the column families that initially had
|
|
// compaction enabled.
|
|
std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
|
|
compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
|
|
for (auto index : compaction_enabled_cf_indices) {
|
|
compaction_enabled_cf_handles.push_back(handles[index]);
|
|
}
|
|
|
|
Status s = EnableAutoCompaction(compaction_enabled_cf_handles);
|
|
|
|
// create 'real' transactions from recovered shell transactions
|
|
auto dbimpl = reinterpret_cast<DBImpl*>(GetRootDB());
|
|
assert(dbimpl != nullptr);
|
|
auto rtrxs = dbimpl->recovered_transactions();
|
|
|
|
for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) {
|
|
auto recovered_trx = it->second;
|
|
assert(recovered_trx);
|
|
assert(recovered_trx->log_number_);
|
|
assert(recovered_trx->name_.length());
|
|
|
|
WriteOptions w_options;
|
|
w_options.sync = true;
|
|
TransactionOptions t_options;
|
|
|
|
Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
|
|
assert(real_trx);
|
|
real_trx->SetLogNumber(recovered_trx->log_number_);
|
|
|
|
s = real_trx->SetName(recovered_trx->name_);
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
|
|
s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_);
|
|
real_trx->SetState(Transaction::PREPARED);
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
}
|
|
if (s.ok()) {
|
|
dbimpl->DeleteAllRecoveredTransactions();
|
|
}
|
|
return s;
|
|
}
|
|
|
|
Transaction* TransactionDBImpl::BeginTransaction(
|
|
const WriteOptions& write_options, const TransactionOptions& txn_options,
|
|
Transaction* old_txn) {
|
|
if (old_txn != nullptr) {
|
|
ReinitializeTransaction(old_txn, write_options, txn_options);
|
|
return old_txn;
|
|
} else {
|
|
return new WriteCommittedTxnImpl(this, write_options, txn_options);
|
|
}
|
|
}
|
|
|
|
TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions(
|
|
const TransactionDBOptions& txn_db_options) {
|
|
TransactionDBOptions validated = txn_db_options;
|
|
|
|
if (txn_db_options.num_stripes == 0) {
|
|
validated.num_stripes = 1;
|
|
}
|
|
|
|
return validated;
|
|
}
|
|
|
|
Status TransactionDB::Open(const Options& options,
|
|
const TransactionDBOptions& txn_db_options,
|
|
const std::string& dbname, TransactionDB** dbptr) {
|
|
DBOptions db_options(options);
|
|
ColumnFamilyOptions cf_options(options);
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
column_families.push_back(
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
|
|
column_families, &handles, dbptr);
|
|
if (s.ok()) {
|
|
assert(handles.size() == 1);
|
|
// i can delete the handle since DBImpl is always holding a reference to
|
|
// default column family
|
|
delete handles[0];
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDB::Open(
|
|
const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
|
|
const std::string& dbname,
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
|
|
Status s;
|
|
DB* db;
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
|
|
std::vector<size_t> compaction_enabled_cf_indices;
|
|
DBOptions db_options_2pc = db_options;
|
|
PrepareWrap(&db_options_2pc, &column_families_copy,
|
|
&compaction_enabled_cf_indices);
|
|
s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db);
|
|
if (s.ok()) {
|
|
s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
|
|
dbptr);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
void TransactionDB::PrepareWrap(
|
|
DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* column_families,
|
|
std::vector<size_t>* compaction_enabled_cf_indices) {
|
|
compaction_enabled_cf_indices->clear();
|
|
|
|
// Enable MemTable History if not already enabled
|
|
for (size_t i = 0; i < column_families->size(); i++) {
|
|
ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
|
|
|
|
if (cf_options->max_write_buffer_number_to_maintain == 0) {
|
|
// Setting to -1 will set the History size to max_write_buffer_number.
|
|
cf_options->max_write_buffer_number_to_maintain = -1;
|
|
}
|
|
if (!cf_options->disable_auto_compactions) {
|
|
// Disable compactions momentarily to prevent race with DB::Open
|
|
cf_options->disable_auto_compactions = true;
|
|
compaction_enabled_cf_indices->push_back(i);
|
|
}
|
|
}
|
|
db_options->allow_2pc = true;
|
|
}
|
|
|
|
Status TransactionDB::WrapDB(
|
|
// make sure this db is already opened with memtable history enabled,
|
|
// auto compaction distabled and 2 phase commit enabled
|
|
DB* db, const TransactionDBOptions& txn_db_options,
|
|
const std::vector<size_t>& compaction_enabled_cf_indices,
|
|
const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
|
|
TransactionDBImpl* txn_db = new TransactionDBImpl(
|
|
db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));
|
|
*dbptr = txn_db;
|
|
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDB::WrapStackableDB(
|
|
// make sure this stackable_db is already opened with memtable history
|
|
// enabled,
|
|
// auto compaction distabled and 2 phase commit enabled
|
|
StackableDB* db, const TransactionDBOptions& txn_db_options,
|
|
const std::vector<size_t>& compaction_enabled_cf_indices,
|
|
const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
|
|
TransactionDBImpl* txn_db = new TransactionDBImpl(
|
|
db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));
|
|
*dbptr = txn_db;
|
|
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
|
return s;
|
|
}
|
|
|
|
// Let TransactionLockMgr know that this column family exists so it can
|
|
// allocate a LockMap for it.
|
|
void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) {
|
|
lock_mgr_.AddColumnFamily(handle->GetID());
|
|
}
|
|
|
|
Status TransactionDBImpl::CreateColumnFamily(
|
|
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
|
ColumnFamilyHandle** handle) {
|
|
InstrumentedMutexLock l(&column_family_mutex_);
|
|
|
|
Status s = db_->CreateColumnFamily(options, column_family_name, handle);
|
|
if (s.ok()) {
|
|
lock_mgr_.AddColumnFamily((*handle)->GetID());
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// Let TransactionLockMgr know that it can deallocate the LockMap for this
|
|
// column family.
|
|
Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
|
InstrumentedMutexLock l(&column_family_mutex_);
|
|
|
|
Status s = db_->DropColumnFamily(column_family);
|
|
if (s.ok()) {
|
|
lock_mgr_.RemoveColumnFamily(column_family->GetID());
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::TryLock(PessimisticTxn* txn, uint32_t cfh_id,
|
|
const std::string& key, bool exclusive) {
|
|
return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv(), exclusive);
|
|
}
|
|
|
|
void TransactionDBImpl::UnLock(PessimisticTxn* txn,
|
|
const TransactionKeyMap* keys) {
|
|
lock_mgr_.UnLock(txn, keys, GetEnv());
|
|
}
|
|
|
|
void TransactionDBImpl::UnLock(PessimisticTxn* txn, uint32_t cfh_id,
|
|
const std::string& key) {
|
|
lock_mgr_.UnLock(txn, cfh_id, key, GetEnv());
|
|
}
|
|
|
|
// Used when wrapping DB write operations in a transaction
|
|
Transaction* TransactionDBImpl::BeginInternalTransaction(
|
|
const WriteOptions& options) {
|
|
TransactionOptions txn_options;
|
|
Transaction* txn = BeginTransaction(options, txn_options, nullptr);
|
|
|
|
// Use default timeout for non-transactional writes
|
|
txn->SetLockTimeout(txn_db_options_.default_lock_timeout);
|
|
return txn;
|
|
}
|
|
|
|
// All user Put, Merge, Delete, and Write requests must be intercepted to make
|
|
// sure that they lock all keys that they are writing to avoid causing conflicts
|
|
// with any concurent transactions. The easiest way to do this is to wrap all
|
|
// write operations in a transaction.
|
|
//
|
|
// Put(), Merge(), and Delete() only lock a single key per call. Write() will
|
|
// sort its keys before locking them. This guarantees that TransactionDB write
|
|
// methods cannot deadlock with eachother (but still could deadlock with a
|
|
// Transaction).
|
|
Status TransactionDBImpl::Put(const WriteOptions& options,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key, const Slice& val) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(options);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do PutUntracked().
|
|
s = txn->PutUntracked(column_family, key, val);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Delete(const WriteOptions& wopts,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(wopts);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do
|
|
// DeleteUntracked().
|
|
s = txn->DeleteUntracked(column_family, key);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Merge(const WriteOptions& options,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key, const Slice& value) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(options);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do
|
|
// MergeUntracked().
|
|
s = txn->MergeUntracked(column_family, key, value);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
|
|
// Need to lock all keys in this batch to prevent write conflicts with
|
|
// concurrent transactions.
|
|
Transaction* txn = BeginInternalTransaction(opts);
|
|
txn->DisableIndexing();
|
|
|
|
auto txn_impl = static_cast_with_check<PessimisticTxn, Transaction>(txn);
|
|
|
|
// Since commitBatch sorts the keys before locking, concurrent Write()
|
|
// operations will not cause a deadlock.
|
|
// In order to avoid a deadlock with a concurrent Transaction, Transactions
|
|
// should use a lock timeout.
|
|
Status s = txn_impl->CommitBatch(updates);
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
void TransactionDBImpl::InsertExpirableTransaction(TransactionID tx_id,
|
|
PessimisticTxn* tx) {
|
|
assert(tx->GetExpirationTime() > 0);
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
expirable_transactions_map_.insert({tx_id, tx});
|
|
}
|
|
|
|
void TransactionDBImpl::RemoveExpirableTransaction(TransactionID tx_id) {
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
expirable_transactions_map_.erase(tx_id);
|
|
}
|
|
|
|
bool TransactionDBImpl::TryStealingExpiredTransactionLocks(
|
|
TransactionID tx_id) {
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
|
|
auto tx_it = expirable_transactions_map_.find(tx_id);
|
|
if (tx_it == expirable_transactions_map_.end()) {
|
|
return true;
|
|
}
|
|
PessimisticTxn& tx = *(tx_it->second);
|
|
return tx.TryStealingLocks();
|
|
}
|
|
|
|
void TransactionDBImpl::ReinitializeTransaction(
|
|
Transaction* txn, const WriteOptions& write_options,
|
|
const TransactionOptions& txn_options) {
|
|
auto txn_impl = static_cast_with_check<PessimisticTxn, Transaction>(txn);
|
|
|
|
txn_impl->Reinitialize(this, write_options, txn_options);
|
|
}
|
|
|
|
Transaction* TransactionDBImpl::GetTransactionByName(
|
|
const TransactionName& name) {
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
auto it = transactions_.find(name);
|
|
if (it == transactions_.end()) {
|
|
return nullptr;
|
|
} else {
|
|
return it->second;
|
|
}
|
|
}
|
|
|
|
void TransactionDBImpl::GetAllPreparedTransactions(
|
|
std::vector<Transaction*>* transv) {
|
|
assert(transv);
|
|
transv->clear();
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
for (auto it = transactions_.begin(); it != transactions_.end(); it++) {
|
|
if (it->second->GetState() == Transaction::PREPARED) {
|
|
transv->push_back(it->second);
|
|
}
|
|
}
|
|
}
|
|
|
|
TransactionLockMgr::LockStatusData TransactionDBImpl::GetLockStatusData() {
|
|
return lock_mgr_.GetLockStatusData();
|
|
}
|
|
|
|
void TransactionDBImpl::RegisterTransaction(Transaction* txn) {
|
|
assert(txn);
|
|
assert(txn->GetName().length() > 0);
|
|
assert(GetTransactionByName(txn->GetName()) == nullptr);
|
|
assert(txn->GetState() == Transaction::STARTED);
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
transactions_[txn->GetName()] = txn;
|
|
}
|
|
|
|
void TransactionDBImpl::UnregisterTransaction(Transaction* txn) {
|
|
assert(txn);
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
auto it = transactions_.find(txn->GetName());
|
|
assert(it != transactions_.end());
|
|
transactions_.erase(it);
|
|
}
|
|
|
|
} // namespace rocksdb
|
|
#endif // ROCKSDB_LITE
|