mirror of https://github.com/facebook/rocksdb.git
Range Locking: Implementation of range locking (#7506)
Summary: Range Locking - an implementation based on the locktree library - Add a RangeTreeLockManager and RangeTreeLockTracker which implement range locking using the locktree library. - Point locks are handled as locks on single-point ranges. - Add a unit test: range_locking_test Pull Request resolved: https://github.com/facebook/rocksdb/pull/7506 Reviewed By: akankshamahajan15 Differential Revision: D25320703 Pulled By: cheng-chang fbshipit-source-id: f86347384b42ba2b0257d67eca0f45f806b69da7
This commit is contained in:
parent
f4db3e4119
commit
daab7603f6
|
@ -814,6 +814,8 @@ set(SOURCES
|
||||||
utilities/transactions/lock/lock_manager.cc
|
utilities/transactions/lock/lock_manager.cc
|
||||||
utilities/transactions/lock/point/point_lock_tracker.cc
|
utilities/transactions/lock/point/point_lock_tracker.cc
|
||||||
utilities/transactions/lock/point/point_lock_manager.cc
|
utilities/transactions/lock/point/point_lock_manager.cc
|
||||||
|
utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
|
||||||
|
utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
|
||||||
utilities/transactions/optimistic_transaction_db_impl.cc
|
utilities/transactions/optimistic_transaction_db_impl.cc
|
||||||
utilities/transactions/optimistic_transaction.cc
|
utilities/transactions/optimistic_transaction.cc
|
||||||
utilities/transactions/pessimistic_transaction.cc
|
utilities/transactions/pessimistic_transaction.cc
|
||||||
|
@ -831,7 +833,7 @@ set(SOURCES
|
||||||
utilities/write_batch_with_index/write_batch_with_index_internal.cc
|
utilities/write_batch_with_index/write_batch_with_index_internal.cc
|
||||||
$<TARGET_OBJECTS:build_version>)
|
$<TARGET_OBJECTS:build_version>)
|
||||||
|
|
||||||
list(APPEND SOURCE
|
list(APPEND SOURCES
|
||||||
utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
|
utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
|
||||||
utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
|
utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
|
||||||
utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
|
utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
|
||||||
|
@ -1225,6 +1227,7 @@ if(WITH_TESTS)
|
||||||
utilities/transactions/lock/point/point_lock_manager_test.cc
|
utilities/transactions/lock/point/point_lock_manager_test.cc
|
||||||
utilities/transactions/write_prepared_transaction_test.cc
|
utilities/transactions/write_prepared_transaction_test.cc
|
||||||
utilities/transactions/write_unprepared_transaction_test.cc
|
utilities/transactions/write_unprepared_transaction_test.cc
|
||||||
|
utilities/transactions/lock/range/range_locking_test.cc
|
||||||
utilities/ttl/ttl_test.cc
|
utilities/ttl/ttl_test.cc
|
||||||
utilities/write_batch_with_index/write_batch_with_index_test.cc
|
utilities/write_batch_with_index/write_batch_with_index_test.cc
|
||||||
)
|
)
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -558,6 +558,7 @@ PARALLEL_TEST = \
|
||||||
table_test \
|
table_test \
|
||||||
transaction_test \
|
transaction_test \
|
||||||
point_lock_manager_test \
|
point_lock_manager_test \
|
||||||
|
range_locking_test \
|
||||||
write_prepared_transaction_test \
|
write_prepared_transaction_test \
|
||||||
write_unprepared_transaction_test \
|
write_unprepared_transaction_test \
|
||||||
|
|
||||||
|
@ -1936,6 +1937,9 @@ blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBR
|
||||||
repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY)
|
repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY)
|
||||||
$(AM_LINK)
|
$(AM_LINK)
|
||||||
|
|
||||||
|
range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY)
|
||||||
|
$(AM_LINK)
|
||||||
|
|
||||||
range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY)
|
range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY)
|
||||||
$(AM_LINK)
|
$(AM_LINK)
|
||||||
|
|
||||||
|
|
11
TARGETS
11
TARGETS
|
@ -400,6 +400,8 @@ cpp_library(
|
||||||
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
|
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
|
||||||
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
|
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
|
||||||
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
|
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
|
||||||
|
"utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
|
||||||
|
"utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
|
||||||
"utilities/transactions/optimistic_transaction.cc",
|
"utilities/transactions/optimistic_transaction.cc",
|
||||||
"utilities/transactions/optimistic_transaction_db_impl.cc",
|
"utilities/transactions/optimistic_transaction_db_impl.cc",
|
||||||
"utilities/transactions/pessimistic_transaction.cc",
|
"utilities/transactions/pessimistic_transaction.cc",
|
||||||
|
@ -702,6 +704,8 @@ cpp_library(
|
||||||
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
|
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
|
||||||
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
|
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
|
||||||
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
|
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
|
||||||
|
"utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
|
||||||
|
"utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
|
||||||
"utilities/transactions/optimistic_transaction.cc",
|
"utilities/transactions/optimistic_transaction.cc",
|
||||||
"utilities/transactions/optimistic_transaction_db_impl.cc",
|
"utilities/transactions/optimistic_transaction_db_impl.cc",
|
||||||
"utilities/transactions/pessimistic_transaction.cc",
|
"utilities/transactions/pessimistic_transaction.cc",
|
||||||
|
@ -1800,6 +1804,13 @@ ROCKS_TESTS = [
|
||||||
[],
|
[],
|
||||||
[],
|
[],
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"range_locking_test",
|
||||||
|
"utilities/transactions/lock/range/range_locking_test.cc",
|
||||||
|
"parallel",
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"range_tombstone_fragmenter_test",
|
"range_tombstone_fragmenter_test",
|
||||||
"db/range_tombstone_fragmenter_test.cc",
|
"db/range_tombstone_fragmenter_test.cc",
|
||||||
|
|
|
@ -86,7 +86,11 @@ struct RangeDeadlockPath {
|
||||||
// RocksDB
|
// RocksDB
|
||||||
class RangeLockManagerHandle : public LockManagerHandle {
|
class RangeLockManagerHandle : public LockManagerHandle {
|
||||||
public:
|
public:
|
||||||
// Total amount of lock memory to use (per column family)
|
// Set total amount of lock memory to use.
|
||||||
|
//
|
||||||
|
// @return 0 Ok
|
||||||
|
// @return EDOM Failed to set because currently using more memory than
|
||||||
|
// specified
|
||||||
virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
|
virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
|
||||||
virtual size_t GetMaxLockMemory() = 0;
|
virtual size_t GetMaxLockMemory() = 0;
|
||||||
|
|
||||||
|
|
3
src.mk
3
src.mk
|
@ -267,6 +267,8 @@ LIB_SOURCES = \
|
||||||
utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \
|
utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \
|
||||||
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \
|
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \
|
||||||
utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \
|
utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \
|
||||||
|
utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc \
|
||||||
|
utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc \
|
||||||
utilities/transactions/optimistic_transaction.cc \
|
utilities/transactions/optimistic_transaction.cc \
|
||||||
utilities/transactions/optimistic_transaction_db_impl.cc \
|
utilities/transactions/optimistic_transaction_db_impl.cc \
|
||||||
utilities/transactions/pessimistic_transaction.cc \
|
utilities/transactions/pessimistic_transaction.cc \
|
||||||
|
@ -536,6 +538,7 @@ TEST_MAIN_SOURCES = \
|
||||||
utilities/simulator_cache/sim_cache_test.cc \
|
utilities/simulator_cache/sim_cache_test.cc \
|
||||||
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
|
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
|
||||||
utilities/transactions/optimistic_transaction_test.cc \
|
utilities/transactions/optimistic_transaction_test.cc \
|
||||||
|
utilities/transactions/lock/range/range_locking_test.cc \
|
||||||
utilities/transactions/transaction_test.cc \
|
utilities/transactions/transaction_test.cc \
|
||||||
utilities/transactions/lock/point/point_lock_manager_test.cc \
|
utilities/transactions/lock/point/point_lock_manager_test.cc \
|
||||||
utilities/transactions/write_prepared_transaction_test.cc \
|
utilities/transactions/write_prepared_transaction_test.cc \
|
||||||
|
|
|
@ -273,4 +273,47 @@ TEST_P(AnyLockManagerTest, Deadlock) {
|
||||||
delete txn1;
|
delete txn1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
|
||||||
|
MockColumnFamilyHandle cf(1);
|
||||||
|
locker_->AddColumnFamily(&cf);
|
||||||
|
|
||||||
|
auto txn1 = NewTxn();
|
||||||
|
ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
|
||||||
|
|
||||||
|
auto txn2 = NewTxn();
|
||||||
|
ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
|
||||||
|
|
||||||
|
auto txn3 = NewTxn();
|
||||||
|
txn3->SetLockTimeout(10000);
|
||||||
|
port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
|
||||||
|
ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
|
||||||
|
locker_->UnLock(txn3, 1, "k", env_);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Ok, now txn3 is waiting for lock on "k", which is owned by two
|
||||||
|
// transactions. Check that GetWaitingTxns reports this correctly
|
||||||
|
uint32_t wait_cf_id;
|
||||||
|
std::string wait_key;
|
||||||
|
auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
|
||||||
|
|
||||||
|
ASSERT_EQ(wait_cf_id, 1u);
|
||||||
|
ASSERT_EQ(wait_key, "k");
|
||||||
|
ASSERT_EQ(waiters.size(), 2);
|
||||||
|
bool waits_correct =
|
||||||
|
(waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
|
||||||
|
(waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
|
||||||
|
ASSERT_EQ(waits_correct, true);
|
||||||
|
|
||||||
|
// Release locks so txn3 can proceed with execution
|
||||||
|
locker_->UnLock(txn1, 1, "k", env_);
|
||||||
|
locker_->UnLock(txn2, 1, "k", env_);
|
||||||
|
|
||||||
|
// Wait until txn3 finishes
|
||||||
|
t1.join();
|
||||||
|
|
||||||
|
delete txn1;
|
||||||
|
delete txn2;
|
||||||
|
delete txn3;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
|
@ -0,0 +1,304 @@
|
||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
#ifndef OS_WIN
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <functional>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
#include "db/db_impl/db_impl.h"
|
||||||
|
#include "port/port.h"
|
||||||
|
#include "rocksdb/db.h"
|
||||||
|
#include "rocksdb/options.h"
|
||||||
|
#include "rocksdb/perf_context.h"
|
||||||
|
#include "rocksdb/utilities/transaction.h"
|
||||||
|
#include "rocksdb/utilities/transaction_db.h"
|
||||||
|
#include "utilities/transactions/lock/point/point_lock_manager_test.h"
|
||||||
|
#include "utilities/transactions/pessimistic_transaction_db.h"
|
||||||
|
#include "utilities/transactions/transaction_test.h"
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
namespace ROCKSDB_NAMESPACE {
|
||||||
|
|
||||||
|
class RangeLockingTest : public ::testing::Test {
|
||||||
|
public:
|
||||||
|
TransactionDB* db;
|
||||||
|
std::string dbname;
|
||||||
|
Options options;
|
||||||
|
|
||||||
|
std::shared_ptr<RangeLockManagerHandle> range_lock_mgr;
|
||||||
|
TransactionDBOptions txn_db_options;
|
||||||
|
|
||||||
|
RangeLockingTest() : db(nullptr) {
|
||||||
|
options.create_if_missing = true;
|
||||||
|
dbname = test::PerThreadDBPath("range_locking_testdb");
|
||||||
|
|
||||||
|
DestroyDB(dbname, options);
|
||||||
|
|
||||||
|
range_lock_mgr.reset(NewRangeLockManager(nullptr));
|
||||||
|
txn_db_options.lock_mgr_handle = range_lock_mgr;
|
||||||
|
|
||||||
|
auto s = TransactionDB::Open(options, txn_db_options, dbname, &db);
|
||||||
|
assert(s.ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
~RangeLockingTest() {
|
||||||
|
delete db;
|
||||||
|
db = nullptr;
|
||||||
|
// This is to skip the assert statement in FaultInjectionTestEnv. There
|
||||||
|
// seems to be a bug in btrfs that the makes readdir return recently
|
||||||
|
// unlink-ed files. By using the default fs we simply ignore errors resulted
|
||||||
|
// from attempting to delete such files in DestroyDB.
|
||||||
|
DestroyDB(dbname, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
PessimisticTransaction* NewTxn(
|
||||||
|
TransactionOptions txn_opt = TransactionOptions()) {
|
||||||
|
Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt);
|
||||||
|
return reinterpret_cast<PessimisticTransaction*>(txn);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: set a smaller lock wait timeout so that the test runs faster.
|
||||||
|
TEST_F(RangeLockingTest, BasicRangeLocking) {
|
||||||
|
WriteOptions write_options;
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
std::string value;
|
||||||
|
ReadOptions read_options;
|
||||||
|
auto cf = db->DefaultColumnFamily();
|
||||||
|
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
|
||||||
|
// Get a range lock
|
||||||
|
ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
|
||||||
|
|
||||||
|
// Check that range Lock inhibits an overlapping range lock
|
||||||
|
{
|
||||||
|
auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
|
||||||
|
ASSERT_TRUE(s.IsTimedOut());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that range Lock inhibits an overlapping point lock
|
||||||
|
{
|
||||||
|
auto s = txn1->GetForUpdate(read_options, cf, Slice("b"), &value);
|
||||||
|
ASSERT_TRUE(s.IsTimedOut());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a point lock, check that it inhibits range locks
|
||||||
|
ASSERT_OK(txn0->Put(cf, Slice("n"), Slice("value")));
|
||||||
|
{
|
||||||
|
auto s = txn1->GetRangeLock(cf, Endpoint("m"), Endpoint("p"));
|
||||||
|
ASSERT_TRUE(s.IsTimedOut());
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_OK(txn0->Commit());
|
||||||
|
txn1->Rollback();
|
||||||
|
|
||||||
|
delete txn0;
|
||||||
|
delete txn1;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(RangeLockingTest, MyRocksLikeUpdate) {
|
||||||
|
WriteOptions write_options;
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
auto cf = db->DefaultColumnFamily();
|
||||||
|
Status s;
|
||||||
|
|
||||||
|
// Get a range lock for the range we are about to update
|
||||||
|
ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
|
||||||
|
|
||||||
|
bool try_range_lock_called = false;
|
||||||
|
|
||||||
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||||
|
"RangeTreeLockManager::TryRangeLock:enter",
|
||||||
|
[&](void* /*arg*/) { try_range_lock_called = true; });
|
||||||
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
||||||
|
|
||||||
|
// For performance reasons, the following must NOT call lock_mgr->TryLock():
|
||||||
|
// We verify that by checking the value of try_range_lock_called.
|
||||||
|
ASSERT_OK(txn0->Put(cf, Slice("b"), Slice("value"),
|
||||||
|
/*assume_tracked=*/true));
|
||||||
|
|
||||||
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||||
|
ASSERT_FALSE(try_range_lock_called);
|
||||||
|
|
||||||
|
txn0->Rollback();
|
||||||
|
|
||||||
|
delete txn0;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(RangeLockingTest, SnapshotValidation) {
|
||||||
|
Status s;
|
||||||
|
Slice key_slice = Slice("k");
|
||||||
|
ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
|
||||||
|
|
||||||
|
auto txn0 = NewTxn();
|
||||||
|
txn0->Put(key_slice, Slice("initial"));
|
||||||
|
txn0->Commit();
|
||||||
|
|
||||||
|
// txn1
|
||||||
|
auto txn1 = NewTxn();
|
||||||
|
txn1->SetSnapshot();
|
||||||
|
std::string val1;
|
||||||
|
ASSERT_OK(txn1->Get(ReadOptions(), cfh, key_slice, &val1));
|
||||||
|
ASSERT_EQ(val1, "initial");
|
||||||
|
val1 = val1 + std::string("-txn1");
|
||||||
|
|
||||||
|
ASSERT_OK(txn1->Put(cfh, key_slice, Slice(val1)));
|
||||||
|
|
||||||
|
// txn2
|
||||||
|
auto txn2 = NewTxn();
|
||||||
|
txn2->SetSnapshot();
|
||||||
|
std::string val2;
|
||||||
|
// This will see the original value as nothing is committed
|
||||||
|
// This is also Get, so it is doesn't acquire any locks.
|
||||||
|
ASSERT_OK(txn2->Get(ReadOptions(), cfh, key_slice, &val2));
|
||||||
|
ASSERT_EQ(val2, "initial");
|
||||||
|
|
||||||
|
// txn1
|
||||||
|
ASSERT_OK(txn1->Commit());
|
||||||
|
|
||||||
|
// txn2
|
||||||
|
val2 = val2 + std::string("-txn2");
|
||||||
|
// Now, this call should do Snapshot Validation and fail:
|
||||||
|
s = txn2->Put(cfh, key_slice, Slice(val2));
|
||||||
|
ASSERT_TRUE(s.IsBusy());
|
||||||
|
|
||||||
|
ASSERT_OK(txn2->Commit());
|
||||||
|
|
||||||
|
delete txn0;
|
||||||
|
delete txn1;
|
||||||
|
delete txn2;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(RangeLockingTest, MultipleTrxLockStatusData) {
|
||||||
|
WriteOptions write_options;
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
auto cf = db->DefaultColumnFamily();
|
||||||
|
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
|
||||||
|
// Get a range lock
|
||||||
|
ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("z"), Endpoint("z")));
|
||||||
|
ASSERT_OK(txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("e")));
|
||||||
|
|
||||||
|
auto s = range_lock_mgr->GetRangeLockStatusData();
|
||||||
|
ASSERT_EQ(s.size(), 2);
|
||||||
|
for (auto it = s.begin(); it != s.end(); ++it) {
|
||||||
|
ASSERT_EQ(it->first, cf->GetID());
|
||||||
|
auto val = it->second;
|
||||||
|
ASSERT_FALSE(val.start.inf_suffix);
|
||||||
|
ASSERT_FALSE(val.end.inf_suffix);
|
||||||
|
ASSERT_TRUE(val.exclusive);
|
||||||
|
ASSERT_EQ(val.ids.size(), 1);
|
||||||
|
if (val.ids[0] == txn0->GetID()) {
|
||||||
|
ASSERT_EQ(val.start.slice, "z");
|
||||||
|
ASSERT_EQ(val.end.slice, "z");
|
||||||
|
} else if (val.ids[0] == txn1->GetID()) {
|
||||||
|
ASSERT_EQ(val.start.slice, "b");
|
||||||
|
ASSERT_EQ(val.end.slice, "e");
|
||||||
|
} else {
|
||||||
|
FAIL(); // Unknown transaction ID.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
delete txn0;
|
||||||
|
delete txn1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__has_feature)
|
||||||
|
#if __has_feature(thread_sanitizer)
|
||||||
|
#define SKIP_LOCK_ESCALATION_TEST 1
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SKIP_LOCK_ESCALATION_TEST
|
||||||
|
TEST_F(RangeLockingTest, BasicLockEscalation) {
|
||||||
|
auto cf = db->DefaultColumnFamily();
|
||||||
|
|
||||||
|
auto counters = range_lock_mgr->GetStatus();
|
||||||
|
|
||||||
|
// Initially not using any lock memory
|
||||||
|
ASSERT_EQ(counters.current_lock_memory, 0);
|
||||||
|
ASSERT_EQ(counters.escalation_count, 0);
|
||||||
|
|
||||||
|
ASSERT_EQ(0, range_lock_mgr->SetMaxLockMemory(2000));
|
||||||
|
|
||||||
|
// Insert until we see lock escalations
|
||||||
|
auto txn = NewTxn();
|
||||||
|
|
||||||
|
// Get the locks until we hit an escalation
|
||||||
|
for (int i = 0; i < 2020; i++) {
|
||||||
|
char buf[32];
|
||||||
|
snprintf(buf, sizeof(buf) - 1, "%08d", i);
|
||||||
|
ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf), Endpoint(buf)));
|
||||||
|
}
|
||||||
|
counters = range_lock_mgr->GetStatus();
|
||||||
|
ASSERT_GT(counters.escalation_count, 0);
|
||||||
|
ASSERT_LE(counters.current_lock_memory, 2000);
|
||||||
|
|
||||||
|
delete txn;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void PointLockManagerTestExternalSetup(PointLockManagerTest* self) {
|
||||||
|
self->env_ = Env::Default();
|
||||||
|
self->db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
|
||||||
|
ASSERT_OK(self->env_->CreateDir(self->db_dir_));
|
||||||
|
|
||||||
|
Options opt;
|
||||||
|
opt.create_if_missing = true;
|
||||||
|
TransactionDBOptions txn_opt;
|
||||||
|
txn_opt.transaction_lock_timeout = 0;
|
||||||
|
|
||||||
|
auto mutex_factory = std::make_shared<TransactionDBMutexFactoryImpl>();
|
||||||
|
self->locker_.reset(NewRangeLockManager(mutex_factory)->getLockManager());
|
||||||
|
std::shared_ptr<RangeLockManagerHandle> range_lock_mgr =
|
||||||
|
std::dynamic_pointer_cast<RangeLockManagerHandle>(self->locker_);
|
||||||
|
txn_opt.lock_mgr_handle = range_lock_mgr;
|
||||||
|
|
||||||
|
ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_));
|
||||||
|
self->wait_sync_point_name_ = "RangeTreeLockManager::TryRangeLock:WaitingTxn";
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(RangeLockManager, AnyLockManagerTest,
|
||||||
|
::testing::Values(PointLockManagerTestExternalSetup));
|
||||||
|
|
||||||
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
||||||
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
|
return RUN_ALL_TESTS();
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // OS_WIN
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
int main(int /*argc*/, char** /*argv*/) {
|
||||||
|
fprintf(stderr, "skipped as Range Locking is not supported on Windows\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // OS_WIN
|
||||||
|
|
||||||
|
#else
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
int main(int /*argc*/, char** /*argv*/) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"skipped as transactions are not supported in rocksdb_lite\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // ROCKSDB_LITE
|
|
@ -1 +0,0 @@
|
||||||
Implement LockTracker and LockManager for range lock with range tree.
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
||||||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
#ifndef OS_WIN
|
#ifndef OS_WIN
|
||||||
#ident "$Id$"
|
#ident "$Id$"
|
||||||
|
@ -240,7 +240,7 @@ int lock_request::wait(uint64_t wait_time_ms) {
|
||||||
|
|
||||||
int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
|
int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
|
||||||
int (*killed_callback)(void),
|
int (*killed_callback)(void),
|
||||||
void (*lock_wait_callback)(void *, TXNID, TXNID),
|
void (*lock_wait_callback)(void *, lock_wait_infos *),
|
||||||
void *callback_arg) {
|
void *callback_arg) {
|
||||||
uint64_t t_now = toku_current_time_microsec();
|
uint64_t t_now = toku_current_time_microsec();
|
||||||
uint64_t t_start = t_now;
|
uint64_t t_start = t_now;
|
||||||
|
@ -250,13 +250,11 @@ int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
|
||||||
|
|
||||||
// check again, this time locking out other retry calls
|
// check again, this time locking out other retry calls
|
||||||
if (m_state == state::PENDING) {
|
if (m_state == state::PENDING) {
|
||||||
GrowableArray<TXNID> conflicts_collector;
|
lock_wait_infos conflicts_collector;
|
||||||
conflicts_collector.init();
|
|
||||||
retry(&conflicts_collector);
|
retry(&conflicts_collector);
|
||||||
if (m_state == state::PENDING) {
|
if (m_state == state::PENDING) {
|
||||||
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
|
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
|
||||||
}
|
}
|
||||||
conflicts_collector.deinit();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while (m_state == state::PENDING) {
|
while (m_state == state::PENDING) {
|
||||||
|
@ -325,7 +323,7 @@ TXNID lock_request::get_conflicting_txnid(void) const {
|
||||||
return m_conflicting_txnid;
|
return m_conflicting_txnid;
|
||||||
}
|
}
|
||||||
|
|
||||||
int lock_request::retry(GrowableArray<TXNID> *conflicts_collector) {
|
int lock_request::retry(lock_wait_infos *conflicts_collector) {
|
||||||
invariant(m_state == state::PENDING);
|
invariant(m_state == state::PENDING);
|
||||||
int r;
|
int r;
|
||||||
txnid_set conflicts;
|
txnid_set conflicts;
|
||||||
|
@ -356,7 +354,7 @@ int lock_request::retry(GrowableArray<TXNID> *conflicts_collector) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void lock_request::retry_all_lock_requests(
|
void lock_request::retry_all_lock_requests(
|
||||||
locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID),
|
locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *),
|
||||||
void *callback_arg, void (*after_retry_all_test_callback)(void)) {
|
void *callback_arg, void (*after_retry_all_test_callback)(void)) {
|
||||||
lt_lock_request_info *info = lt->get_lock_request_info();
|
lt_lock_request_info *info = lt->get_lock_request_info();
|
||||||
|
|
||||||
|
@ -370,8 +368,7 @@ void lock_request::retry_all_lock_requests(
|
||||||
|
|
||||||
toku_mutex_lock(&info->retry_mutex);
|
toku_mutex_lock(&info->retry_mutex);
|
||||||
|
|
||||||
GrowableArray<TXNID> conflicts_collector;
|
lock_wait_infos conflicts_collector;
|
||||||
conflicts_collector.init();
|
|
||||||
|
|
||||||
// here is the group retry algorithm.
|
// here is the group retry algorithm.
|
||||||
// get the latest retry_want count and use it as the generation number of
|
// get the latest retry_want count and use it as the generation number of
|
||||||
|
@ -398,11 +395,10 @@ void lock_request::retry_all_lock_requests(
|
||||||
toku_mutex_unlock(&info->retry_mutex);
|
toku_mutex_unlock(&info->retry_mutex);
|
||||||
|
|
||||||
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
|
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
|
||||||
conflicts_collector.deinit();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void lock_request::retry_all_lock_requests_info(
|
void lock_request::retry_all_lock_requests_info(lt_lock_request_info *info,
|
||||||
lt_lock_request_info *info, GrowableArray<TXNID> *collector) {
|
lock_wait_infos *collector) {
|
||||||
toku_external_mutex_lock(&info->mutex);
|
toku_external_mutex_lock(&info->mutex);
|
||||||
// retry all of the pending lock requests.
|
// retry all of the pending lock requests.
|
||||||
for (uint32_t i = 0; i < info->pending_lock_requests.size();) {
|
for (uint32_t i = 0; i < info->pending_lock_requests.size();) {
|
||||||
|
@ -425,26 +421,20 @@ void lock_request::retry_all_lock_requests_info(
|
||||||
toku_external_mutex_unlock(&info->mutex);
|
toku_external_mutex_unlock(&info->mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
void lock_request::add_conflicts_to_waits(
|
void lock_request::add_conflicts_to_waits(txnid_set *conflicts,
|
||||||
txnid_set *conflicts, GrowableArray<TXNID> *wait_conflicts) {
|
lock_wait_infos *wait_conflicts) {
|
||||||
|
wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}});
|
||||||
uint32_t num_conflicts = conflicts->size();
|
uint32_t num_conflicts = conflicts->size();
|
||||||
for (uint32_t i = 0; i < num_conflicts; i++) {
|
for (uint32_t i = 0; i < num_conflicts; i++) {
|
||||||
wait_conflicts->push(m_txnid);
|
wait_conflicts->back().waitees.push_back(conflicts->get(i));
|
||||||
wait_conflicts->push(conflicts->get(i));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void lock_request::report_waits(GrowableArray<TXNID> *wait_conflicts,
|
void lock_request::report_waits(lock_wait_infos *wait_conflicts,
|
||||||
void (*lock_wait_callback)(void *, TXNID,
|
void (*lock_wait_callback)(void *,
|
||||||
TXNID),
|
lock_wait_infos *),
|
||||||
void *callback_arg) {
|
void *callback_arg) {
|
||||||
if (!lock_wait_callback) return;
|
if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts);
|
||||||
size_t num_conflicts = wait_conflicts->get_size();
|
|
||||||
for (size_t i = 0; i < num_conflicts; i += 2) {
|
|
||||||
TXNID blocked_txnid = wait_conflicts->fetch_unchecked(i);
|
|
||||||
TXNID blocking_txnid = wait_conflicts->fetch_unchecked(i + 1);
|
|
||||||
(*lock_wait_callback)(callback_arg, blocked_txnid, blocking_txnid);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void *lock_request::get_extra(void) const { return m_extra; }
|
void *lock_request::get_extra(void) const { return m_extra; }
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
||||||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
|
||||||
#ident "$Id$"
|
#ident "$Id$"
|
||||||
/*======
|
/*======
|
||||||
This file is part of PerconaFT.
|
This file is part of PerconaFT.
|
||||||
|
@ -62,6 +62,18 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
|
||||||
|
|
||||||
namespace toku {
|
namespace toku {
|
||||||
|
|
||||||
|
// Information about a lock wait
|
||||||
|
struct lock_wait_info {
|
||||||
|
locktree *ltree; // the tree where wait happens
|
||||||
|
TXNID waiter; // the waiting transaction
|
||||||
|
void *m_extra; // lock_request's m_extra
|
||||||
|
|
||||||
|
// The transactions that are waited for.
|
||||||
|
std::vector<TXNID> waitees;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::vector<lock_wait_info> lock_wait_infos;
|
||||||
|
|
||||||
// A lock request contains the db, the key range, the lock type, and
|
// A lock request contains the db, the key range, the lock type, and
|
||||||
// the transaction id that describes a potential row range lock.
|
// the transaction id that describes a potential row range lock.
|
||||||
//
|
//
|
||||||
|
@ -101,7 +113,7 @@ class lock_request {
|
||||||
int wait(uint64_t wait_time_ms);
|
int wait(uint64_t wait_time_ms);
|
||||||
int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
|
int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
|
||||||
int (*killed_callback)(void),
|
int (*killed_callback)(void),
|
||||||
void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr,
|
void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
|
||||||
void *callback_arg = nullptr);
|
void *callback_arg = nullptr);
|
||||||
|
|
||||||
// return: left end-point of the lock range
|
// return: left end-point of the lock range
|
||||||
|
@ -124,11 +136,12 @@ class lock_request {
|
||||||
// up.
|
// up.
|
||||||
// The rest remain pending.
|
// The rest remain pending.
|
||||||
static void retry_all_lock_requests(
|
static void retry_all_lock_requests(
|
||||||
locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr,
|
locktree *lt,
|
||||||
|
void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
|
||||||
void *callback_arg = nullptr,
|
void *callback_arg = nullptr,
|
||||||
void (*after_retry_test_callback)(void) = nullptr);
|
void (*after_retry_test_callback)(void) = nullptr);
|
||||||
static void retry_all_lock_requests_info(lt_lock_request_info *info,
|
static void retry_all_lock_requests_info(lt_lock_request_info *info,
|
||||||
GrowableArray<TXNID> *collector);
|
lock_wait_infos *collector);
|
||||||
|
|
||||||
void set_start_test_callback(void (*f)(void));
|
void set_start_test_callback(void (*f)(void));
|
||||||
void set_start_before_pending_test_callback(void (*f)(void));
|
void set_start_before_pending_test_callback(void (*f)(void));
|
||||||
|
@ -181,7 +194,7 @@ class lock_request {
|
||||||
|
|
||||||
// effect: tries again to acquire the lock described by this lock request
|
// effect: tries again to acquire the lock described by this lock request
|
||||||
// returns: 0 if retrying the request succeeded and is now complete
|
// returns: 0 if retrying the request succeeded and is now complete
|
||||||
int retry(GrowableArray<TXNID> *conflict_collector);
|
int retry(lock_wait_infos *collector);
|
||||||
|
|
||||||
void complete(int complete_r);
|
void complete(int complete_r);
|
||||||
|
|
||||||
|
@ -216,11 +229,12 @@ class lock_request {
|
||||||
static int find_by_txnid(lock_request *const &request, const TXNID &txnid);
|
static int find_by_txnid(lock_request *const &request, const TXNID &txnid);
|
||||||
|
|
||||||
// Report list of conflicts to lock wait callback.
|
// Report list of conflicts to lock wait callback.
|
||||||
static void report_waits(GrowableArray<TXNID> *wait_conflicts,
|
static void report_waits(lock_wait_infos *wait_conflicts,
|
||||||
void (*lock_wait_callback)(void *, TXNID, TXNID),
|
void (*lock_wait_callback)(void *,
|
||||||
|
lock_wait_infos *),
|
||||||
void *callback_arg);
|
void *callback_arg);
|
||||||
void add_conflicts_to_waits(txnid_set *conflicts,
|
void add_conflicts_to_waits(txnid_set *conflicts,
|
||||||
GrowableArray<TXNID> *wait_conflicts);
|
lock_wait_infos *wait_conflicts);
|
||||||
|
|
||||||
void (*m_start_test_callback)(void);
|
void (*m_start_test_callback)(void);
|
||||||
void (*m_start_before_pending_test_callback)(void);
|
void (*m_start_before_pending_test_callback)(void);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
||||||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
#ifndef OS_WIN
|
#ifndef OS_WIN
|
||||||
#ident "$Id$"
|
#ident "$Id$"
|
||||||
|
@ -186,7 +186,13 @@ static bool determine_conflicting_txnids(
|
||||||
const TXNID other_txnid = lock.txnid;
|
const TXNID other_txnid = lock.txnid;
|
||||||
if (other_txnid != txnid) {
|
if (other_txnid != txnid) {
|
||||||
if (conflicts) {
|
if (conflicts) {
|
||||||
conflicts->add(other_txnid);
|
if (other_txnid == TXNID_SHARED) {
|
||||||
|
for (TXNID shared_id : *lock.owners) {
|
||||||
|
conflicts->add(shared_id);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
conflicts->add(other_txnid);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
conflicts_exist = true;
|
conflicts_exist = true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,479 @@
|
||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
#ifndef OS_WIN
|
||||||
|
|
||||||
|
#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
#include "monitoring/perf_context_imp.h"
|
||||||
|
#include "rocksdb/slice.h"
|
||||||
|
#include "rocksdb/utilities/transaction_db_mutex.h"
|
||||||
|
#include "test_util/sync_point.h"
|
||||||
|
#include "util/cast_util.h"
|
||||||
|
#include "util/hash.h"
|
||||||
|
#include "util/thread_local.h"
|
||||||
|
#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
|
||||||
|
#include "utilities/transactions/pessimistic_transaction_db.h"
|
||||||
|
#include "utilities/transactions/transaction_db_mutex_impl.h"
|
||||||
|
|
||||||
|
namespace ROCKSDB_NAMESPACE {
|
||||||
|
|
||||||
|
RangeLockManagerHandle* NewRangeLockManager(
|
||||||
|
std::shared_ptr<TransactionDBMutexFactory> mutex_factory) {
|
||||||
|
std::shared_ptr<TransactionDBMutexFactory> use_factory;
|
||||||
|
|
||||||
|
if (mutex_factory) {
|
||||||
|
use_factory = mutex_factory;
|
||||||
|
} else {
|
||||||
|
use_factory.reset(new TransactionDBMutexFactoryImpl());
|
||||||
|
}
|
||||||
|
return new RangeTreeLockManager(use_factory);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char SUFFIX_INFIMUM = 0x0;
|
||||||
|
static const char SUFFIX_SUPREMUM = 0x1;
|
||||||
|
|
||||||
|
// Convert Endpoint into an internal format used for storing it in locktree
|
||||||
|
// (DBT structure is used for passing endpoints to locktree and getting back)
|
||||||
|
void serialize_endpoint(const Endpoint& endp, std::string* buf) {
|
||||||
|
buf->push_back(endp.inf_suffix ? SUFFIX_SUPREMUM : SUFFIX_INFIMUM);
|
||||||
|
buf->append(endp.slice.data(), endp.slice.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the endpoint from the format it is stored in the locktree (DBT) to
|
||||||
|
// one used outside (EndpointWithString)
|
||||||
|
void deserialize_endpoint(const DBT* dbt, EndpointWithString* endp) {
|
||||||
|
assert(dbt->size >= 1);
|
||||||
|
const char* dbt_data = (const char*)dbt->data;
|
||||||
|
char suffix = dbt_data[0];
|
||||||
|
assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM);
|
||||||
|
endp->inf_suffix = (suffix == SUFFIX_SUPREMUM);
|
||||||
|
endp->slice.assign(dbt_data + 1, dbt->size - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a range lock on [start_key; end_key] range
|
||||||
|
Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
|
||||||
|
uint32_t column_family_id,
|
||||||
|
const Endpoint& start_endp,
|
||||||
|
const Endpoint& end_endp, Env*,
|
||||||
|
bool exclusive) {
|
||||||
|
toku::lock_request request;
|
||||||
|
request.create(mutex_factory_);
|
||||||
|
DBT start_key_dbt, end_key_dbt;
|
||||||
|
|
||||||
|
TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:enter");
|
||||||
|
std::string start_key;
|
||||||
|
std::string end_key;
|
||||||
|
serialize_endpoint(start_endp, &start_key);
|
||||||
|
serialize_endpoint(end_endp, &end_key);
|
||||||
|
|
||||||
|
toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size());
|
||||||
|
toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size());
|
||||||
|
|
||||||
|
auto lt = GetLockTreeForCF(column_family_id);
|
||||||
|
|
||||||
|
// Put the key waited on into request's m_extra. See
|
||||||
|
// wait_callback_for_locktree for details.
|
||||||
|
std::string wait_key(start_endp.slice.data(), start_endp.slice.size());
|
||||||
|
|
||||||
|
request.set(lt.get(), (TXNID)txn, &start_key_dbt, &end_key_dbt,
|
||||||
|
exclusive ? toku::lock_request::WRITE : toku::lock_request::READ,
|
||||||
|
false /* not a big txn */, &wait_key);
|
||||||
|
|
||||||
|
// This is for "periodically wake up and check if the wait is killed" feature
|
||||||
|
// which we are not using.
|
||||||
|
uint64_t killed_time_msec = 0;
|
||||||
|
uint64_t wait_time_msec = txn->GetLockTimeout();
|
||||||
|
|
||||||
|
if (wait_time_msec == static_cast<uint64_t>(-1)) {
|
||||||
|
// The transaction has no wait timeout. lock_request::wait doesn't support
|
||||||
|
// this, it needs a number of milliseconds to wait. Pass it one year to
|
||||||
|
// be safe.
|
||||||
|
wait_time_msec = uint64_t(1000) * 60 * 60 * 24 * 365;
|
||||||
|
} else {
|
||||||
|
// convert microseconds to milliseconds
|
||||||
|
wait_time_msec = (wait_time_msec + 500) / 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<RangeDeadlockInfo> di_path;
|
||||||
|
request.m_deadlock_cb = [&](TXNID txnid, bool is_exclusive,
|
||||||
|
const DBT* start_dbt, const DBT* end_dbt) {
|
||||||
|
EndpointWithString start;
|
||||||
|
EndpointWithString end;
|
||||||
|
deserialize_endpoint(start_dbt, &start);
|
||||||
|
deserialize_endpoint(end_dbt, &end);
|
||||||
|
|
||||||
|
di_path.push_back({((PessimisticTransaction*)txnid)->GetID(),
|
||||||
|
column_family_id, is_exclusive, std::move(start),
|
||||||
|
std::move(end)});
|
||||||
|
};
|
||||||
|
|
||||||
|
request.start();
|
||||||
|
|
||||||
|
const int r = request.wait(wait_time_msec, killed_time_msec,
|
||||||
|
nullptr, // killed_callback
|
||||||
|
wait_callback_for_locktree, nullptr);
|
||||||
|
|
||||||
|
// Inform the txn that we are no longer waiting:
|
||||||
|
txn->ClearWaitingTxn();
|
||||||
|
|
||||||
|
request.destroy();
|
||||||
|
switch (r) {
|
||||||
|
case 0:
|
||||||
|
break; // fall through
|
||||||
|
case DB_LOCK_NOTGRANTED:
|
||||||
|
return Status::TimedOut(Status::SubCode::kLockTimeout);
|
||||||
|
case TOKUDB_OUT_OF_LOCKS:
|
||||||
|
return Status::Busy(Status::SubCode::kLockLimit);
|
||||||
|
case DB_LOCK_DEADLOCK: {
|
||||||
|
std::reverse(di_path.begin(), di_path.end());
|
||||||
|
dlock_buffer_.AddNewPath(
|
||||||
|
RangeDeadlockPath(di_path, request.get_start_time()));
|
||||||
|
return Status::Busy(Status::SubCode::kDeadlock);
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
|
return Status::Busy(Status::SubCode::kLockLimit);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait callback that locktree library will call to inform us about
|
||||||
|
// the lock waits that are in progress.
|
||||||
|
void wait_callback_for_locktree(void*, lock_wait_infos* infos) {
|
||||||
|
for (auto wait_info : *infos) {
|
||||||
|
auto txn = (PessimisticTransaction*)wait_info.waiter;
|
||||||
|
auto cf_id = (ColumnFamilyId)wait_info.ltree->get_dict_id().dictid;
|
||||||
|
|
||||||
|
autovector<TransactionID> waitee_ids;
|
||||||
|
for (auto waitee : wait_info.waitees) {
|
||||||
|
waitee_ids.push_back(((PessimisticTransaction*)waitee)->GetID());
|
||||||
|
}
|
||||||
|
txn->SetWaitingTxn(waitee_ids, cf_id, (std::string*)wait_info.m_extra);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Here we can assume that the locktree code will now wait for some lock
|
||||||
|
TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:WaitingTxn");
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
|
||||||
|
ColumnFamilyId column_family_id,
|
||||||
|
const std::string& key, Env*) {
|
||||||
|
auto locktree = GetLockTreeForCF(column_family_id);
|
||||||
|
std::string endp_image;
|
||||||
|
serialize_endpoint({key.data(), key.size(), false}, &endp_image);
|
||||||
|
|
||||||
|
DBT key_dbt;
|
||||||
|
toku_fill_dbt(&key_dbt, endp_image.data(), endp_image.size());
|
||||||
|
|
||||||
|
toku::range_buffer range_buf;
|
||||||
|
range_buf.create();
|
||||||
|
range_buf.append(&key_dbt, &key_dbt);
|
||||||
|
|
||||||
|
locktree->release_locks((TXNID)txn, &range_buf);
|
||||||
|
range_buf.destroy();
|
||||||
|
|
||||||
|
toku::lock_request::retry_all_lock_requests(
|
||||||
|
locktree.get(), wait_callback_for_locktree, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
|
||||||
|
const LockTracker& tracker, Env*) {
|
||||||
|
const RangeTreeLockTracker* range_tracker =
|
||||||
|
static_cast<const RangeTreeLockTracker*>(&tracker);
|
||||||
|
|
||||||
|
RangeTreeLockTracker* range_trx_tracker =
|
||||||
|
static_cast<RangeTreeLockTracker*>(&txn->GetTrackedLocks());
|
||||||
|
bool all_keys = (range_trx_tracker == range_tracker);
|
||||||
|
|
||||||
|
// tracked_locks_->range_list may hold nullptr if the transaction has never
|
||||||
|
// acquired any locks.
|
||||||
|
((RangeTreeLockTracker*)range_tracker)->ReleaseLocks(this, txn, all_keys);
|
||||||
|
}
|
||||||
|
|
||||||
|
int RangeTreeLockManager::CompareDbtEndpoints(void* arg, const DBT* a_key,
|
||||||
|
const DBT* b_key) {
|
||||||
|
const char* a = (const char*)a_key->data;
|
||||||
|
const char* b = (const char*)b_key->data;
|
||||||
|
|
||||||
|
size_t a_len = a_key->size;
|
||||||
|
size_t b_len = b_key->size;
|
||||||
|
|
||||||
|
size_t min_len = std::min(a_len, b_len);
|
||||||
|
|
||||||
|
// Compare the values. The first byte encodes the endpoint type, its value
|
||||||
|
// is either SUFFIX_INFIMUM or SUFFIX_SUPREMUM.
|
||||||
|
Comparator* cmp = (Comparator*)arg;
|
||||||
|
int res = cmp->Compare(Slice(a + 1, min_len - 1), Slice(b + 1, min_len - 1));
|
||||||
|
if (!res) {
|
||||||
|
if (b_len > min_len) {
|
||||||
|
// a is shorter;
|
||||||
|
if (a[0] == SUFFIX_INFIMUM) {
|
||||||
|
return -1; //"a is smaller"
|
||||||
|
} else {
|
||||||
|
// a is considered padded with 0xFF:FF:FF:FF...
|
||||||
|
return 1; // "a" is bigger
|
||||||
|
}
|
||||||
|
} else if (a_len > min_len) {
|
||||||
|
// the opposite of the above: b is shorter.
|
||||||
|
if (b[0] == SUFFIX_INFIMUM) {
|
||||||
|
return 1; //"b is smaller"
|
||||||
|
} else {
|
||||||
|
// b is considered padded with 0xFF:FF:FF:FF...
|
||||||
|
return -1; // "b" is bigger
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// the lengths are equal (and the key values, too)
|
||||||
|
if (a[0] < b[0]) {
|
||||||
|
return -1;
|
||||||
|
} else if (a[0] > b[0]) {
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
void UnrefLockTreeMapsCache(void* ptr) {
|
||||||
|
// Called when a thread exits or a ThreadLocalPtr gets destroyed.
|
||||||
|
auto lock_tree_map_cache = static_cast<
|
||||||
|
std::unordered_map<ColumnFamilyId, std::shared_ptr<locktree>>*>(ptr);
|
||||||
|
delete lock_tree_map_cache;
|
||||||
|
}
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
RangeTreeLockManager::RangeTreeLockManager(
|
||||||
|
std::shared_ptr<TransactionDBMutexFactory> mutex_factory)
|
||||||
|
: mutex_factory_(mutex_factory),
|
||||||
|
ltree_lookup_cache_(new ThreadLocalPtr(&UnrefLockTreeMapsCache)),
|
||||||
|
dlock_buffer_(10) {
|
||||||
|
ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_);
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize(
|
||||||
|
uint32_t target_size) {
|
||||||
|
dlock_buffer_.Resize(target_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::Resize(uint32_t target_size) {
|
||||||
|
SetRangeDeadlockInfoBufferSize(target_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<RangeDeadlockPath>
|
||||||
|
RangeTreeLockManager::GetRangeDeadlockInfoBuffer() {
|
||||||
|
return dlock_buffer_.PrepareBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<DeadlockPath> RangeTreeLockManager::GetDeadlockInfoBuffer() {
|
||||||
|
std::vector<DeadlockPath> res;
|
||||||
|
std::vector<RangeDeadlockPath> data = GetRangeDeadlockInfoBuffer();
|
||||||
|
// report left endpoints
|
||||||
|
for (auto it = data.begin(); it != data.end(); ++it) {
|
||||||
|
std::vector<DeadlockInfo> path;
|
||||||
|
|
||||||
|
for (auto it2 = it->path.begin(); it2 != it->path.end(); ++it2) {
|
||||||
|
path.push_back(
|
||||||
|
{it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice});
|
||||||
|
}
|
||||||
|
res.push_back(DeadlockPath(path, it->deadlock_time));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
// @brief Lock Escalation Callback function
|
||||||
|
//
|
||||||
|
// @param txnid Transaction whose locks got escalated
|
||||||
|
// @param lt Lock Tree where escalation is happening
|
||||||
|
// @param buffer Escalation result: list of locks that this transaction now
|
||||||
|
// owns in this lock tree.
|
||||||
|
// @param void* Callback context
|
||||||
|
void RangeTreeLockManager::on_escalate(TXNID txnid, const locktree* lt,
|
||||||
|
const range_buffer& buffer, void*) {
|
||||||
|
auto txn = (PessimisticTransaction*)txnid;
|
||||||
|
((RangeTreeLockTracker*)&txn->GetTrackedLocks())->ReplaceLocks(lt, buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
RangeTreeLockManager::~RangeTreeLockManager() {
|
||||||
|
autovector<void*> local_caches;
|
||||||
|
ltree_lookup_cache_->Scrape(&local_caches, nullptr);
|
||||||
|
for (auto cache : local_caches) {
|
||||||
|
delete static_cast<LockTreeMap*>(cache);
|
||||||
|
}
|
||||||
|
ltree_map_.clear(); // this will call release_lt() for all locktrees
|
||||||
|
ltm_.destroy();
|
||||||
|
}
|
||||||
|
|
||||||
|
RangeLockManagerHandle::Counters RangeTreeLockManager::GetStatus() {
|
||||||
|
LTM_STATUS_S ltm_status_test;
|
||||||
|
ltm_.get_status(<m_status_test);
|
||||||
|
Counters res;
|
||||||
|
|
||||||
|
// Searching status variable by its string name is how Toku's unit tests
|
||||||
|
// do it (why didn't they make LTM_ESCALATION_COUNT constant visible?)
|
||||||
|
// lookup keyname in status
|
||||||
|
for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) {
|
||||||
|
TOKU_ENGINE_STATUS_ROW status = <m_status_test.status[i];
|
||||||
|
if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
|
||||||
|
res.escalation_count = status->value.num;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) {
|
||||||
|
res.current_lock_memory = status->value.num;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<locktree> RangeTreeLockManager::MakeLockTreePtr(locktree* lt) {
|
||||||
|
locktree_manager* ltm = <m_;
|
||||||
|
return std::shared_ptr<locktree>(lt,
|
||||||
|
[ltm](locktree* p) { ltm->release_lt(p); });
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) {
|
||||||
|
uint32_t column_family_id = cfh->GetID();
|
||||||
|
|
||||||
|
InstrumentedMutexLock l(<ree_map_mutex_);
|
||||||
|
if (ltree_map_.find(column_family_id) == ltree_map_.end()) {
|
||||||
|
DICTIONARY_ID dict_id = {.dictid = column_family_id};
|
||||||
|
toku::comparator cmp;
|
||||||
|
cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator());
|
||||||
|
toku::locktree* ltree = ltm_.get_lt(dict_id, cmp,
|
||||||
|
/* on_create_extra*/ nullptr);
|
||||||
|
// This is ok to because get_lt has copied the comparator:
|
||||||
|
cmp.destroy();
|
||||||
|
|
||||||
|
ltree_map_.insert({column_family_id, MakeLockTreePtr(ltree)});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void RangeTreeLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cfh) {
|
||||||
|
uint32_t column_family_id = cfh->GetID();
|
||||||
|
// Remove lock_map for this column family. Since the lock map is stored
|
||||||
|
// as a shared ptr, concurrent transactions can still keep using it
|
||||||
|
// until they release their references to it.
|
||||||
|
|
||||||
|
// TODO what if one drops a column family while transaction(s) still have
|
||||||
|
// locks in it?
|
||||||
|
// locktree uses column family'c Comparator* as the criteria to do tree
|
||||||
|
// ordering. If the comparator is gone, we won't even be able to remove the
|
||||||
|
// elements from the locktree.
|
||||||
|
// A possible solution might be to remove everything right now:
|
||||||
|
// - wait until everyone traversing the locktree are gone
|
||||||
|
// - remove everything from the locktree.
|
||||||
|
// - some transactions may have acquired locks in their LockTracker objects.
|
||||||
|
// Arrange something so we don't blow up when they try to release them.
|
||||||
|
// - ...
|
||||||
|
// This use case (drop column family while somebody is using it) doesn't seem
|
||||||
|
// the priority, though.
|
||||||
|
|
||||||
|
{
|
||||||
|
InstrumentedMutexLock l(<ree_map_mutex_);
|
||||||
|
|
||||||
|
auto lock_maps_iter = ltree_map_.find(column_family_id);
|
||||||
|
assert(lock_maps_iter != ltree_map_.end());
|
||||||
|
ltree_map_.erase(lock_maps_iter);
|
||||||
|
} // lock_map_mutex_
|
||||||
|
|
||||||
|
autovector<void*> local_caches;
|
||||||
|
ltree_lookup_cache_->Scrape(&local_caches, nullptr);
|
||||||
|
for (auto cache : local_caches) {
|
||||||
|
delete static_cast<LockTreeMap*>(cache);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<locktree> RangeTreeLockManager::GetLockTreeForCF(
|
||||||
|
ColumnFamilyId column_family_id) {
|
||||||
|
// First check thread-local cache
|
||||||
|
if (ltree_lookup_cache_->Get() == nullptr) {
|
||||||
|
ltree_lookup_cache_->Reset(new LockTreeMap());
|
||||||
|
}
|
||||||
|
|
||||||
|
auto ltree_map_cache = static_cast<LockTreeMap*>(ltree_lookup_cache_->Get());
|
||||||
|
|
||||||
|
auto it = ltree_map_cache->find(column_family_id);
|
||||||
|
if (it != ltree_map_cache->end()) {
|
||||||
|
// Found lock map for this column family.
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not found in local cache, grab mutex and check shared LockMaps
|
||||||
|
InstrumentedMutexLock l(<ree_map_mutex_);
|
||||||
|
|
||||||
|
it = ltree_map_.find(column_family_id);
|
||||||
|
if (it == ltree_map_.end()) {
|
||||||
|
return nullptr;
|
||||||
|
} else {
|
||||||
|
// Found lock map. Store in thread-local cache and return.
|
||||||
|
ltree_map_cache->insert({column_family_id, it->second});
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct LOCK_PRINT_CONTEXT {
|
||||||
|
RangeLockManagerHandle::RangeLockStatus* data; // Save locks here
|
||||||
|
uint32_t cfh_id; // Column Family whose tree we are traversing
|
||||||
|
};
|
||||||
|
|
||||||
|
// Report left endpoints of the acquired locks
|
||||||
|
LockManager::PointLockStatus RangeTreeLockManager::GetPointLockStatus() {
|
||||||
|
PointLockStatus res;
|
||||||
|
LockManager::RangeLockStatus data = GetRangeLockStatus();
|
||||||
|
// report left endpoints
|
||||||
|
for (auto it = data.begin(); it != data.end(); ++it) {
|
||||||
|
auto& val = it->second;
|
||||||
|
res.insert({it->first, {val.start.slice, val.ids, val.exclusive}});
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void push_into_lock_status_data(void* param, const DBT* left,
|
||||||
|
const DBT* right, TXNID txnid_arg,
|
||||||
|
bool is_shared, TxnidVector* owners) {
|
||||||
|
struct LOCK_PRINT_CONTEXT* ctx = (LOCK_PRINT_CONTEXT*)param;
|
||||||
|
struct RangeLockInfo info;
|
||||||
|
|
||||||
|
info.exclusive = !is_shared;
|
||||||
|
|
||||||
|
deserialize_endpoint(left, &info.start);
|
||||||
|
deserialize_endpoint(right, &info.end);
|
||||||
|
|
||||||
|
if (txnid_arg != TXNID_SHARED) {
|
||||||
|
TXNID txnid = ((PessimisticTransaction*)txnid_arg)->GetID();
|
||||||
|
info.ids.push_back(txnid);
|
||||||
|
} else {
|
||||||
|
for (auto it : *owners) {
|
||||||
|
TXNID real_id = ((PessimisticTransaction*)it)->GetID();
|
||||||
|
info.ids.push_back(real_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctx->data->insert({ctx->cfh_id, info});
|
||||||
|
}
|
||||||
|
|
||||||
|
LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() {
|
||||||
|
LockManager::RangeLockStatus data;
|
||||||
|
{
|
||||||
|
InstrumentedMutexLock l(<ree_map_mutex_);
|
||||||
|
for (auto it : ltree_map_) {
|
||||||
|
LOCK_PRINT_CONTEXT ctx = {&data, it.first};
|
||||||
|
it.second->dump_locks((void*)&ctx, push_into_lock_status_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
#endif // OS_WIN
|
||||||
|
#endif // ROCKSDB_LITE
|
|
utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
@ -0,0 +1,130 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once
#ifndef ROCKSDB_LITE
#ifndef OS_WIN

// For DeadlockInfoBuffer:
#include "util/thread_local.h"
#include "utilities/transactions/lock/point/point_lock_manager.h"
#include "utilities/transactions/lock/range/range_lock_manager.h"

// Lock Tree library:
#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h"
#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h"
#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"

namespace ROCKSDB_NAMESPACE {

using namespace toku;

typedef DeadlockInfoBufferTempl<RangeDeadlockPath> RangeDeadlockInfoBuffer;

// A Range Lock Manager that uses PerconaFT's locktree library
class RangeTreeLockManager : public RangeLockManagerBase,
                             public RangeLockManagerHandle {
 public:
  LockManager* getLockManager() override { return this; }

  void AddColumnFamily(const ColumnFamilyHandle* cfh) override;
  void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override;

  void Resize(uint32_t) override;
  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;

  std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() override;
  void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override;

  // Get a lock on a range
  // @note only exclusive locks are currently supported (requesting a
  // non-exclusive lock will get an exclusive one)
  using LockManager::TryLock;
  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
                 const Endpoint& start_endp, const Endpoint& end_endp, Env* env,
                 bool exclusive) override;

  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
              Env* env) override;
  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
              const std::string& key, Env* env) override;
  void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&,
              const Endpoint&, Env*) override {
    // TODO: range unlock does nothing...
  }

  explicit RangeTreeLockManager(
      std::shared_ptr<TransactionDBMutexFactory> mutex_factory);

  ~RangeTreeLockManager() override;

  int SetMaxLockMemory(size_t max_lock_memory) override {
    return ltm_.set_max_lock_memory(max_lock_memory);
  }

  size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); }

  Counters GetStatus() override;

  bool IsPointLockSupported() const override {
    // One could have acquired a point lock (it is reduced to range lock)
    return true;
  }

  PointLockStatus GetPointLockStatus() override;

  // This is from LockManager
  LockManager::RangeLockStatus GetRangeLockStatus() override;

  // This has the same meaning as GetRangeLockStatus but is from
  // RangeLockManagerHandle
  RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override {
    return GetRangeLockStatus();
  }

  bool IsRangeLockSupported() const override { return true; }

  const LockTrackerFactory& GetLockTrackerFactory() const override {
    return RangeTreeLockTrackerFactory::Get();
  }

  // Get the locktree which stores locks for the Column Family with given cf_id
  std::shared_ptr<locktree> GetLockTreeForCF(ColumnFamilyId cf_id);

 private:
  toku::locktree_manager ltm_;

  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;

  // Map from cf_id to locktree*. Can only be accessed while holding the
  // ltree_map_mutex_. Must use a custom deleter that calls ltm_.release_lt
  using LockTreeMap =
      std::unordered_map<ColumnFamilyId, std::shared_ptr<locktree>>;
  LockTreeMap ltree_map_;

  InstrumentedMutex ltree_map_mutex_;

  // Per-thread cache of ltree_map_.
  // (uses the same approach as TransactionLockMgr::lock_maps_cache_)
  std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_;

  RangeDeadlockInfoBuffer dlock_buffer_;

  std::shared_ptr<locktree> MakeLockTreePtr(locktree* lt);
  static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key);

  // Callbacks
  static int on_create(locktree*, void*) { return 0; /* no error */ }
  static void on_destroy(locktree*) {}
  static void on_escalate(TXNID txnid, const locktree* lt,
                          const range_buffer& buffer, void* extra);
};

void serialize_endpoint(const Endpoint& endp, std::string* buf);
void wait_callback_for_locktree(void* cdata, lock_wait_infos* infos);

}  // namespace ROCKSDB_NAMESPACE
#endif  // OS_WIN
#endif  // ROCKSDB_LITE
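As a usage sketch (not part of the patch), the interface above can be exercised as follows. A valid PessimisticTransaction*, ColumnFamilyId and Env* are assumed to come from an already-opened TransactionDB; only the two-argument Endpoint construction already used elsewhere in this patch is relied upon, and the /*inf_suffix=*/ parameter name is an assumption.

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"

namespace ROCKSDB_NAMESPACE {

// Lock the closed key range ["a", "m"] for txn. A point lock would simply use
// the same Endpoint for both ends. Per the @note above, the lock is always
// taken as exclusive.
Status LockUserRange(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
                     ColumnFamilyId cf_id, Env* env) {
  Endpoint start("a", /*inf_suffix=*/false);
  Endpoint end("m", /*inf_suffix=*/false);
  return mgr->TryLock(txn, cf_id, start, end, env, /*exclusive=*/true);
}

}  // namespace ROCKSDB_NAMESPACE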
utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
@ -0,0 +1,156 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#ifndef OS_WIN

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"

namespace ROCKSDB_NAMESPACE {

RangeLockList *RangeTreeLockTracker::getOrCreateList() {
  if (range_list_) return range_list_.get();

  // Doesn't exist, create
  range_list_.reset(new RangeLockList());
  return range_list_.get();
}

void RangeTreeLockTracker::Track(const PointLockRequest &lock_req) {
  DBT key_dbt;
  std::string key;
  serialize_endpoint(Endpoint(lock_req.key, false), &key);
  toku_fill_dbt(&key_dbt, key.data(), key.size());
  RangeLockList *rl = getOrCreateList();
  rl->Append(lock_req.column_family_id, &key_dbt, &key_dbt);
}

void RangeTreeLockTracker::Track(const RangeLockRequest &lock_req) {
  DBT start_dbt, end_dbt;
  std::string start_key, end_key;

  serialize_endpoint(lock_req.start_endp, &start_key);
  serialize_endpoint(lock_req.end_endp, &end_key);

  toku_fill_dbt(&start_dbt, start_key.data(), start_key.size());
  toku_fill_dbt(&end_dbt, end_key.data(), end_key.size());

  RangeLockList *rl = getOrCreateList();
  rl->Append(lock_req.column_family_id, &start_dbt, &end_dbt);
}

PointLockStatus RangeTreeLockTracker::GetPointLockStatus(
    ColumnFamilyId /*cf_id*/, const std::string & /*key*/) const {
  // This function is not expected to be called as RangeTreeLockTracker::
  // IsPointLockSupported() returns false. Return the status which indicates
  // the point is not locked.
  PointLockStatus p;
  p.locked = false;
  p.exclusive = true;
  p.seq = 0;
  return p;
}

void RangeTreeLockTracker::Clear() { range_list_.reset(); }

void RangeLockList::Append(ColumnFamilyId cf_id, const DBT *left_key,
                           const DBT *right_key) {
  MutexLock l(&mutex_);
  // Only the transaction owner thread calls this function.
  // The same thread does the lock release, so we can be certain nobody is
  // releasing the locks concurrently.
  assert(!releasing_locks_.load());
  auto it = buffers_.find(cf_id);
  if (it == buffers_.end()) {
    // create a new one
    it = buffers_.emplace(cf_id, std::make_shared<toku::range_buffer>()).first;
    it->second->create();
  }
  it->second->append(left_key, right_key);
}

void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr,
                                 PessimisticTransaction *txn,
                                 bool all_trx_locks) {
  {
    MutexLock l(&mutex_);
    // The lt->release_locks() call below will walk range_list->buffer_. We
    // need to prevent the lock escalation callback from replacing
    // range_list->buffer_ while we are doing that.
    //
    // An additional complication here is the internal mutex(es) in the
    // locktree (let's call them latches):
    // - Lock escalation first obtains latches on the lock tree
    // - Then, it calls RangeTreeLockManager::on_escalate to replace the
    //   transaction's range_list->buffer_. => Access to that buffer must be
    //   synchronized, so it will want to acquire the range_list->mutex_.
    //
    // While in this function we want to do the reverse:
    // - Acquire range_list->mutex_ to prevent access to the range_list.
    // - Then, the lt->release_locks() call will walk through the range_list
    // - and acquire latches on parts of the lock tree to remove locks from
    //   it.
    //
    // How do we avoid the deadlock? The idea is that here we set
    // releasing_locks_=true, and release the mutex.
    // All other users of the range_list must:
    // - Acquire the mutex, then check that releasing_locks_=false.
    //   (the code in this function doesn't do that as there's only one thread
    //   that releases the transaction's locks)
    releasing_locks_.store(true);
  }

  for (auto it : buffers_) {
    // Don't try to call release_locks() if the buffer is empty! If we are
    // not holding any locks, the lock tree might be in the STO-mode with
    // another transaction, and our attempt to release an empty set of locks
    // will cause an assertion failure.
    if (it.second->get_num_ranges()) {
      auto lt_ptr = mgr->GetLockTreeForCF(it.first);
      toku::locktree *lt = lt_ptr.get();

      lt->release_locks((TXNID)txn, it.second.get(), all_trx_locks);

      it.second->destroy();
      it.second->create();

      toku::lock_request::retry_all_lock_requests(lt,
                                                  wait_callback_for_locktree);
    }
  }

  Clear();
  releasing_locks_.store(false);
}

void RangeLockList::ReplaceLocks(const toku::locktree *lt,
                                 const toku::range_buffer &buffer) {
  MutexLock l(&mutex_);
  if (releasing_locks_.load()) {
    // Do nothing. The transaction is releasing its locks, so it will not care
    // about having a correct list of ranges. (In TokuDB,
    // toku_db_txn_escalate_callback() makes use of this property, too)
    return;
  }

  ColumnFamilyId cf_id = (ColumnFamilyId)lt->get_dict_id().dictid;

  auto it = buffers_.find(cf_id);
  it->second->destroy();
  it->second->create();

  toku::range_buffer::iterator iter(&buffer);
  toku::range_buffer::iterator::record rec;
  while (iter.current(&rec)) {
    it->second->append(rec.get_left_key(), rec.get_right_key());
    iter.next();
  }
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // OS_WIN
#endif  // ROCKSDB_LITE
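The following sketch (not part of the patch) shows the tracking path implemented above from a caller's perspective: a point lock is recorded as a single-point range, a range request as its two endpoints. Only column_family_id, key, start_endp and end_endp are taken from this file; the default-constructibility of PointLockRequest and the brace-initialization order of RangeLockRequest (column_family_id, start_endp, end_endp) are assumptions.

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"

namespace ROCKSDB_NAMESPACE {

void TrackExampleLocks(RangeTreeLockTracker* tracker, ColumnFamilyId cf_id) {
  // A point lock: Track() serializes the key once and appends it as both the
  // left and the right endpoint of the range.
  PointLockRequest point_req;
  point_req.column_family_id = cf_id;
  point_req.key = "user_0042";
  tracker->Track(point_req);

  // A range lock on ["a", "m"]: both endpoints are serialized separately.
  RangeLockRequest range_req{cf_id, Endpoint("a", false), Endpoint("m", false)};
  tracker->Track(range_req);
}

}  // namespace ROCKSDB_NAMESPACE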
utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
@ -0,0 +1,146 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "util/mutexlock.h"
#include "utilities/transactions/lock/lock_tracker.h"
#include "utilities/transactions/pessimistic_transaction.h"

// Range Locking:
#include "lib/locktree/lock_request.h"
#include "lib/locktree/locktree.h"

namespace ROCKSDB_NAMESPACE {

class RangeTreeLockManager;

// Storage for locks that are currently held by a transaction.
//
// Locks are kept in toku::range_buffer because toku::locktree::release_locks()
// accepts that as an argument.
//
// Note: the list of locks may differ slightly from the contents of the lock
// tree, due to concurrency between lock acquisition, lock release, and lock
// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details.
// This property is currently harmless.
//
// Append() and ReleaseLocks() are not thread-safe, as they are expected to be
// called only by the owner transaction. ReplaceLocks() is safe to call from
// other threads.
class RangeLockList {
 public:
  ~RangeLockList() { Clear(); }

  RangeLockList() : releasing_locks_(false) {}

  void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key);
  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
                    bool all_trx_locks);
  void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer);

 private:
  void Clear() {
    for (auto it : buffers_) {
      it.second->destroy();
    }
    buffers_.clear();
  }

  std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::range_buffer>>
      buffers_;
  port::Mutex mutex_;
  std::atomic<bool> releasing_locks_;
};

// A LockTracker-based object that is used together with RangeTreeLockManager.
class RangeTreeLockTracker : public LockTracker {
 public:
  RangeTreeLockTracker() : range_list_(nullptr) {}

  RangeTreeLockTracker(const RangeTreeLockTracker&) = delete;
  RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete;

  void Track(const PointLockRequest&) override;
  void Track(const RangeLockRequest&) override;

  bool IsPointLockSupported() const override {
    // This indicates that we don't implement GetPointLockStatus()
    return false;
  }
  bool IsRangeLockSupported() const override { return true; }

  // A not-supported dummy implementation.
  UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
    return UntrackStatus::NOT_TRACKED;
  }

  UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override {
    return UntrackStatus::NOT_TRACKED;
  }

  // "If this method is not supported, leave it as a no-op."
  void Merge(const LockTracker&) override {}

  // "If this method is not supported, leave it as a no-op."
  void Subtract(const LockTracker&) override {}

  void Clear() override;

  // "If this method is not supported, returns nullptr."
  virtual LockTracker* GetTrackedLocksSinceSavePoint(
      const LockTracker&) const override {
    return nullptr;
  }

  PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
                                     const std::string& key) const override;

  // The return value is only used for tests
  uint64_t GetNumPointLocks() const override { return 0; }

  ColumnFamilyIterator* GetColumnFamilyIterator() const override {
    return nullptr;
  }

  KeyIterator* GetKeyIterator(
      ColumnFamilyId /*column_family_id*/) const override {
    return nullptr;
  }

  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
                    bool all_trx_locks) {
    if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks);
  }

  void ReplaceLocks(const toku::locktree* lt,
                    const toku::range_buffer& buffer) {
    // range_list_ cannot be NULL here
    range_list_->ReplaceLocks(lt, buffer);
  }

 private:
  RangeLockList* getOrCreateList();
  std::unique_ptr<RangeLockList> range_list_;
};

class RangeTreeLockTrackerFactory : public LockTrackerFactory {
 public:
  static const RangeTreeLockTrackerFactory& Get() {
    static const RangeTreeLockTrackerFactory instance;
    return instance;
  }

  LockTracker* Create() const override { return new RangeTreeLockTracker(); }

 private:
  RangeTreeLockTrackerFactory() {}
};

}  // namespace ROCKSDB_NAMESPACE
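A small sketch (not part of the patch) of how a tracker is obtained through the factory singleton above, which is what RangeTreeLockManager::GetLockTrackerFactory() hands out to its callers.

#include <cassert>
#include <memory>

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"

namespace ROCKSDB_NAMESPACE {

std::unique_ptr<LockTracker> MakeRangeTracker() {
  const LockTrackerFactory& factory = RangeTreeLockTrackerFactory::Get();
  std::unique_ptr<LockTracker> tracker(factory.Create());
  // The tracker advertises range support, but point-lock queries are stubbed.
  assert(tracker->IsRangeLockSupported());
  assert(!tracker->IsPointLockSupported());
  return tracker;
}

}  // namespace ROCKSDB_NAMESPACE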
utilities/transactions/transaction_base.h
@ -250,6 +250,8 @@ class TransactionBaseImpl : public Transaction {
  WriteBatch* GetCommitTimeWriteBatch() override;

  LockTracker& GetTrackedLocks() { return *tracked_locks_; }

 protected:
  // Add a key to the list of tracked keys.
  //
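As an illustration (not part of the patch) of what the new accessor enables, a lock manager that installed the range tracker can reach a transaction's tracked locks when releasing them. The downcast and the call pattern below are illustrative assumptions: they presume the tracker was created by RangeTreeLockTrackerFactory, as in the header added above.

#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"

namespace ROCKSDB_NAMESPACE {

void ReleaseAllRangeLocks(RangeTreeLockManager* mgr,
                          PessimisticTransaction* txn) {
  // GetTrackedLocks() exposes the LockTracker that was produced by the
  // factory the lock manager advertised via GetLockTrackerFactory().
  auto& tracker = static_cast<RangeTreeLockTracker&>(txn->GetTrackedLocks());
  tracker.ReleaseLocks(mgr, txn, /*all_trx_locks=*/true);
}

}  // namespace ROCKSDB_NAMESPACE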