diff --git a/CMakeLists.txt b/CMakeLists.txt index ab5c136acc..1d14083492 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,10 @@ set(SOURCES utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/transactions/optimistic_transaction_impl.cc utilities/transactions/optimistic_transaction_db_impl.cc + utilities/transactions/transaction_impl.cc + utilities/transactions/transaction_db_impl.cc + utilities/transactions/transaction_lock_mgr.cc + utilities/transactions/transaction_util.cc utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -333,6 +337,7 @@ set(TESTS utilities/spatialdb/spatial_db_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc + utilities/transactions/transaction_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc ) diff --git a/HISTORY.md b/HISTORY.md index d590e3f911..d0eaac53f4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ ## 3.12.0 (7/2/2015) ### New Features +* Added experimental support for pessimistic transactions. See include/rocksdb/utilities/transaction.h for more info. * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) * Added a cache for individual rows. See DBOptions::row_cache for more info. 
diff --git a/Makefile b/Makefile index 4dbc0bfd50..2a956766ac 100644 --- a/Makefile +++ b/Makefile @@ -304,7 +304,8 @@ TESTS = \ write_callback_test \ heap_test \ compact_on_deletion_collector_test \ - compaction_job_stats_test + compaction_job_stats_test \ + transaction_test SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) @@ -919,6 +920,9 @@ write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS) heap_test: util/heap_test.o $(GTEST) $(AM_LINK) +transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sst_dump: tools/sst_dump.o $(LIBOBJECTS) $(AM_LINK) diff --git a/db/db_bench.cc b/db/db_bench.cc index 2332afdbf7..3c47a7839e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -54,7 +54,8 @@ int main() { #include "rocksdb/slice_transform.h" #include "rocksdb/perf_context.h" #include "rocksdb/utilities/flashcache.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "port/port.h" #include "port/stack_trace.h" @@ -448,10 +449,14 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Ignored. Left here for backward compatibility"); -DEFINE_bool(transaction_db, false, +DEFINE_bool(optimistic_transaction_db, false, "Open a OptimisticTransactionDB instance. " "Required for randomtransaction benchmark."); +DEFINE_bool(transaction_db, false, + "Open a TransactionDB instance. " + "Required for randomtransaction benchmark."); + DEFINE_uint64(transaction_sets, 2, "Number of keys each transaction will " "modify (use in RandomTransaction only). 
Max: 9999"); @@ -919,7 +924,7 @@ static void AppendWithSpace(std::string* str, Slice msg) { struct DBWithColumnFamilies { std::vector cfh; DB* db; - OptimisticTransactionDB* txn_db; + OptimisticTransactionDB* opt_txn_db; std::atomic num_created; // Need to be updated after all the // new entries in cfh are set. size_t num_hot; // Number of column families to be queried at each moment. @@ -927,7 +932,7 @@ struct DBWithColumnFamilies { // Column families will be created and used to be queried. port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() - DBWithColumnFamilies() : db(nullptr), txn_db(nullptr) { + DBWithColumnFamilies() : db(nullptr), opt_txn_db(nullptr) { cfh.clear(); num_created = 0; num_hot = 0; @@ -936,7 +941,7 @@ struct DBWithColumnFamilies { DBWithColumnFamilies(const DBWithColumnFamilies& other) : cfh(other.cfh), db(other.db), - txn_db(other.txn_db), + opt_txn_db(other.opt_txn_db), num_created(other.num_created.load()), num_hot(other.num_hot) {} @@ -944,9 +949,9 @@ struct DBWithColumnFamilies { std::for_each(cfh.begin(), cfh.end(), [](ColumnFamilyHandle* cfhi) { delete cfhi; }); cfh.clear(); - if (txn_db) { - delete txn_db; - txn_db = nullptr; + if (opt_txn_db) { + delete opt_txn_db; + opt_txn_db = nullptr; } else { delete db; } @@ -2445,11 +2450,19 @@ class Benchmark { if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh, &db->db); - } else if (FLAGS_transaction_db) { + } else if (FLAGS_optimistic_transaction_db) { s = OptimisticTransactionDB::Open(options, db_name, column_families, - &db->cfh, &db->txn_db); + &db->cfh, &db->opt_txn_db); if (s.ok()) { - db->db = db->txn_db->GetBaseDB(); + db->db = db->opt_txn_db->GetBaseDB(); + } + } else if (FLAGS_transaction_db) { + TransactionDB* ptr; + TransactionDBOptions txn_db_options; + s = TransactionDB::Open(options, txn_db_options, db_name, + column_families, &db->cfh, &ptr); + if (s.ok()) { + db->db = ptr; } } else { s = DB::Open(options, db_name, 
column_families, &db->cfh, &db->db); @@ -2459,11 +2472,19 @@ class Benchmark { db->num_hot = num_hot; } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); - } else if (FLAGS_transaction_db) { - s = OptimisticTransactionDB::Open(options, db_name, &db->txn_db); + } else if (FLAGS_optimistic_transaction_db) { + s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db); if (s.ok()) { - db->db = db->txn_db->GetBaseDB(); + db->db = db->opt_txn_db->GetBaseDB(); } + } else if (FLAGS_transaction_db) { + TransactionDB* ptr; + TransactionDBOptions txn_db_options; + s = TransactionDB::Open(options, txn_db_options, db_name, &ptr); + if (s.ok()) { + db->db = ptr; + } + } else { s = DB::Open(options, db_name, &db->db); } @@ -3530,7 +3551,6 @@ class Benchmark { uint64_t transactions_aborted = 0; Status s; uint64_t num_prefix_ranges = FLAGS_transaction_sets; - bool use_txn = FLAGS_transaction_db; if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) { fprintf(stderr, "invalid value for transaction_sets\n"); @@ -3545,12 +3565,17 @@ class Benchmark { } while (!duration.Done(1)) { - OptimisticTransaction* txn = nullptr; + Transaction* txn = nullptr; WriteBatch* batch = nullptr; - if (use_txn) { - txn = db_.txn_db->BeginTransaction(write_options_); + if (FLAGS_optimistic_transaction_db) { + txn = db_.opt_txn_db->BeginTransaction(write_options_); assert(txn); + } else if (FLAGS_transaction_db) { + TransactionDB* txn_db = reinterpret_cast(db_.db); + TransactionOptions txn_options; + txn_options.expiration = 10000000; + txn = txn_db->BeginTransaction(write_options_, txn_options); } else { batch = new WriteBatch(); } @@ -3558,6 +3583,7 @@ class Benchmark { // pick a random number to use to increment a key in each set uint64_t incr = (thread->rand.Next() % 100) + 1; + bool failed = false; // For each set, pick a key at random and increment it for (uint8_t i = 0; i < num_prefix_ranges; i++) { uint64_t int_value; @@ -3572,8 +3598,8 @@ class Benchmark { 
std::string full_key = std::string(prefix_buf) + base_key.ToString(); Slice key(full_key); - if (use_txn) { - s = txn->Get(read_options, key, &value); + if (txn) { + s = txn->GetForUpdate(read_options, key, &value); } else { s = db->Get(read_options, key, &value); } @@ -3599,15 +3625,23 @@ class Benchmark { } std::string sum = ToString(int_value + incr); - if (use_txn) { - txn->Put(key, sum); + if (txn) { + s = txn->Put(key, sum); + if (!s.ok()) { + failed = true; + break; + } } else { batch->Put(key, sum); } } - if (use_txn) { - s = txn->Commit(); + if (txn) { + if (failed) { + txn->Rollback(); + } else { + s = txn->Commit(); + } } else { s = db->Write(write_options_, batch); } @@ -3616,7 +3650,7 @@ class Benchmark { // Ideally, we'd want to run this stress test with enough concurrency // on a small enough set of keys that we get some failed transactions // due to conflicts. - if (use_txn && s.IsBusy()) { + if (txn && s.IsBusy()) { transactions_aborted++; } else { fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str()); @@ -3635,7 +3669,7 @@ class Benchmark { } char msg[100]; - if (use_txn) { + if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) { snprintf(msg, sizeof(msg), "( transactions:%" PRIu64 " aborts:%" PRIu64 ")", transactions_done, transactions_aborted); @@ -3653,7 +3687,7 @@ class Benchmark { // Since each iteration of RandomTransaction() incremented a key in each set // by the same value, the sum of the keys in each set should be the same. void RandomTransactionVerify() { - if (!FLAGS_transaction_db) { + if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) { // transactions not used, nothing to verify. 
return; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 7652a66d61..9837ed3b4a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3686,7 +3686,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, mutex_.Lock(); } - if (db_options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && !status.IsBusy() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } @@ -3944,6 +3944,22 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { return GetAndRefSuperVersion(cfd); } +// REQUIRED: mutex is NOT held +SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) { + ColumnFamilyData* cfd; + { + InstrumentedMutexLock l(&mutex_); + auto column_family_set = versions_->GetColumnFamilySet(); + cfd = column_family_set->GetColumnFamily(column_family_id); + } + + if (!cfd) { + return nullptr; + } + + return GetAndRefSuperVersion(cfd); +} + void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); @@ -3974,6 +3990,22 @@ void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, ReturnAndCleanupSuperVersion(cfd, sv); } +// REQUIRED: Mutex should NOT be held. +void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id, + SuperVersion* sv) { + ColumnFamilyData* cfd; + { + InstrumentedMutexLock l(&mutex_); + auto column_family_set = versions_->GetColumnFamilySet(); + cfd = column_family_set->GetColumnFamily(column_family_id); + } + + // If SuperVersion is held, and we successfully fetched a cfd using + // GetAndRefSuperVersion(), it must still exist. + assert(cfd != nullptr); + ReturnAndCleanupSuperVersion(cfd, sv); +} + // REQUIRED: this function should only be called on the write thread or if the // mutex is held. 
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { @@ -3986,6 +4018,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { return cf_memtables->GetColumnFamilyHandle(); } +// REQUIRED: mutex is NOT held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + InstrumentedMutexLock l(&mutex_); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, bool include_memtable) { diff --git a/db/db_impl.h b/db/db_impl.h index d2f4b868dc..6a7d1c3f55 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -326,6 +326,9 @@ class DBImpl : public DB { // mutex is held. SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); + // Same as above, should be called without mutex held and not on write thread. + SuperVersion* GetAndRefSuperVersionUnlocked(uint32_t column_family_id); + // Un-reference the super version and return it to thread local cache if // needed. If it is the last reference of the super version. Clean it up // after un-referencing it. @@ -336,11 +339,18 @@ class DBImpl : public DB { // REQUIRED: this function should only be called on the write thread. void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv); + // Same as above, should be called without mutex held and not on write thread. + void ReturnAndCleanupSuperVersionUnlocked(uint32_t colun_family_id, + SuperVersion* sv); + // REQUIRED: this function should only be called on the write thread or if the // mutex is held. Return value only valid until next call to this function or // mutex is released. ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + // Same as above, should be called without mutex held and not on write thread.
+ ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + protected: Env* const env_; const std::string dbname_; diff --git a/examples/Makefile b/examples/Makefile index 1535d9b29d..0757f5f032 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ include ../make_config.mk .PHONY: clean -all: simple_example column_families_example compact_files_example c_simple_example transaction_example +all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -19,8 +19,11 @@ compact_files_example: compact_files_example.cc c_simple_example: c_simple_example.o $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) +optimistic_transaction_example: optimistic_transaction_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + transaction_example: transaction_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./transaction_example + rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc new file mode 100644 index 0000000000..e9ab0e5ee2 --- /dev/null +++ b/examples/optimistic_transaction_example.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + +using namespace rocksdb; + +std::string kDBPath = "/tmp/rocksdb_transaction_example"; + +int main() { + // open DB + Options options; + options.create_if_missing = true; + DB* db; + OptimisticTransactionDB* txn_db; + + Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); + assert(s.ok()); + db = txn_db->GetBaseDB(); + + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + //////////////////////////////////////////////////////// + // + // Simple OptimisticTransaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + + // Start a transaction + Transaction* txn = txn_db->BeginTransaction(write_options); + assert(txn); + + // Read a key in this transaction + s = txn->Get(read_options, "abc", &value); + assert(s.IsNotFound()); + + // Write a key in this transaction + txn->Put("abc", "def"); + + // Read a key OUTSIDE this transaction. Does not affect txn. + s = db->Get(read_options, "abc", &value); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. If we wrote key 'abc' + // here, the transaction would fail to commit. 
+ s = db->Put(write_options, "xyz", "zzz"); + + // Commit transaction + s = txn->Commit(); + assert(s.ok()); + delete txn; + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + + // Set a snapshot at start of transaction by setting set_snapshot=true + txn_options.set_snapshot = true; + txn = txn_db->BeginTransaction(write_options, txn_options); + + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write a key OUTSIDE of transaction + db->Put(write_options, "abc", "xyz"); + + // Read a key using the snapshot + read_options.snapshot = snapshot; + s = txn->GetForUpdate(read_options, "abc", &value); + assert(value == "def"); + + // Attempt to commit transaction + s = txn->Commit(); + + // Transaction could not commit since the write outside of the txn conflicted + // with the read! + assert(s.IsBusy()); + + delete txn; + // Clear snapshot from read options since it is no longer valid + read_options.snapshot = nullptr; + snapshot = nullptr; + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + + // In this example, we set the snapshot multiple times. This is probably + // only necessary if you have very strict isolation requirements to + // implement. 
+ + // Set a snapshot at start of transaction + txn_options.set_snapshot = true; + txn = txn_db->BeginTransaction(write_options, txn_options); + + // Do some reads and writes to key "x" + read_options.snapshot = db->GetSnapshot(); + s = txn->Get(read_options, "x", &value); + txn->Put("x", "x"); + + // Do a write outside of the transaction to key "y" + s = db->Put(write_options, "y", "y"); + + // Set a new snapshot in the transaction + txn->SetSnapshot(); + read_options.snapshot = db->GetSnapshot(); + + // Do some reads and writes to key "y" + s = txn->GetForUpdate(read_options, "y", &value); + txn->Put("y", "y"); + + // Commit. Since the snapshot was advanced, the write done outside of the + // transaction does not prevent this transaction from Committing. + s = txn->Commit(); + assert(s.ok()); + delete txn; + // Clear snapshot from read options since it is no longer valid + read_options.snapshot = nullptr; + + // Cleanup + delete txn_db; + DestroyDB(kDBPath, options); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index 02f309c595..a7d5061293 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -8,8 +8,8 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "rocksdb/utilities/optimistic_transaction.h" -#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" using namespace rocksdb; @@ -18,17 +18,16 @@ std::string kDBPath = "/tmp/rocksdb_transaction_example"; int main() { // open DB Options options; + TransactionDBOptions txn_db_options; options.create_if_missing = true; - DB* db; - OptimisticTransactionDB* txn_db; + TransactionDB* txn_db; - Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db); + Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db); assert(s.ok()); - db = txn_db->GetBaseDB(); 
WriteOptions write_options; ReadOptions read_options; - OptimisticTransactionOptions txn_options; + TransactionOptions txn_options; std::string value; //////////////////////////////////////////////////////// @@ -38,7 +37,7 @@ int main() { //////////////////////////////////////////////////////// // Start a transaction - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); assert(txn); // Read a key in this transaction @@ -46,15 +45,16 @@ int main() { assert(s.IsNotFound()); // Write a key in this transaction - txn->Put("abc", "def"); + s = txn->Put("abc", "def"); + assert(s.ok()); // Read a key OUTSIDE this transaction. Does not affect txn. - s = db->Get(read_options, "abc", &value); + s = txn_db->Get(read_options, "abc", &value); // Write a key OUTSIDE of this transaction. // Does not affect txn since this is an unrelated key. If we wrote key 'abc' // here, the transaction would fail to commit. - s = db->Put(write_options, "xyz", "zzz"); + s = txn_db->Put(write_options, "xyz", "zzz"); // Commit transaction s = txn->Commit(); @@ -75,20 +75,17 @@ int main() { const Snapshot* snapshot = txn->GetSnapshot(); // Write a key OUTSIDE of transaction - db->Put(write_options, "abc", "xyz"); + s = txn_db->Put(write_options, "abc", "xyz"); + assert(s.ok()); - // Read a key using the snapshot + // Attempt to read a key using the snapshot. This will fail since + // the previous write outside this txn conflicts with this read. read_options.snapshot = snapshot; s = txn->GetForUpdate(read_options, "abc", &value); - assert(value == "def"); - - // Attempt to commit transaction - s = txn->Commit(); - - // Transaction could not commit since the write outside of the txn conflicted - // with the read! 
assert(s.IsBusy()); + txn->Rollback(); + delete txn; // Clear snapshot from read options since it is no longer valid read_options.snapshot = nullptr; @@ -110,23 +107,28 @@ int main() { txn = txn_db->BeginTransaction(write_options, txn_options); // Do some reads and writes to key "x" - read_options.snapshot = db->GetSnapshot(); + read_options.snapshot = txn_db->GetSnapshot(); s = txn->Get(read_options, "x", &value); txn->Put("x", "x"); // Do a write outside of the transaction to key "y" - s = db->Put(write_options, "y", "y"); + s = txn_db->Put(write_options, "y", "y"); // Set a new snapshot in the transaction txn->SetSnapshot(); - read_options.snapshot = db->GetSnapshot(); + txn->SetSavePoint(); + read_options.snapshot = txn_db->GetSnapshot(); // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. s = txn->GetForUpdate(read_options, "y", &value); txn->Put("y", "y"); - // Commit. Since the snapshot was advanced, the write done outside of the - // transaction does not prevent this transaction from Committing. + // Decide we want to revert the last write from this transaction. + txn->RollbackToSavePoint(); + + // Commit. s = txn->Commit(); assert(s.ok()); delete txn; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index d99fabac0d..888f212665 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -80,6 +80,10 @@ class Status { static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kBusy, msg, msg2); } + static Status TimedOut() { return Status(kTimedOut); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } // Returns true iff the status indicates success. 
bool ok() const { return code() == kOk; } diff --git a/include/rocksdb/utilities/optimistic_transaction.h b/include/rocksdb/utilities/optimistic_transaction.h deleted file mode 100644 index c3f18f3563..0000000000 --- a/include/rocksdb/utilities/optimistic_transaction.h +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/comparator.h" -#include "rocksdb/db.h" -#include "rocksdb/status.h" - -namespace rocksdb { - -class OptimisticTransactionDB; -class WriteBatchWithIndex; - -// Provides BEGIN/COMMIT/ROLLBACK transactions for batched writes. -// -// The current implementation provides optimistic concurrency control. -// Transactional reads/writes will not block other operations in the -// db. At commit time, the batch of writes will only be written if there have -// been no other writes to any keys read or written by this transaction. -// Otherwise, the commit will return an error. -// -// A new optimistic transaction is created by calling -// OptimisticTransactionDB::BeginTransaction(). -// Only reads/writes done through this transaction object will be a part of the -// transaction. Any other reads/writes will not be tracked by this -// transaction. -// -// For example, reading data via OptimisticTransaction::GetForUpdate() will -// prevent the transaction from committing if this key is written to outside of -// this transaction. Any reads done via DB::Get() will not be checked for -// conflicts at commit time. -// -// It is up to the caller to synchronize access to this object. -// -// See examples/transaction_example.cc for some simple examples. 
-// -// TODO(agiardullo): Not yet implemented: -// -Transaction support for iterators -// -Ensuring memtable holds large enough history to check for conflicts -// -Support for using Transactions with DBWithTTL - -// Options to use when starting an Optimistic Transaction -struct OptimisticTransactionOptions { - // Setting set_snapshot=true is the same as calling SetSnapshot(). - bool set_snapshot = false; - - // Should be set if the DB has a non-default comparator. - // See comment in WriteBatchWithIndex constructor. - const Comparator* cmp = BytewiseComparator(); -}; - -class OptimisticTransaction { - public: - virtual ~OptimisticTransaction() {} - - // If SetSnapshot() is not called, all keys read/written through this - // transaction will only be committed if there have been no writes to - // these keys outside of this transaction *since the time each key - // was first read/written* in this transaction. - // - // When SetSnapshot() is called, this transaction will create a Snapshot - // to use for conflict validation of all future operations in the transaction. - // All future keys read/written will only be committed if there have been - // no writes to these keys outside of this transaction *since SetSnapshot() - // was called.* Otherwise, Commit() will not succeed. - // - // It is not necessary to call SetSnapshot() if you only care about other - // writes happening on keys *after* they have first been read/written in this - // transaction. However, you should set a snapshot if you are concerned - // with any other writes happening since a particular time (such as - // the start of the transaction). - // - // SetSnapshot() may be called multiple times if you would like to change - // the snapshot used for different operations in this transaction. - // - // Calling SetSnapshot will not affect the version of Data returned by Get() - // methods. See OptimisticTransaction::Get() for more details. 
- // - // TODO(agiardullo): add better documentation here once memtable change are - // committed - virtual void SetSnapshot() = 0; - - // Returns the Snapshot created by the last call to SetSnapshot(). - // - // REQUIRED: The returned Snapshot is only valid up until the next time - // SetSnapshot() is called or the OptimisticTransaction is deleted. - virtual const Snapshot* GetSnapshot() const = 0; - - // Write all batched keys to the db atomically if there have not been any - // other writes performed on the keys read/written by this transaction. - // - // Currently, Commit() only checks the memtables to verify that there are no - // other writes to these keys. If the memtable's history is not long - // enough to verify that there are no conflicts, Commit() will return - // a non-OK status. - // - // Returns OK on success, non-OK on failure. - virtual Status Commit() = 0; - - // Discard all batched writes in this transaction. - virtual void Rollback() = 0; - - // This function is similar to DB::Get() except it will also read pending - // changes in this transaction. - // - // If read_options.snapshot is not set, the current version of the key will - // be read. Calling SetSnapshot() does not affect the version of the data - // returned. - // - // Note that setting read_options.snapshot will affect what is read from the - // DB but will NOT change which keys are read from this transaction (the keys - // in this transaction do not yet belong to any snapshot and will be fetched - // regardless). 
- // - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value) = 0; - - virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value) = 0; - - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) = 0; - - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) = 0; - - // Read this key and ensure that this transaction will only - // be able to be committed if this key is not written outside this - // transaction after it has first been read (or after the snapshot if a - // snapshot is set in this transaction). - - // This function is similar to OptimisticTransaction::Get() except it will - // affect whether this transaction will be able to be committed. - virtual Status GetForUpdate(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key, std::string* value) = 0; - - virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, - std::string* value) = 0; - - virtual std::vector MultiGetForUpdate( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) = 0; - - virtual std::vector MultiGetForUpdate( - const ReadOptions& options, const std::vector& keys, - std::vector* values) = 0; - - // Put, Merge, and Delete behave similarly to their corresponding - // functions in WriteBatch. In addition, this transaction will only - // be able to be committed if these keys are not written outside of this - // transaction after they have been written by this transaction (or after the - // snapshot if a snapshot is set in this transaction). 
- virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void Put(const Slice& key, const Slice& value) = 0; - virtual void Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) = 0; - virtual void Put(const SliceParts& key, const SliceParts& value) = 0; - - virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void Merge(const Slice& key, const Slice& value) = 0; - - virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0; - virtual void Delete(const Slice& key) = 0; - virtual void Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; - virtual void Delete(const SliceParts& key) = 0; - - // PutUntracked() will write a Put to the batch of operations to be committed - // in this transaction. This write will only happen if this transaction - // gets committed successfully. But unlike OptimisticTransaction::Put(), - // no conflict checking will be done for this key. So any other writes to - // this key outside of this transaction will not prevent this transaction from - // committing. 
- virtual void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; - virtual void PutUntracked(const Slice& key, const Slice& value) = 0; - virtual void PutUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key, const SliceParts& value) = 0; - virtual void PutUntracked(const SliceParts& key, const SliceParts& value) = 0; - - virtual void MergeUntracked(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) = 0; - virtual void MergeUntracked(const Slice& key, const Slice& value) = 0; - - virtual void DeleteUntracked(ColumnFamilyHandle* column_family, - const Slice& key) = 0; - - virtual void DeleteUntracked(const Slice& key) = 0; - virtual void DeleteUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; - virtual void DeleteUntracked(const SliceParts& key) = 0; - - // Similar to WriteBatch::PutLogData - virtual void PutLogData(const Slice& blob) = 0; - - // Fetch the underlying write batch that contains all pending changes to be - // committed. - // - // Note: You should not write or delete anything from the batch directly and - // should only use the the functions in the OptimisticTransaction class to - // write to this transaction. 
- virtual WriteBatchWithIndex* GetWriteBatch() = 0; - - protected: - // To begin a new transaction, see OptimisticTransactionDB::BeginTransaction() - explicit OptimisticTransaction(const OptimisticTransactionDB* db) {} - OptimisticTransaction() {} - - private: - // No copying allowed - OptimisticTransaction(const OptimisticTransaction&); - void operator=(const OptimisticTransaction&); -}; - -} // namespace rocksdb - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 677f391222..772e645490 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -11,16 +11,25 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" -#include "rocksdb/utilities/optimistic_transaction.h" namespace rocksdb { -class OptimisticTransaction; +class Transaction; // Database with Transaction support. // // See optimistic_transaction.h and examples/transaction_example.cc +// Options to use when starting an Optimistic Transaction +struct OptimisticTransactionOptions { + // Setting set_snapshot=true is the same as calling SetSnapshot(). + bool set_snapshot = false; + + // Should be set if the DB has a non-default comparator. + // See comment in WriteBatchWithIndex constructor. + const Comparator* cmp = BytewiseComparator(); +}; + class OptimisticTransactionDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). @@ -34,13 +43,12 @@ class OptimisticTransactionDB { virtual ~OptimisticTransactionDB() {} - // Starts a new OptimisticTransaction. Passing set_snapshot=true has the same - // effect + // Starts a new Transaction. Passing set_snapshot=true has the same effect // as calling SetSnapshot(). // // Caller should delete the returned transaction after calling // Commit() or Rollback(). 
- virtual OptimisticTransaction* BeginTransaction( + virtual Transaction* BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options = OptimisticTransactionOptions()) = 0; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h new file mode 100644 index 0000000000..86345efeda --- /dev/null +++ b/include/rocksdb/utilities/transaction.h @@ -0,0 +1,260 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class Iterator; +class TransactionDB; +class WriteBatchWithIndex; + +// Provides BEGIN/COMMIT/ROLLBACK transactions. +// +// To use transactions, you must first create either an OptimisticTransactionDB +// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for +// more information. +// +// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction(). +// +// It is up to the caller to synchronize access to this object. +// +// See examples/transaction_example.cc for some simple examples. +// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written(or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. 
+ // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot() is called or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Write all batched keys to the db atomically. + // + // Returns OK on success. + // + // May return any error status that could be returned by DB::Write(). + // + // If this transaction was created by an OptimisticTransactionDB(), + // Status::Busy() may be returned if the transaction could not guarantee + // that there are no write conflicts. + // + // If this transaction was created by a TransactionDB(), Status::TimedOut() + // may be returned if this transaction has lived for longer than + // TransactionOptions.expiration. + virtual Status Commit() = 0; + + // Discard all batched writes in this transaction. + virtual void Rollback() = 0; + + // Records the state of the transaction for future calls to + // RollbackToSavePoint(). May be called multiple times to set multiple save + // points.
+ virtual void SetSavePoint() = 0; + + // Undo all operations in this transaction (Put, Merge, Delete, PutLogData) + // since the + // most recent call to SetSavePoint() and removes the most recent + // SetSavePoint(). + // If there is no previous call to SetSavePoint(), behaves the same as + // Rollback() + virtual void RollbackToSavePoint() = 0; + + // This function is similar to DB::Get() except it will also read pending + // changes in this transaction. + // + // If read_options.snapshot is not set, the current version of the key will + // be read. Calling SetSnapshot() does not affect the version of the data + // returned. + // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction). The transaction behavior is the + // same regardless of whether the key exists or not. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to by outside of this + // transaction. 
+ // + // If this transaction was created by a TransactionDB, Status::Busy() may be + // returned. + // If this transaction was created by an OptimisticTransactionDB, GetForUpdate() + // could cause Commit() to later return Status::Busy(). + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) = 0; + + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) = 0; + + // Returns an iterator that will iterate on all keys in the default + // column family including both keys in the DB and uncommitted keys in this + // transaction. + // + // Setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + // + // Caller is responsible for deleting the returned Iterator. + // + // The returned iterator is only valid until Commit(), Rollback(), or + // RollbackToSavePoint() is called. + // NOTE: Transaction::Put/Merge/Delete will currently invalidate this iterator + // until + // the following issue is fixed: + // https://github.com/facebook/rocksdb/issues/616 + virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; + + virtual Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) = 0; + + // Put, Merge, and Delete behave similarly to their corresponding + // functions in WriteBatch, but will also do conflict checking on the + // keys being written.
+ // + // If this Transaction was created on an OptimisticTransactionDB, these + // functions should always return Status::OK(). + // If this Transaction was created on a TransactionDB, the functions can + // return Status::Busy() if they could not acquire a lock. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a TransactionDB, this function will + // still acquire locks necessary to make sure this write doesn't cause + // conflicts in + // other transactions and may return Status::Busy(). 
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + protected: + explicit Transaction(const TransactionDB* db) {} + Transaction() {} + + private: + // No copying allowed + Transaction(const Transaction&); + void operator=(const Transaction&); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 0000000000..0f9a1773ec --- /dev/null +++ b/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,130 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support. +// +// See transaction.h and examples/transaction_example.cc + +namespace rocksdb { + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with their own + // separate + // mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover + // from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and will block indefinitely when acquiring + // a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock.
While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second +}; + +struct TransactionOptions { + // Setting set_snapshot=true is the same as calling + // Transaction::SetSnapshot(). + bool set_snapshot = false; + + + // TODO(agiardullo): TransactionDB does not yet support comparators that allow + // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being + // written by other writers. + // + // TODO(agiardullo): Improve performance of checking expiration time. + int64_t expiration = -1; +}; + +class TransactionDB : public StackableDB { + public: + // Open a TransactionDB similar to DB::Open(). + static Status Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + TransactionDB** dbptr); + + virtual ~TransactionDB() {} + + // Starts a new Transaction. Passing set_snapshot=true has the same effect + // as calling Transaction::SetSnapshot().
+ // + // Caller should delete the returned transaction after calling + // Transaction::Commit() or Transaction::Rollback(). + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()) = 0; + + protected: + // To create a TransactionDB, call Open() + explicit TransactionDB(DB* db) : StackableDB(db) {} + + private: + // No copying allowed + TransactionDB(const TransactionDB&); + void operator=(const TransactionDB&); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src.mk b/src.mk index 4e94c7d941..8a6c4dc7f1 100644 --- a/src.mk +++ b/src.mk @@ -118,6 +118,10 @@ LIB_SOURCES = \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/transactions/optimistic_transaction_impl.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ + utilities/transactions/transaction_db_impl.cc \ + utilities/transactions/transaction_lock_mgr.cc \ + utilities/transactions/transaction_impl.cc \ + utilities/transactions/transaction_util.cc \ utilities/ttl/db_ttl_impl.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ @@ -235,6 +239,7 @@ TEST_BENCH_SOURCES = \ utilities/spatialdb/spatial_db_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/transaction_test.cc \ utilities/ttl/ttl_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ util/log_write_bench.cc \ diff --git a/util/status.cc b/util/status.cc index d956eb476d..3fe292dd38 100644 --- a/util/status.cc +++ b/util/status.cc @@ -67,6 +67,9 @@ std::string Status::ToString() const { case kShutdownInProgress: type = "Shutdown in progress: "; break; + case kTimedOut: + type = "Operation timed out: "; + break; case kAborted: type = "Operation aborted: "; break; diff --git
a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index 56f6120216..ca98972111 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -18,10 +18,10 @@ namespace rocksdb { -OptimisticTransaction* OptimisticTransactionDBImpl::BeginTransaction( +Transaction* OptimisticTransactionDBImpl::BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) { - OptimisticTransaction* txn = + Transaction* txn = new OptimisticTransactionImpl(this, write_options, txn_options); return txn; diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index bfd4529903..ec5b428234 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -19,7 +19,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { ~OptimisticTransactionDBImpl() {} - OptimisticTransaction* BeginTransaction( + Transaction* BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) override; diff --git a/utilities/transactions/optimistic_transaction_impl.cc b/utilities/transactions/optimistic_transaction_impl.cc index d45117236c..1defd32a6d 100644 --- a/utilities/transactions/optimistic_transaction_impl.cc +++ b/utilities/transactions/optimistic_transaction_impl.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/optimistic_transaction_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include @@ -22,6 +18,7 @@ #include "rocksdb/status.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "util/string_util.h" +#include "utilities/transactions/transaction_util.h" namespace rocksdb { @@ -34,7 +31,8 @@ OptimisticTransactionImpl::OptimisticTransactionImpl( 
db_(txn_db->GetBaseDB()), write_options_(write_options), snapshot_(nullptr), - write_batch_(txn_options.cmp, 0, true) { + cmp_(txn_options.cmp), + write_batch_(new WriteBatchWithIndex(txn_options.cmp, 0, true)) { if (txn_options.set_snapshot) { SetSnapshot(); } else { @@ -72,11 +70,12 @@ Status OptimisticTransactionImpl::Commit() { } Status s = db_impl->WriteWithCallback( - write_options_, write_batch_.GetWriteBatch(), &callback); + write_options_, write_batch_->GetWriteBatch(), &callback); if (s.ok()) { tracked_keys_.clear(); - write_batch_.Clear(); + write_batch_->Clear(); + num_entries_ = 0; } return s; @@ -84,7 +83,57 @@ Status OptimisticTransactionImpl::Commit() { void OptimisticTransactionImpl::Rollback() { tracked_keys_.clear(); - write_batch_.Clear(); + write_batch_->Clear(); + num_entries_ = 0; +} + +void OptimisticTransactionImpl::SetSavePoint() { + if (num_entries_ > 0) { + // If transaction is empty, no need to record anything. + + if (save_points_ == nullptr) { + save_points_.reset(new std::stack()); + } + save_points_->push(num_entries_); + } +} + +void OptimisticTransactionImpl::RollbackToSavePoint() { + size_t savepoint_entries = 0; + + if (save_points_ != nullptr && save_points_->size() > 0) { + savepoint_entries = save_points_->top(); + save_points_->pop(); + } + + assert(savepoint_entries <= num_entries_); + + if (savepoint_entries == num_entries_) { + // No changes to rollback + } else if (savepoint_entries == 0) { + // Rollback everything + Rollback(); + } else { + DBImpl* db_impl = dynamic_cast(db_->GetRootDB()); + assert(db_impl); + + WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true); + Status s = TransactionUtil::CopyFirstN( + savepoint_entries, write_batch_.get(), new_batch, db_impl); + + if (!s.ok()) { + // TODO: Should we change this function to return a Status or should we + // somehow make it + // so RollbackToSavePoint() can never fail?? 
+ // Consider moving this functionality into WriteBatchWithIndex + fprintf(stderr, "STATUS: %s \n", s.ToString().c_str()); + delete new_batch; + } else { + write_batch_.reset(new_batch); + } + + num_entries_ = savepoint_entries; + } } // Record this key so that we can check it for conflicts at commit time. @@ -135,8 +184,8 @@ void OptimisticTransactionImpl::RecordOperation( Status OptimisticTransactionImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, - value); + return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key, + value); } Status OptimisticTransactionImpl::GetForUpdate( @@ -145,7 +194,11 @@ Status OptimisticTransactionImpl::GetForUpdate( // Regardless of whether the Get succeeded, track this key. RecordOperation(column_family, key); - return Get(read_options, column_family, key, value); + if (value == nullptr) { + return Status::OK(); + } else { + return Get(read_options, column_family, key, value); + } } std::vector OptimisticTransactionImpl::MultiGet( @@ -159,7 +212,7 @@ std::vector OptimisticTransactionImpl::MultiGet( // TODO(agiardullo): optimize multiget? std::vector stat_list(num_keys); for (size_t i = 0; i < num_keys; ++i) { - std::string* value = &(*values)[i]; + std::string* value = values ? &(*values)[i] : nullptr; stat_list[i] = Get(read_options, column_family[i], keys[i], value); } @@ -180,169 +233,141 @@ std::vector OptimisticTransactionImpl::MultiGetForUpdate( // Regardless of whether the Get succeeded, track this key. RecordOperation(column_family[i], keys[i]); - std::string* value = &(*values)[i]; + std::string* value = values ? 
&(*values)[i] : nullptr; stat_list[i] = Get(read_options, column_family[i], keys[i], value); } return stat_list; } -void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { - RecordOperation(column_family, key); +Iterator* OptimisticTransactionImpl::GetIterator( + const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); - write_batch_.Put(column_family, key, value); + return write_batch_->NewIteratorWithBase(db_iter); } -void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, - const SliceParts& key, - const SliceParts& value) { - RecordOperation(column_family, key); +Iterator* OptimisticTransactionImpl::GetIterator( + const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); - write_batch_.Put(column_family, key, value); + return write_batch_->NewIteratorWithBase(column_family, db_iter); } -void OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family, +Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { RecordOperation(column_family, key); - write_batch_.Merge(column_family, key, value); + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, - const Slice& key) { +Status OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { RecordOperation(column_family, key); - write_batch_.Delete(column_family, key); + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) { +Status OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& 
value) { RecordOperation(column_family, key); - write_batch_.Delete(column_family, key); + write_batch_->Merge(column_family, key, value); + + return Status::OK(); } -void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, - const Slice& key, - const Slice& value) { - write_batch_.Put(column_family, key, value); +Status OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { + RecordOperation(column_family, key); + + write_batch_->Delete(column_family, key); + + return Status::OK(); } -void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key, - const SliceParts& value) { - write_batch_.Put(column_family, key, value); +Status OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + RecordOperation(column_family, key); + + write_batch_->Delete(column_family, key); + + return Status::OK(); } -void OptimisticTransactionImpl::MergeUntracked( +Status OptimisticTransactionImpl::PutUntracked( ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - write_batch_.Merge(column_family, key, value); + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); } -void OptimisticTransactionImpl::DeleteUntracked( +Status OptimisticTransactionImpl::PutUntracked( + ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + write_batch_->Put(column_family, key, value); + num_entries_++; + + return Status::OK(); +} + +Status OptimisticTransactionImpl::MergeUntracked( + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { + write_batch_->Merge(column_family, key, value); + num_entries_++; + + return Status::OK(); +} + +Status OptimisticTransactionImpl::DeleteUntracked( ColumnFamilyHandle* column_family, const Slice& key) { - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; + + return 
Status::OK(); } -void OptimisticTransactionImpl::DeleteUntracked( +Status OptimisticTransactionImpl::DeleteUntracked( ColumnFamilyHandle* column_family, const SliceParts& key) { - write_batch_.Delete(column_family, key); + write_batch_->Delete(column_family, key); + num_entries_++; + + return Status::OK(); } void OptimisticTransactionImpl::PutLogData(const Slice& blob) { - write_batch_.PutLogData(blob); + write_batch_->PutLogData(blob); + num_entries_++; } WriteBatchWithIndex* OptimisticTransactionImpl::GetWriteBatch() { - return &write_batch_; + return write_batch_.get(); } // Returns OK if it is safe to commit this transaction. Returns Status::Busy // if there are read or write conflicts that would prevent us from committing OR // if we can not determine whether there would be any such conflicts. // -// Should only be called on writer thread. +// Should only be called on writer thread in order to avoid any race conditions +// in detecting +// write conflicts. Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) { Status result; assert(dynamic_cast(db) != nullptr); auto db_impl = reinterpret_cast(db); - for (auto& tracked_keys_iter : tracked_keys_) { - uint32_t cf_id = tracked_keys_iter.first; - const auto& keys = tracked_keys_iter.second; - - SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id); - if (sv == nullptr) { - result = - Status::Busy("Could not access column family " + ToString(cf_id)); - break; - } - - SequenceNumber earliest_seq = - db_impl->GetEarliestMemTableSequenceNumber(sv, true); - - // For each of the keys in this transaction, check to see if someone has - // written to this key since the start of the transaction. 
- for (const auto& key_iter : keys) { - const auto& key = key_iter.first; - const SequenceNumber key_seq = key_iter.second; - - // Since it would be too slow to check the SST files, we will only use - // the memtables to check whether there have been any recent writes - // to this key after it was accessed in this transaction. But if the - // Memtables do not contain a long enough history, we must fail the - // transaction. - if (earliest_seq == kMaxSequenceNumber) { - // The age of this memtable is unknown. Cannot rely on it to check - // for recent writes. This error shouldn't happen often in practice as - // the - // Memtable should have a valid earliest sequence number except in some - // corner cases (such as error cases during recovery). - result = Status::Busy( - "Could not commit transaction with as the MemTable does not " - "countain a long enough history to check write at SequenceNumber: ", - ToString(key_seq)); - - } else if (key_seq < earliest_seq) { - // The age of this memtable is too new to use to check for recent - // writes. - char msg[255]; - snprintf( - msg, sizeof(msg), - "Could not commit transaction with write at SequenceNumber %" PRIu64 - " as the MemTable only contains changes newer than SequenceNumber " - "%" PRIu64 - ". 
Increasing the value of the " - "max_write_buffer_number_to_maintain option could reduce the " - "frequency " - "of this error.", - key_seq, earliest_seq); - result = Status::Busy(msg); - } else { - SequenceNumber seq = kMaxSequenceNumber; - Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq); - if (!s.ok()) { - result = s; - } else if (seq != kMaxSequenceNumber && seq > key_seq) { - result = Status::Busy(); - } - } - - if (!result.ok()) { - break; - } - } - - db_impl->ReturnAndCleanupSuperVersion(cf_id, sv); - - if (!result.ok()) { - break; - } - } - - return result; + return TransactionUtil::CheckKeysForConflicts(db_impl, &tracked_keys_); } } // namespace rocksdb diff --git a/utilities/transactions/optimistic_transaction_impl.h b/utilities/transactions/optimistic_transaction_impl.h index 30272b97bb..faf6a57948 100644 --- a/utilities/transactions/optimistic_transaction_impl.h +++ b/utilities/transactions/optimistic_transaction_impl.h @@ -7,6 +7,7 @@ #ifndef ROCKSDB_LITE +#include #include #include #include @@ -16,17 +17,14 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_util.h" namespace rocksdb { -using TransactionKeyMap = - std::unordered_map>; - -class OptimisticTransactionImpl : public OptimisticTransaction { +class OptimisticTransactionImpl : public Transaction { public: OptimisticTransactionImpl(OptimisticTransactionDB* db, const WriteOptions& write_options, @@ -38,6 +36,10 @@ class OptimisticTransactionImpl : public OptimisticTransaction { void Rollback() override; + void SetSavePoint() override; + + void RollbackToSavePoint() override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) 
override; @@ -84,57 +86,61 @@ class OptimisticTransactionImpl : public OptimisticTransaction { keys, values); } - void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void Put(const Slice& key, const Slice& value) override { - Put(nullptr, key, value); - } + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; - void Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) override; - void Put(const SliceParts& key, const SliceParts& value) override { - Put(nullptr, key, value); - } - - void Merge(ColumnFamilyHandle* column_family, const Slice& key, + Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; - void Merge(const Slice& key, const Slice& value) override { - Merge(nullptr, key, value); + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); } - void Delete(ColumnFamilyHandle* column_family, const Slice& key) override; - void Delete(const Slice& key) override { Delete(nullptr, key); } - void Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) override; - void Delete(const SliceParts& key) override { Delete(nullptr, key); } - - void PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - void PutUntracked(const Slice& key, const Slice& value) override { - PutUntracked(nullptr, key, value); + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); } - void PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) override; - void PutUntracked(const SliceParts& key, const SliceParts& value) override { - PutUntracked(nullptr, key, value); + 
Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); } - void MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; - void MergeUntracked(const Slice& key, const Slice& value) override { - MergeUntracked(nullptr, key, value); + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); } - void DeleteUntracked(ColumnFamilyHandle* column_family, - const Slice& key) override; - void DeleteUntracked(const Slice& key) override { - DeleteUntracked(nullptr, key); + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); } - void DeleteUntracked(ColumnFamilyHandle* column_family, - const SliceParts& key) override; - void DeleteUntracked(const SliceParts& key) override { - DeleteUntracked(nullptr, key); + + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); + } + + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); + } + Status 
DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); } void PutLogData(const Slice& blob) override; @@ -153,12 +159,24 @@ class OptimisticTransactionImpl : public OptimisticTransaction { const WriteOptions write_options_; const Snapshot* snapshot_; SequenceNumber start_sequence_number_; - WriteBatchWithIndex write_batch_; + const Comparator* cmp_; + std::unique_ptr write_batch_; private: - // Map of Column Family IDs to keys and their sequence numbers + // Map of Column Family IDs to keys and corresponding sequence numbers. + // The sequence number stored for a key will be used during commit to make + // sure this key has + // not changed since this sequence number. TransactionKeyMap tracked_keys_; + // Records the number of entries currently in the WriteBatch including calls + // to + // Put, Merge, Delete, and PutLogData() + size_t num_entries_ = 0; + + // Stack of number of entries in write_batch at each save point + std::unique_ptr> save_points_; + friend class OptimisticTransactionCallback; // Returns OK if it is safe to commit this transaction. 
Returns Status::Busy diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index b610a9ba86..09b2ee1d6f 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -8,7 +8,7 @@ #include #include "rocksdb/db.h" -#include "rocksdb/utilities/optimistic_transaction.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "util/logging.h" #include "util/testharness.h" @@ -34,7 +34,6 @@ class OptimisticTransactionTest : public testing::Test { assert(s.ok()); db = txn_db->GetBaseDB(); } - ~OptimisticTransactionTest() { delete txn_db; DestroyDB(dbname, options); @@ -50,7 +49,7 @@ TEST_F(OptimisticTransactionTest, SuccessTest) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->GetForUpdate(read_options, "foo", &value); @@ -79,7 +78,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest) { db->Put(write_options, "foo", "bar"); db->Put(write_options, "foo2", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->Put("foo", "bar2"); @@ -114,8 +113,7 @@ TEST_F(OptimisticTransactionTest, WriteConflictTest2) { db->Put(write_options, "foo2", "bar"); txn_options.set_snapshot = true; - OptimisticTransaction* txn = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn); // This Put outside of a transaction will conflict with a later write @@ -150,8 +148,7 @@ TEST_F(OptimisticTransactionTest, ReadConflictTest) { db->Put(write_options, "foo2", "bar"); txn_options.set_snapshot = true; 
- OptimisticTransaction* txn = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn); txn->SetSnapshot(); @@ -188,7 +185,7 @@ TEST_F(OptimisticTransactionTest, TxnOnlyTest) { string value; Status s; - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->Put("x", "y"); @@ -208,7 +205,7 @@ TEST_F(OptimisticTransactionTest, FlushTest) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); snapshot_read_options.snapshot = txn->GetSnapshot(); @@ -248,7 +245,7 @@ TEST_F(OptimisticTransactionTest, FlushTest2) { db->Put(write_options, Slice("foo"), Slice("bar")); db->Put(write_options, Slice("foo2"), Slice("bar")); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); snapshot_read_options.snapshot = txn->GetSnapshot(); @@ -302,7 +299,7 @@ TEST_F(OptimisticTransactionTest, NoSnapshotTest) { db->Put(write_options, "AAA", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); // Modify key after transaction start @@ -333,7 +330,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) { db->Put(write_options, "BBB", "bar"); db->Put(write_options, "CCC", "bar"); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); db->Put(write_options, "AAA", "bar1"); @@ -410,8 +407,7 @@ TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) { OptimisticTransactionOptions txn_options; 
txn_options.set_snapshot = true; - OptimisticTransaction* txn2 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); txn2->SetSnapshot(); // This should not conflict in txn since the snapshot is later than the @@ -467,15 +463,14 @@ TEST_F(OptimisticTransactionTest, ColumnFamiliesTest) { ASSERT_OK(s); db = txn_db->GetBaseDB(); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_TRUE(txn); txn->SetSnapshot(); snapshot_read_options.snapshot = txn->GetSnapshot(); txn_options.set_snapshot = true; - OptimisticTransaction* txn2 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txn2); // Write some data to the db @@ -594,7 +589,7 @@ TEST_F(OptimisticTransactionTest, EmptyTest) { s = db->Put(write_options, "aaa", "aaa"); ASSERT_OK(s); - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); s = txn->Commit(); ASSERT_OK(s); delete txn; @@ -630,11 +625,10 @@ TEST_F(OptimisticTransactionTest, PredicateManyPreceders) { Status s; txn_options.set_snapshot = true; - OptimisticTransaction* txn1 = - txn_db->BeginTransaction(write_options, txn_options); + Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options); read_options1.snapshot = txn1->GetSnapshot(); - OptimisticTransaction* txn2 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); txn2->SetSnapshot(); read_options2.snapshot = txn2->GetSnapshot(); @@ -697,8 +691,8 @@ TEST_F(OptimisticTransactionTest, LostUpdate) { // Test 2 transactions writing to the same key in multiple orders and // with/without snapshots - OptimisticTransaction* txn1 = txn_db->BeginTransaction(write_options); - OptimisticTransaction* txn2 = 
txn_db->BeginTransaction(write_options); + Transaction* txn1 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); txn1->Put("1", "1"); txn2->Put("1", "2"); @@ -792,7 +786,7 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) { Status s; // Verify transaction rollback works for untracked keys. - OptimisticTransaction* txn = txn_db->BeginTransaction(write_options); + Transaction* txn = txn_db->BeginTransaction(write_options); txn->PutUntracked("untracked", "0"); txn->Rollback(); s = db->Get(read_options, "untracked", &value); @@ -836,6 +830,280 @@ TEST_F(OptimisticTransactionTest, UntrackedWrites) { delete txn; } +TEST_F(OptimisticTransactionTest, IteratorTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + string value; + Status s; + + // Write some keys to the db + s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = db->Put(write_options, "G", "g"); + ASSERT_OK(s); + + s = db->Put(write_options, "F", "f"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c"); + ASSERT_OK(s); + + s = db->Put(write_options, "D", "d"); + ASSERT_OK(s); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; 
i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + ASSERT_OK(s); + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + // key "C" was modified in the db after txn's snapshot. txn will not commit. 
+ s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete iter; + delete txn; +} + +TEST_F(OptimisticTransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + string value; + Status s; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->RollbackToSavePoint(); + + txn->SetSavePoint(); // 1 + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + txn->RollbackToSavePoint(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to 2 + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "E", &value); + 
ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Put("F", "f"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + s = txn->Put("G", "g"); + ASSERT_OK(s); + + s = txn->Delete("F"); + ASSERT_OK(s); + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn->RollbackToSavePoint(); // Rollback to 3 + + s = txn->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/transaction_db_impl.cc b/utilities/transactions/transaction_db_impl.cc new file mode 100644 index 0000000000..84baf4b40a --- /dev/null +++ b/utilities/transactions/transaction_db_impl.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef ROCKSDB_LITE + +#include +#include + +#include "utilities/transactions/transaction_db_impl.h" + +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/transaction_impl.h" + +namespace rocksdb { + +TransactionDBImpl::TransactionDBImpl(DB* db, + const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + txn_db_options_(txn_db_options), + lock_mgr_(txn_db_options_.num_stripes, txn_db_options.max_num_locks) {} + +Transaction* TransactionDBImpl::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options) { + Transaction* txn = new TransactionImpl(this, write_options, txn_options); + + return txn; +} + +TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options) { + TransactionDBOptions validated = txn_db_options; + + if (txn_db_options.num_stripes == 0) { + validated.num_stripes = 1; + } + + return validated; +} + +Status TransactionDB::Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = TransactionDB::Open(db_options, txn_db_options, dbname, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // Safe to delete the handle since DBImpl always holds a reference to + // the default column family + delete handles[0]; + } + + return s; +} + +Status TransactionDB::Open( + const DBOptions& db_options, const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, TransactionDB** dbptr) { + Status s; + DB* db; + + std::vector column_families_copy =
column_families; + + // Enable MemTable History (on the copy passed to DB::Open) if not enabled + for (auto& column_family : column_families_copy) { + ColumnFamilyOptions* options = &column_family.options; + + if (options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to max_write_buffer_number. + options->max_write_buffer_number_to_maintain = -1; + } + } + + s = DB::Open(db_options, dbname, column_families_copy, handles, &db); + + if (s.ok()) { + TransactionDBImpl* txn_db = new TransactionDBImpl( + db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options)); + + for (auto cf_ptr : *handles) { + txn_db->AddColumnFamily(cf_ptr); + } + + *dbptr = txn_db; + } + + return s; +} + +// Let TransactionLockMgr know that this column family exists so it can +// allocate a LockMap for it. +void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) { + lock_mgr_.AddColumnFamily(handle->GetID()); +} + +Status TransactionDBImpl::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_mgr_.AddColumnFamily((*handle)->GetID()); + } + + return s; +} + +// Let TransactionLockMgr know that it can deallocate the LockMap for this +// column family.
+Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_mgr_.RemoveColumnFamily(column_family->GetID()); + } + + return s; +} + +Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id, + const std::string& key) { + return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv()); +} + +void TransactionDBImpl::UnLock(TransactionImpl* txn, TransactionKeyMap* keys) { + lock_mgr_.UnLock(txn, keys, GetEnv()); +} + +void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id, + const std::string& key) { + lock_mgr_.UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* TransactionDBImpl::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* txn = BeginTransaction(options, txn_options); + + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + // Use default timeout for non-transactional writes + txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout); + + return txn; +} + +// All user Put, Merge, Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurrent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with each other (but still could deadlock with a +// Transaction).
+Status TransactionDBImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s; + + Transaction* txn = BeginInternalTransaction(wopts); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). + s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). + s = txn->MergeUntracked(column_family, key, value); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + // Since CommitBatch() sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock.
+ // In order to avoid a deadlock with a concurrent Transaction, Transactions + // should use a lock timeout. + Status s = txn_impl->CommitBatch(updates); + + delete txn; + + return s; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_impl.h b/utilities/transactions/transaction_db_impl.h new file mode 100644 index 0000000000..c4b69d29e7 --- /dev/null +++ b/utilities/transactions/transaction_db_impl.h @@ -0,0 +1,80 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/transaction_impl.h" +#include "utilities/transactions/transaction_lock_mgr.h" + +namespace rocksdb { + +class TransactionDBImpl : public TransactionDB { + public: + explicit TransactionDBImpl(DB* db, + const TransactionDBOptions& txn_db_options); + + ~TransactionDBImpl() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options) override; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + using StackableDB::CreateColumnFamily; + virtual Status
CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + using StackableDB::DropColumnFamily; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + + Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); + + void UnLock(TransactionImpl* txn, TransactionKeyMap* keys); + void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); + + void AddColumnFamily(const ColumnFamilyHandle* handle); + + static TransactionDBOptions ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options); + + const TransactionDBOptions& GetTxnDBOptions() const { + return txn_db_options_; + } + + private: + const TransactionDBOptions txn_db_options_; + TransactionLockMgr lock_mgr_; + + // Must be held when adding/dropping column families. + InstrumentedMutex column_family_mutex_; + Transaction* BeginInternalTransaction(const WriteOptions& options); + Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl); // NOTE(review): no definition in transaction_db_impl.cc in this patch - confirm +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_impl.cc b/utilities/transactions/transaction_impl.cc new file mode 100644 index 0000000000..1bbdfcac24 --- /dev/null +++ b/utilities/transactions/transaction_impl.cc @@ -0,0 +1,598 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory.
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_impl.h" + +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/string_util.h" +#include "utilities/transactions/transaction_db_impl.h" +#include "utilities/transactions/transaction_util.h" + +namespace rocksdb { + +struct WriteOptions; + +std::atomic TransactionImpl::txn_id_counter_(1); + +TransactionID TransactionImpl::GenTxnID() { + return txn_id_counter_.fetch_add(1); +} + +TransactionImpl::TransactionImpl(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : db_(txn_db), + txn_db_impl_(nullptr), + txn_id_(GenTxnID()), + write_options_(write_options), + snapshot_(nullptr), + cmp_(GetColumnFamilyUserComparator(txn_db->DefaultColumnFamily())), + write_batch_(new WriteBatchWithIndex(cmp_, 0, true)), + start_time_( + txn_options.expiration >= 0 ? db_->GetEnv()->NowMicros() / 1000 : 0), + expiration_time_(txn_options.expiration >= 0 + ?
start_time_ + txn_options.expiration + : 0), + lock_timeout_(txn_options.lock_timeout) { + txn_db_impl_ = dynamic_cast(txn_db); + assert(txn_db_impl_); + + if (lock_timeout_ < 0) { + // Lock timeout not set, use default + lock_timeout_ = txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout; + } + + if (txn_options.set_snapshot) { + SetSnapshot(); + } +} + +TransactionImpl::~TransactionImpl() { + Cleanup(); + + if (snapshot_ != nullptr) { + db_->ReleaseSnapshot(snapshot_); + } +} + +void TransactionImpl::SetSnapshot() { + if (snapshot_ != nullptr) { + db_->ReleaseSnapshot(snapshot_); + } + + snapshot_ = db_->GetSnapshot(); +} + +void TransactionImpl::Cleanup() { + write_batch_->Clear(); + num_entries_ = 0; + txn_db_impl_->UnLock(this, &tracked_keys_); + tracked_keys_.clear(); + save_points_.reset(nullptr); +} + +bool TransactionImpl::IsExpired() const { + if (expiration_time_ > 0) { + if (db_->GetEnv()->NowMicros() >= expiration_time_ * 1000) { + // Transaction is expired. + return true; + } + } + + return false; +} + +Status TransactionImpl::CommitBatch(WriteBatch* batch) { + TransactionKeyMap keys_to_unlock; + + Status s = LockBatch(batch, &keys_to_unlock); + + if (s.ok()) { + s = DoCommit(batch); + + txn_db_impl_->UnLock(this, &keys_to_unlock); + } + + return s; +} + +Status TransactionImpl::Commit() { + Status s = DoCommit(write_batch_->GetWriteBatch()); + + Cleanup(); + + return s; +} + +Status TransactionImpl::DoCommit(WriteBatch* batch) { + Status s; + + // Do write directly on base db as TransactionDB::Write() would attempt to + // do conflict checking that we've already done. + DB* db = db_->GetBaseDB(); + + if (expiration_time_ > 0) { + // We cannot commit a transaction that is expired as its locks might have + // been released. + // To avoid race conditions, we need to use a WriteCallback to check the + // expiration time once we're on the writer thread.
+ TransactionCallback callback(this); + + assert(dynamic_cast(db) != nullptr); + auto db_impl = reinterpret_cast(db); + s = db_impl->WriteWithCallback(write_options_, batch, &callback); + } else { + s = db->Write(write_options_, batch); + } + + return s; +} + +void TransactionImpl::Rollback() { Cleanup(); } + +void TransactionImpl::SetSavePoint() { + if (num_entries_ > 0) { + // Empty transactions record nothing: an empty stack means "roll back all". + + if (save_points_ == nullptr) { + save_points_.reset(new std::stack()); + } + save_points_->push(num_entries_); + } +} + +void TransactionImpl::RollbackToSavePoint() { + size_t savepoint_entries = 0; + + if (save_points_ != nullptr && save_points_->size() > 0) { + savepoint_entries = save_points_->top(); + save_points_->pop(); + } + + assert(savepoint_entries <= num_entries_); + + if (savepoint_entries == num_entries_) { + // No changes to rollback + } else if (savepoint_entries == 0) { + // Rollback everything + Rollback(); + } else { + assert(dynamic_cast(db_->GetBaseDB()) != nullptr); + auto db_impl = reinterpret_cast(db_->GetBaseDB()); + + WriteBatchWithIndex* new_batch = new WriteBatchWithIndex(cmp_, 0, true); + Status s = TransactionUtil::CopyFirstN( + savepoint_entries, write_batch_.get(), new_batch, db_impl); + if (!s.ok()) { + // TODO: Should we change this function to return a Status or should we + // somehow make it so RollbackToSavePoint() can never fail?? Not easy to + // handle the case where a client accesses a column family that's been + // dropped. + // After chatting with Siying, I'm going to send a diff that adds + // savepoint support in WriteBatchWithIndex and let reviewers decide which + // approach is cleaner. + fprintf(stderr, "STATUS: %s \n", s.ToString().c_str()); + delete new_batch; + } else { + write_batch_.reset(new_batch); + // Only shrink the count once write_batch_ was actually truncated. + num_entries_ = savepoint_entries; + } + } +} + +// Lock all keys in this batch.
+// On success, caller should unlock keys_to_unlock
+// On failure every lock taken by this call has already been released.
+Status TransactionImpl::LockBatch(WriteBatch* batch,
+                                  TransactionKeyMap* keys_to_unlock) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    // Sorted map of column_family_id to sorted set of keys.
+    // Since LockBatch() always locks keys in sorted order, it cannot deadlock
+    // with itself.  We're not using a comparator here since it doesn't matter
+    // what the sorting is as long as it's consistent.
+    std::map<uint32_t, std::set<std::string>> keys_;
+
+    Handler() {}
+
+    void RecordKey(uint32_t column_family_id, const Slice& key) {
+      // std::set::insert() is a no-op for a key that was already seen, so no
+      // separate lookup is needed.
+      keys_[column_family_id].insert(key.ToString());
+    }
+
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+  };
+
+  // Iterating on this handler will add all keys in this batch into keys
+  Handler handler;
+  batch->Iterate(&handler);
+
+  Status s;
+
+  // Attempt to lock all keys
+  for (const auto& cf_iter : handler.keys_) {
+    uint32_t cfh_id = cf_iter.first;
+    auto& cfh_keys = cf_iter.second;
+
+    for (const std::string& key : cfh_keys) {
+      s = txn_db_impl_->TryLock(this, cfh_id, key);
+      if (!s.ok()) {
+        break;
+      }
+      // The key is owned by the (const) handler set, so it is copied here.
+      (*keys_to_unlock)[cfh_id].insert({key, kMaxSequenceNumber});
+    }
+
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    txn_db_impl_->UnLock(this, keys_to_unlock);
+  }
+
+  return s;
+}
+
+// SliceParts overload: concatenates the parts into one key and forwards to
+// the Slice overload.
+Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
+                                const SliceParts& key, bool check_snapshot) {
+  size_t key_size = 0;
+  for (int i = 0; i < key.num_parts; ++i) {
+    key_size += key.parts[i].size();
+  }
+
+  std::string str;
+  str.reserve(key_size);
+
+  for (int i = 0; i < key.num_parts; ++i) {
+    str.append(key.parts[i].data(), key.parts[i].size());
+  }
+
+  return TryLock(column_family, str, check_snapshot);
+}
+
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked.  Non-ok, otherwise.
+// If check_snapshot is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
+                                const Slice& key, bool check_snapshot) {
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+  std::string key_str = key.ToString();
+  bool previously_locked;
+  Status s;
+
+  // Keys tracked for this column family (created on first use).
+  auto& cf_keys = tracked_keys_[cfh_id];
+
+  // lock this key if this transaction hasn't already locked it
+  auto iter = cf_keys.find(key_str);
+  if (iter == cf_keys.end()) {
+    previously_locked = false;
+
+    s = txn_db_impl_->TryLock(this, cfh_id, key_str);
+
+    if (s.ok()) {
+      // Record that we've locked this key
+      auto result = cf_keys.insert({key_str, kMaxSequenceNumber});
+      iter = result.first;
+    }
+  } else {
+    previously_locked = true;
+  }
+
+  if (s.ok()) {
+    // If a snapshot is set, we need to make sure the key hasn't been modified
+    // since the snapshot.  This must be done after we locked the key.
+    if (!check_snapshot || snapshot_ == nullptr) {
+      // Need to remember the earliest sequence number that we know that this
+      // key has not been modified after.  This is useful if this same
+      // transaction later tries to lock this key again.
+      if (iter->second == kMaxSequenceNumber) {
+        // Since we haven't checked a snapshot, we only know this key has not
+        // been modified since after we locked it.
+        iter->second = db_->GetLatestSequenceNumber();
+      }
+    } else {
+      // If the key has been previously validated at a sequence number earlier
+      // than the current snapshot's sequence number, we already know it has
+      // not been modified.
+      bool already_validated = iter->second <= snapshot_->GetSequenceNumber();
+
+      if (!already_validated) {
+        s = CheckKeySequence(column_family, key);
+
+        if (s.ok()) {
+          // Record that there have been no writes to this key after this
+          // sequence.
+          iter->second = snapshot_->GetSequenceNumber();
+        } else {
+          // Failed to validate key
+          if (!previously_locked) {
+            // Unlock key we just locked
+            txn_db_impl_->UnLock(this, cfh_id, key_str);
+            cf_keys.erase(iter);
+          }
+        }
+      }
+    }
+  }
+
+  return s;
+}
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+// Also returns OK() when no snapshot is set.  A null column_family means the
+// default column family.
+Status TransactionImpl::CheckKeySequence(ColumnFamilyHandle* column_family,
+                                         const Slice& key) {
+  Status result;
+  if (snapshot_ != nullptr) {
+    assert(dynamic_cast<DBImpl*>(db_->GetBaseDB()) != nullptr);
+    auto db_impl = static_cast<DBImpl*>(db_->GetBaseDB());
+
+    ColumnFamilyHandle* cfh = column_family ? column_family :
+      db_impl->DefaultColumnFamily();
+
+    result = TransactionUtil::CheckKeyForConflicts(
+        db_impl, cfh, key.ToString(),
+        snapshot_->GetSequenceNumber());
+  }
+
+  return result;
+}
+
+// Reads key through the transaction's write batch layered over the DB.  Takes
+// no lock.
+Status TransactionImpl::Get(const ReadOptions& read_options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value) {
+  return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key,
+                                         value);
+}
+
+// Locks key and then, if value is non-null, reads it.
+Status TransactionImpl::GetForUpdate(const ReadOptions& read_options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Slice& key, std::string* value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok() && value != nullptr) {
+    s = Get(read_options, column_family, key, value);
+  }
+  return s;
+}
+
+// Calls Get() for each key.  values must be non-null; it is resized to
+// keys.size() and entry i receives the value for keys[i].
+std::vector<Status> TransactionImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  assert(values != nullptr);
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+  }
+
+  return stat_list;
+}
+
+// Locks every key, then reads them.  If any key cannot be locked, every
+// returned status is that failure and nothing is read.  values must be
+// non-null.
+std::vector<Status> TransactionImpl::MultiGetForUpdate(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  // Regardless of whether the MultiGet succeeded, track these keys.
+  assert(values != nullptr);
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  // Lock all keys
+  for (size_t i = 0; i < num_keys; ++i) {
+    Status s = TryLock(column_family[i], keys[i]);
+    if (!s.ok()) {
+      // Fail entire multiget if we cannot lock all keys
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // TODO(agiardullo): optimize multiget?
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+  }
+
+  return stat_list;
+}
+
+// Returns an iterator over the default column family that merges the
+// transaction's pending writes with the DB contents.
+Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options) {
+  Iterator* db_iter = db_->NewIterator(read_options);
+  assert(db_iter);
+
+  return write_batch_->NewIteratorWithBase(db_iter);
+}
+
+// Same as above for an explicit column family.
+Iterator* TransactionImpl::GetIterator(const ReadOptions& read_options,
+                                       ColumnFamilyHandle* column_family) {
+  Iterator* db_iter = db_->NewIterator(read_options, column_family);
+  assert(db_iter);
+
+  return write_batch_->NewIteratorWithBase(column_family, db_iter);
+}
+
+// The tracked write operations below all follow the same pattern: lock the
+// key (validating it against the snapshot, if one is set) and, on success,
+// append the operation to the write batch and count the entry.
+
+Status TransactionImpl::Put(ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::Put(ColumnFamilyHandle* column_family,
+                            const SliceParts& key, const SliceParts& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::Merge(ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Merge(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
+                               const Slice& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::Delete(ColumnFamilyHandle* column_family,
+                               const SliceParts& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+// The *Untracked variants below lock the key but skip the snapshot check.
+
+Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                     const Slice& key, const Slice& value) {
+  // Even though we do not care about doing conflict checking for this write,
+  // we still need to take a lock to make sure we do not cause a conflict with
+  // some other write.  However, we do not need to check if there have been
+  // any writes since this transaction's snapshot.
+  bool check_snapshot = false;
+
+  // TODO(agiardullo): could optimize by supporting shared txn locks in the
+  // future
+  Status s = TryLock(column_family, key, check_snapshot);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                     const SliceParts& key,
+                                     const SliceParts& value) {
+  bool check_snapshot = false;
+  Status s = TryLock(column_family, key, check_snapshot);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::MergeUntracked(ColumnFamilyHandle* column_family,
+                                       const Slice& key, const Slice& value) {
+  bool check_snapshot = false;
+  Status s = TryLock(column_family, key, check_snapshot);
+
+  if (s.ok()) {
+    write_batch_->Merge(column_family, key, value);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                        const Slice& key) {
+  bool check_snapshot = false;
+  Status s = TryLock(column_family, key, check_snapshot);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+Status TransactionImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                        const SliceParts& key) {
+  bool check_snapshot = false;
+  Status s = TryLock(column_family, key, check_snapshot);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_entries_++;
+  }
+
+  return s;
+}
+
+// Appends a log-data blob to the batch.  It counts as an entry so that save
+// points stay aligned with the batch contents.
+void TransactionImpl::PutLogData(const Slice& blob) {
+  write_batch_->PutLogData(blob);
+  num_entries_++;
+}
+
+// Returns the transaction's write batch; ownership stays with the
+// transaction.
+WriteBatchWithIndex* TransactionImpl::GetWriteBatch() {
+  return write_batch_.get();
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/transactions/transaction_impl.h b/utilities/transactions/transaction_impl.h
new file mode 100644
index 0000000000..c30c9f1b79
--- /dev/null
+++ b/utilities/transactions/transaction_impl.h
@@ -0,0 +1,263 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+using TransactionID = uint64_t;
+
+class TransactionDBImpl;
+
+class TransactionImpl : public Transaction {
+ public:
+  TransactionImpl(TransactionDB* db, const WriteOptions& write_options,
+                  const TransactionOptions& txn_options);
+
+  virtual ~TransactionImpl();
+
+  Status Commit() override;
+
+  Status CommitBatch(WriteBatch* batch);
+
+  void Rollback() override;
+
+  void SetSavePoint() override;
+
+  void RollbackToSavePoint() override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, std::string* value) override;
+
+  Status Get(const ReadOptions& options, const Slice& key,
+             std::string* value) override {
+    return Get(options, db_->DefaultColumnFamily(), key, value);
+  }
+
+  Status GetForUpdate(const ReadOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      std::string* value) override;
+
+  Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                      std::string* value) override {
+    return GetForUpdate(options, db_->DefaultColumnFamily(), key, value);
+  }
+
+  std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values) override {
+    return MultiGet(options, std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), db_->DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return MultiGetForUpdate(options,
+                             std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), db_->DefaultColumnFamily()),
+                             keys, values);
+  }
+
+  Iterator* GetIterator(const ReadOptions& read_options) override;
+  Iterator* GetIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family) override;
+
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+  Status Put(const Slice& key, const Slice& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value) override;
+  Status Put(const SliceParts& key, const SliceParts& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value) override;
+  Status Merge(const Slice& key, const Slice& value) override {
+    return Merge(nullptr, key, value);
+  }
+
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+  Status Delete(ColumnFamilyHandle* column_family,
+                const SliceParts& key) override;
+  Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& value) override;
+  Status PutUntracked(const Slice& key, const Slice& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const SliceParts& value) override;
+  Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& value) override;
+  Status MergeUntracked(const Slice& key, const Slice& value) override {
+    return MergeUntracked(nullptr, key, value);
+  }
+
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const Slice& key) override;
+  Status DeleteUntracked(const Slice& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const SliceParts& key) override;
+  Status DeleteUntracked(const SliceParts& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+
+  void PutLogData(const Slice& blob) override;
+
+  const Snapshot* GetSnapshot() const override { return snapshot_; }
+
+  void SetSnapshot() override;
+
+  WriteBatchWithIndex* GetWriteBatch() override;
+
+  // Generate a new unique transaction identifier
+  static TransactionID GenTxnID();
+
+  TransactionID GetTxnID() const { return txn_id_; }
+
+  // Returns the time (in milliseconds, i.e. Env::NowMicros() / 1000) at which
+  // this transaction will be expired.  Returns 0 if this transaction does
+  // not expire.
+  uint64_t GetExpirationTime() const { return expiration_time_; }
+
+  // returns true if this transaction has an expiration_time and has expired.
+  bool IsExpired() const;
+
+  // Returns the number of milliseconds a transaction can wait on acquiring a
+  // lock or -1 if there is no timeout.
+  int64_t GetLockTimeout() const { return lock_timeout_; }
+  void SetLockTimeout(int64_t timeout) { lock_timeout_ = timeout; }
+
+ private:
+  TransactionDB* const db_;
+
+  TransactionDBImpl* txn_db_impl_;
+
+  // Used to create unique ids for transactions.
+  static std::atomic<TransactionID> txn_id_counter_;
+
+  // Unique ID for this transaction
+  const TransactionID txn_id_;
+
+  const WriteOptions write_options_;
+
+  // If snapshot_ is set, all keys that are locked must also not have been
+  // written since this snapshot
+  const Snapshot* snapshot_;
+
+  const Comparator* cmp_;
+
+  std::unique_ptr<WriteBatchWithIndex> write_batch_;
+
+  // If the transaction has an expiration, start_time_ stores the time the txn
+  // was constructed,
+  // in milliseconds.
+  const uint64_t start_time_;
+
+  // If non-zero, this transaction should not be committed after this time (in
+  // milliseconds)
+  const uint64_t expiration_time_;
+
+  // Timeout in milliseconds when locking a key or -1 if there is no timeout.
+  int64_t lock_timeout_;
+
+  // Map from column_family_id to map of keys to Sequence Numbers.  Stores keys
+  // that have been locked.
+  // The key is known to not have been modified after the Sequence Number
+  // stored.
+  TransactionKeyMap tracked_keys_;
+
+  // Records the number of entries currently in the WriteBatch, including
+  // calls to PutLogData()
+  size_t num_entries_ = 0;
+
+  // Stack of number of entries in write_batch at each save point
+  std::unique_ptr<std::stack<size_t>> save_points_;
+
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool check_snapshot = true);
+  Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key,
+                 bool check_snapshot = true);
+  void Cleanup();
+
+  Status CheckKeySequence(ColumnFamilyHandle* column_family, const Slice& key);
+
+  Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock);
+
+  Status DoCommit(WriteBatch* batch);
+
+  void RollbackLastN(size_t num);
+
+  // No copying allowed
+  TransactionImpl(const TransactionImpl&);
+  void operator=(const TransactionImpl&);
+};
+
+// Used at commit time to check whether transaction is committing before its
+// expiration time.
+class TransactionCallback : public WriteCallback {
+ public:
+  explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {}
+
+  Status Callback(DB* db) override {
+    if (txn_->IsExpired()) {
+      return Status::TimedOut();
+    } else {
+      return Status::OK();
+    }
+  }
+
+ private:
+  TransactionImpl* txn_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc
new file mode 100644
index 0000000000..b6cc9eb79a
--- /dev/null
+++ b/utilities/transactions/transaction_lock_mgr.cc
@@ -0,0 +1,443 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "utilities/transactions/transaction_lock_mgr.h"
+
+#include <inttypes.h>
+
+#include <algorithm>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "util/autovector.h"
+#include "util/murmurhash.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+// Owner of a locked key: the locking transaction and when its lock lapses.
+struct LockInfo {
+  TransactionID txn_id;
+  uint64_t
+      expiration_time;  // Transaction locks are not valid after this time in ms
+  LockInfo(TransactionID id, uint64_t time)
+      : txn_id(id), expiration_time(time) {}
+  LockInfo(const LockInfo& lock_info)
+      : txn_id(lock_info.txn_id), expiration_time(lock_info.expiration_time) {}
+};
+
+struct LockMapStripe {
+  // Mutex must be held before modifying keys map
+  std::timed_mutex stripe_mutex;
+
+  // Condition Variable per stripe for waiting on a lock
+  std::condition_variable_any stripe_cv;
+
+  // Locked keys mapped to the info about the transactions that locked them.
+  // TODO(agiardullo): Explore performance of other data structures.
+  std::unordered_map<std::string, LockInfo> keys;
+};
+
+// Map of #num_stripes LockMapStripes
+struct LockMap {
+  explicit LockMap(size_t num_stripes)
+      : num_stripes_(num_stripes), lock_map_stripes_(num_stripes) {}
+
+  // Copies only the stripe count; the new map starts with fresh, empty
+  // stripes and a zero lock count.
+  LockMap(const LockMap& lock_map)
+      : num_stripes_(lock_map.num_stripes_), lock_map_stripes_(num_stripes_) {}
+
+  // Number of separate LockMapStripes to create, each with their own Mutex
+  const size_t num_stripes_;
+
+  // Count of keys that are currently locked in this column family.
+  // (Only maintained if TransactionLockMgr::max_num_locks_ is positive.)
+  std::atomic<int64_t> lock_cnt{0};
+
+  std::vector<LockMapStripe> lock_map_stripes_;
+
+  size_t GetStripe(const std::string& key) const;
+};
+
+namespace {
+void UnrefLockMapsCache(void* ptr) {
+  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+  auto lock_maps_cache =
+      static_cast<std::unordered_map<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
+  delete lock_maps_cache;
+}
+}  // anonymous namespace
+
+// default_num_stripes: stripe count given to every LockMap created by
+// AddColumnFamily().  max_num_locks: per-column-family cap on locked keys;
+// only enforced when positive.
+TransactionLockMgr::TransactionLockMgr(size_t default_num_stripes,
+                                       int64_t max_num_locks)
+    : default_num_stripes_(default_num_stripes),
+      max_num_locks_(max_num_locks),
+      lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {}
+
+TransactionLockMgr::~TransactionLockMgr() {}
+
+// Maps key to the index of the stripe responsible for it.
+size_t LockMap::GetStripe(const std::string& key) const {
+  assert(num_stripes_ > 0);
+  static murmur_hash hash;
+  size_t stripe = hash(key) % num_stripes_;
+  return stripe;
+}
+
+// Registers an empty LockMap for column_family_id.  Asserts that the column
+// family has not been added before.
+void TransactionLockMgr::AddColumnFamily(uint32_t column_family_id) {
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  if (lock_maps_.find(column_family_id) == lock_maps_.end()) {
+    lock_maps_.emplace(column_family_id,
+                       std::make_shared<LockMap>(default_num_stripes_));
+  } else {
+    // column_family already exists in lock map
+    assert(false);
+  }
+}
+
+void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) {
+  // Remove lock_map for this column family.  Since the lock map is stored
+  // as a shared ptr, concurrent transactions can still keep using it
+  // until they release their reference to it.
+  {
+    InstrumentedMutexLock l(&lock_map_mutex_);
+
+    auto lock_maps_iter = lock_maps_.find(column_family_id);
+    assert(lock_maps_iter != lock_maps_.end());
+
+    lock_maps_.erase(lock_maps_iter);
+  }  // lock_map_mutex_
+
+  // Clear all thread-local caches
+  autovector<void*> local_caches;
+  lock_maps_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockMaps*>(cache);
+  }
+}
+
+// Look up the LockMap shared_ptr for a given column_family_id.
+// Note:  The LockMap is only valid as long as the caller is still holding on
+//   to the returned shared_ptr.
+// Returns a null shared_ptr if column_family_id has no lock map.
+std::shared_ptr<LockMap> TransactionLockMgr::GetLockMap(
+    uint32_t column_family_id) {
+  // First check thread-local cache, creating it on this thread's first call.
+  auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get());
+  if (lock_maps_cache == nullptr) {
+    lock_maps_cache = new LockMaps();
+    lock_maps_cache_->Reset(lock_maps_cache);
+  }
+
+  auto lock_map_iter = lock_maps_cache->find(column_family_id);
+  if (lock_map_iter != lock_maps_cache->end()) {
+    // Found lock map for this column family.
+    return lock_map_iter->second;
+  }
+
+  // Not found in local cache, grab mutex and check shared LockMaps
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  lock_map_iter = lock_maps_.find(column_family_id);
+  if (lock_map_iter == lock_maps_.end()) {
+    return std::shared_ptr<LockMap>(nullptr);
+  }
+
+  // Found lock map.  Store in thread-local cache and return.
+  std::shared_ptr<LockMap>& lock_map = lock_map_iter->second;
+  lock_maps_cache->insert({column_family_id, lock_map});
+
+  return lock_map;
+}
+
+// Returns true if this lock has expired and can be acquired by another
+// transaction.
+// If false, returns the number of microseconds until expiration in
+// *wait_time_us, or 0 if no expiration.
+// lock_info.expiration_time is in milliseconds; env->NowMicros() is in
+// microseconds.  wait_time_us may be null.
+bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env,
+                                       uint64_t* wait_time_us) {
+  auto now = env->NowMicros();
+
+  const bool has_expiration = lock_info.expiration_time > 0;
+  bool expired = has_expiration && lock_info.expiration_time * 1000 <= now;
+
+  if (wait_time_us != nullptr) {
+    // Always write the out-parameter so the caller never acts on a wait time
+    // left over from an earlier call.
+    if (!expired && has_expiration) {
+      // return how many microseconds until lock will be expired
+      *wait_time_us = (lock_info.expiration_time * 1000 - now);
+    } else {
+      *wait_time_us = 0;
+    }
+  }
+
+  return expired;
+}
+
+// Attempts to lock key in column_family_id on behalf of txn, waiting up to
+// the transaction's lock timeout.  Returns InvalidArgument if the column
+// family has no lock map; otherwise the result of AcquireWithTimeout().
+Status TransactionLockMgr::TryLock(const TransactionImpl* txn,
+                                   uint32_t column_family_id,
+                                   const std::string& key, Env* env) {
+  // Lookup lock map for this column family id
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    char msg[255];
+    snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
+             column_family_id);
+
+    return Status::InvalidArgument(msg);
+  }
+
+  // Need to lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num);
+
+  LockInfo lock_info(txn->GetTxnID(), txn->GetExpirationTime());
+  int64_t timeout = txn->GetLockTimeout();
+
+  return AcquireWithTimeout(lock_map, stripe, key, env, timeout, lock_info);
+}
+
+// Helper function for TryLock().
+// timeout is in milliseconds: 0 means do not wait at all, negative means
+// wait indefinitely.  Returns Busy if the stripe mutex or the key lock could
+// not be obtained in time.
+Status TransactionLockMgr::AcquireWithTimeout(LockMap* lock_map,
+                                              LockMapStripe* stripe,
+                                              const std::string& key, Env* env,
+                                              int64_t timeout,
+                                              const LockInfo& lock_info) {
+  std::chrono::system_clock::time_point end_time;
+
+  if (timeout > 0) {
+    end_time =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(timeout);
+  }
+
+  bool locked = true;
+  if (timeout == 0) {
+    // If timeout is 0, we do not wait to acquire the lock if it is not
+    // available
+    locked = stripe->stripe_mutex.try_lock();
+  } else if (timeout < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    stripe->stripe_mutex.lock();
+  } else {
+    // If timeout is positive, we attempt to acquire the lock unless we timeout
+    locked = stripe->stripe_mutex.try_lock_until(end_time);
+  }
+
+  if (!locked) {
+    // timeout acquiring mutex
+    return Status::Busy();
+  }
+
+  // Acquire lock if we are able to
+  uint64_t wait_time_us = 0;
+  Status result =
+      AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us);
+
+  if (result.IsBusy() && timeout != 0) {
+    // If we weren't able to acquire the lock, we will keep retrying as long
+    // as the timeout allows.
+    bool timed_out = false;
+    do {
+      // Check to see if the lock expires sooner than our timeout.
+      std::chrono::system_clock::time_point wait_time_end;
+      if (wait_time_us > 0 &&
+          (timeout < 0 ||
+           wait_time_us < static_cast<uint64_t>(timeout * 1000))) {
+        wait_time_end = std::chrono::system_clock::now() +
+                        std::chrono::microseconds(wait_time_us);
+        if (timeout > 0 && wait_time_end >= end_time) {
+          // lock expiration time is after our timeout.
+          wait_time_us = 0;
+        }
+      } else {
+        wait_time_us = 0;
+      }
+
+      if (wait_time_us > 0) {
+        // Wait up to the locks current expiration time
+        stripe->stripe_cv.wait_until(stripe->stripe_mutex, wait_time_end);
+      } else if (timeout > 0) {
+        // Wait until we timeout
+        auto cv_status =
+            stripe->stripe_cv.wait_until(stripe->stripe_mutex, end_time);
+
+        if (cv_status == std::cv_status::timeout) {
+          timed_out = true;
+          // Even though we timed out, we will still make one more attempt to
+          // acquire lock below (it is possible the lock expired and we
+          // were never signaled).
+        }
+      } else {
+        // No wait timeout.
+        stripe->stripe_cv.wait(stripe->stripe_mutex);
+      }
+
+      result =
+          AcquireLocked(lock_map, stripe, key, env, lock_info, &wait_time_us);
+    } while (result.IsBusy() && !timed_out);
+  }
+
+  stripe->stripe_mutex.unlock();
+
+  return result;
+}
+
+// Try to lock this key after we have acquired the mutex.
+// Returns the number of microseconds until expiration in *wait_time_us,
+// or 0 if no expiration.
+// REQUIRED:  Stripe mutex must be held.
+Status TransactionLockMgr::AcquireLocked(LockMap* lock_map,
+                                         LockMapStripe* stripe,
+                                         const std::string& key, Env* env,
+                                         const LockInfo& txn_lock_info,
+                                         uint64_t* wait_time_us) {
+  Status result;
+  // Check if this key is already locked
+  auto held = stripe->keys.find(key);
+  if (held != stripe->keys.end()) {
+    // Lock already held
+
+    LockInfo& lock_info = held->second;
+    if (lock_info.txn_id != txn_lock_info.txn_id) {
+      // locked by another txn.  Check if it's expired
+      if (IsLockExpired(lock_info, env, wait_time_us)) {
+        // lock is expired, can steal it
+        lock_info.txn_id = txn_lock_info.txn_id;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        // lock_cnt does not change
+      } else {
+        result = Status::Busy();
+      }
+    }
+  } else {  // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
+      result =
+          Status::Busy("Failed to acquire lock due to max_num_locks limit");
+    } else {
+      // acquire lock
+      stripe->keys.insert({key, txn_lock_info});
+
+      // Maintain lock count if there is a limit on the number of locks.
+      // Uses the same "> 0" condition as the limit check above and the
+      // decrement in UnLock(), so the counter is only touched when a limit
+      // is actually in force.
+      if (max_num_locks_ > 0) {
+        lock_map->lock_cnt++;
+      }
+    }
+  }
+
+  return result;
+}
+
+// Releases txn's lock on a single key and wakes up waiters on that stripe.
+// Does nothing if the column family has no lock map.
+void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id,
+                                const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num);
+
+  TransactionID txn_id = txn->GetTxnID();
+  {
+    std::lock_guard<std::timed_mutex> lock(stripe->stripe_mutex);
+
+    auto iter = stripe->keys.find(key);
+    if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) {
+      // Found the key we locked.  unlock it.
+      stripe->keys.erase(iter);
+      if (max_num_locks_ > 0) {
+        // Maintain lock count if there is a limit on the number of locks.
+        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+        lock_map->lock_cnt--;
+      }
+    } else {
+      // This key is either not locked or locked by someone else.  This should
+      // only happen if the unlocking transaction has expired.
+      assert(txn->GetExpirationTime() > 0 &&
+             txn->GetExpirationTime() * 1000 < env->NowMicros());
+    }
+  }  // stripe_mutex unlocked
+
+  // Signal waiting threads to retry locking
+  stripe->stripe_cv.notify_all();
+}
+
+// Releases every lock in key_map that is still held by txn.  Keys are grouped
+// by stripe so each stripe mutex is taken only once per column family.
+void TransactionLockMgr::UnLock(const TransactionImpl* txn,
+                                const TransactionKeyMap* key_map, Env* env) {
+  TransactionID txn_id = txn->GetTxnID();
+
+  for (auto& key_map_iter : *key_map) {
+    uint32_t column_family_id = key_map_iter.first;
+    auto& keys = key_map_iter.second;
+
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+    LockMap* lock_map = lock_map_ptr.get();
+
+    if (lock_map == nullptr) {
+      // Column Family must have been dropped.  Skip only this column family;
+      // returning here would leave the keys of every remaining column family
+      // in key_map locked.
+      continue;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    std::unordered_map<size_t, std::vector<const std::string*>> keys_by_stripe(
+        std::max(keys.size(), lock_map->num_stripes_));
+
+    for (auto& key_iter : keys) {
+      const std::string& key = key_iter.first;
+
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = &lock_map->lock_map_stripes_.at(stripe_num);
+
+      {
+        std::lock_guard<std::timed_mutex> lock(stripe->stripe_mutex);
+
+        for (const std::string* key : stripe_keys) {
+          auto iter = stripe->keys.find(*key);
+          if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) {
+            // Found the key we locked.  unlock it.
+            stripe->keys.erase(iter);
+            if (max_num_locks_ > 0) {
+              // Maintain lock count if there is a limit on the number of locks.
+              assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+              lock_map->lock_cnt--;
+            }
+          } else {
+            // This key is either not locked or locked by someone else.  This
+            // should only happen if the unlocking transaction has expired.
+            assert(txn->GetExpirationTime() > 0 &&
+                   txn->GetExpirationTime() * 1000 < env->NowMicros());
+          }
+        }
+      }  // stripe_mutex unlocked
+
+      // Signal waiting threads to retry locking
+      stripe->stripe_cv.notify_all();
+    }
+  }
+}
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h
new file mode 100644
index 0000000000..7768496a24
--- /dev/null
+++ b/utilities/transactions/transaction_lock_mgr.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <chrono>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/utilities/transaction.h"
+#include "util/instrumented_mutex.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/transaction_impl.h"
+
+namespace rocksdb {
+
+class ColumnFamilyHandle;
+struct LockInfo;
+struct LockMap;
+struct LockMapStripe;
+
+class Slice;
+
+class TransactionLockMgr {
+ public:
+  TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks);
+
+  ~TransactionLockMgr();
+
+  // Creates a new LockMap for this column family.  Caller should guarantee
+  // that this column family does not already exist.
+  void AddColumnFamily(uint32_t column_family_id);
+
+  // Deletes the LockMap for this column family.  Caller should guarantee that
+  // this column family is no longer in use.
+  void RemoveColumnFamily(uint32_t column_family_id);
+
+  // Attempt to lock key.  If OK status is returned, the caller is responsible
+  // for calling UnLock() on this key.
+  Status TryLock(const TransactionImpl* txn, uint32_t column_family_id,
+                 const std::string& key, Env* env);
+
+  // Unlock a key locked by TryLock().
txn must be the same Transaction that + // locked this key. + void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys, + Env* env); + void UnLock(TransactionImpl* txn, uint32_t column_family_id, + const std::string& key, Env* env); + + private: + // Default number of lock map stripes per column family + const size_t default_num_stripes_; + + // Limit on number of keys locked per column family + const int64_t max_num_locks_; + + // Must be held when accessing/modifying lock_maps_ + InstrumentedMutex lock_map_mutex_; + + // Map of ColumnFamilyId to locked key info + using LockMaps = std::unordered_map>; + LockMaps lock_maps_; + + // Thread-local cache of entries in lock_maps_. This is an optimization + // to avoid acquiring a mutex in order to look up a LockMap + std::unique_ptr lock_maps_cache_; + + bool IsLockExpired(const LockInfo& lock_info, Env* env, uint64_t* wait_time); + + std::shared_ptr GetLockMap(uint32_t column_family_id); + + Status AcquireWithTimeout(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, int64_t timeout, + const LockInfo& lock_info); + + Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& lock_info, uint64_t* wait_time); + + // No copying allowed + TransactionLockMgr(const TransactionLockMgr&); + void operator=(const TransactionLockMgr&); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc new file mode 100644 index 0000000000..8aef74ffd4 --- /dev/null +++ b/utilities/transactions/transaction_test.cc @@ -0,0 +1,1587 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+using std::string;
+
+namespace rocksdb {
+
+// Fixture that opens a fresh TransactionDB (lock timeouts set to 0) for each
+// test and destroys it afterwards.
+class TransactionTest : public testing::Test {
+ public:
+  TransactionDB* db;
+  string dbname;
+  Options options;
+
+  TransactionDBOptions txn_db_options;
+
+  TransactionTest() {
+    options.create_if_missing = true;
+    options.max_write_buffer_number = 2;
+    dbname = test::TmpDir() + "/transaction_testdb";
+
+    DestroyDB(dbname, options);
+    txn_db_options.transaction_lock_timeout = 0;
+    txn_db_options.default_lock_timeout = 0;
+    Status s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    assert(s.ok());
+  }
+
+  ~TransactionTest() {
+    delete db;
+    DestroyDB(dbname, options);
+  }
+};
+
+// A transaction reads and overwrites a key; the new value is visible in the
+// db after commit.
+TEST_F(TransactionTest, SuccessTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  s = txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+// A write outside of a transaction fails while the transaction holds the key.
+TEST_F(TransactionTest, WriteConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "A");
+  db->Put(write_options, "foo2", "B");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("foo", "A2");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "B2");
+  ASSERT_OK(s);
+
+  // This Put outside of a transaction will conflict with the previous write
+  s = db->Put(write_options, "foo", "xxx");
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "A");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "A2");
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "B2");
+
+  delete txn;
+}
+
+// A transactional write fails when the key was modified after the
+// transaction's snapshot was taken; the other writes still commit.
+TEST_F(TransactionTest, WriteConflictTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  // This Put outside of a transaction will conflict with a later write
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "X");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo",
+               "bar2");  // Conflicts with write done after snapshot taken
+  ASSERT_NOK(s);
+
+  s = txn->Put("foo3", "Y");
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "barz");
+
+  s = txn->Commit();
+  ASSERT_OK(s);  // Txn should commit, but only write foo2 and foo3
+
+  // Verify that transaction wrote foo2 and foo3 but not foo
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "X");
+
+  db->Get(read_options, "foo3", &value);
+  ASSERT_EQ(value, "Y");
+
+  delete txn;
+}
+
+// A write outside of a transaction fails after the transaction read the key
+// with GetForUpdate.
+TEST_F(TransactionTest, ReadConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+  db->Put(write_options, "foo2", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  // This Put outside of a transaction will conflict with the previous read
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, TxnOnlyTest) {
+  // Test to make sure transactions work when there are no other writes in an
+  // empty db.
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("x", "y");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+// A transaction still commits after a memtable flush.
+TEST_F(TransactionTest, FlushTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a memtable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  s = txn->Commit();
+  // txn should commit since the flushed table is still in MemtableList History
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+// After enough flushes to purge the first memtable from the MemtableList
+// history, a new write in a snapshot transaction fails, but the keys written
+// earlier still commit.
+TEST_F(TransactionTest, FlushTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy2");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  db->Flush(flush_ops);
+
+  s = db->Put(write_options, "dummy", "dummy3");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  // Since our test db has max_write_buffer_number=2, this flush will cause
+  // the first memtable to get purged from the MemtableList history.
+  db->Flush(flush_ops);
+
+  s = txn->Put("X", "Y");
+  ASSERT_NOK(s);  // Put should fail since MemTableList History is not older
+                  // than snapshot.
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  // Transaction should only write the keys that succeeded.
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+// Without a snapshot, a key modified after the transaction began can still be
+// read and written by the transaction.
+TEST_F(TransactionTest, NoSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Modify key after transaction start
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Should commit since read/write was done after data changed
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+// Writes are validated against whichever snapshot was current when they were
+// made, across several SetSnapshot() calls and two transactions.
+TEST_F(TransactionTest, MultipleSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+  db->Put(write_options, "BBB", "bar");
+  db->Put(write_options, "CCC", "bar");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Modify BBB before snapshot is taken
+  db->Put(write_options, "BBB", "bar1");
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "BBB", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("BBB", "bar2");
+  ASSERT_OK(s);
+
+  db->Put(write_options, "CCC", "bar1");
+
+  // Set a new snapshot
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "CCC", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("CCC", "bar2");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  // verify that we track multiple writes to the same key at different snapshots
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  db->Put(write_options, "ZZZ", "zzz");
+  db->Put(write_options, "XXX", "xxx");
+
+  txn->SetSnapshot();
+
+  TransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert:  it will later conflict with txn2).
+  s = txn->Put("ZZZ", "zzzz");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  s = txn2->Put("ZZZ", "xxxxx");
+  ASSERT_NOK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "ZZZ", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  delete txn2;
+}
+
+// Conflict detection is tracked per column family: the same key in different
+// column families does not conflict.
+TEST_F(TransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  delete cfa;
+  delete cfb;
+  delete db;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, column_families,
+                          &handles, &db);
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  batch.Put("foo", "foo");
+  batch.Put(handles[1], "AAA", "bar");
+  batch.Put(handles[1], "AAAZZZ", "bar");
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+  db->Delete(write_options, handles[1], "AAAZZZ");
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families
+  s = txn->Delete("AAA");
+  ASSERT_OK(s);
+  s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  s = txn->Put(handles[2], SliceParts(&key_slice, 1),
+               SliceParts(value_slices, 2));
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice value_slice("barbarbar");
+
+  s = txn2->Delete(handles[2], "XXX");
+  ASSERT_OK(s);
+  s = txn2->Delete(handles[1], "XXX");
+  ASSERT_OK(s);
+
+  // This write will cause a conflict with the earlier batch write
+  s = txn2->Put(handles[1], SliceParts(key_slices, 3),
+                SliceParts(&value_slice, 1));
+  ASSERT_NOK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  // The last successful change to AAAZZZ in handles[1] is the Delete above
+  // (txn2's Put was rejected), so the key must not exist.  Check the status
+  // rather than `value`, which still holds the result of the previous Get.
+  s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  s = txn->Delete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYY");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYYY");
+  ASSERT_OK(s);
+  s = txn->Delete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
+  ASSERT_OK(s);
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous snapshot
+  db->Put(write_options, handles[2], "foo", "000");
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  // All results should fail since there was a conflict
+  ASSERT_NOK(results[0]);
+  ASSERT_NOK(results[1]);
+  ASSERT_NOK(results[2]);
+  ASSERT_NOK(results[3]);
+
+  s = db->Get(read_options, handles[2], "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "000");
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+// Locks are per column family, and writes to a dropped column family are
+// rejected with InvalidArgument.
+TEST_F(TransactionTest, ColumnFamiliesTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  ColumnFamilyHandle *one, *two;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "ONE", &one);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "TWO", &two);
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn1->Put(one, "X", "1");
+  ASSERT_OK(s);
+  s = txn1->Put(two, "X", "2");
+  ASSERT_OK(s);
+  s = txn1->Put("X", "0");
+  ASSERT_OK(s);
+
+  s = txn2->Put(one, "X", "11");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // Drop first column family
+  s = db->DropColumnFamily(one);
+  ASSERT_OK(s);
+
+  // txn2's only write was rejected above, so it has nothing to write and
+  // commits successfully even though the column family was dropped.
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  // Should fail since column family was dropped
+  s = txn1->Put(one, "X", "111");
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  s = txn1->Put(two, "X", "222");
+  ASSERT_OK(s);
+
+  s = txn1->Put("X", "000");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, two, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("222", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("000", value);
+
+  s = db->DropColumnFamily(two);
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  delete one;
+  delete two;
+}
+
+// Transactions that write nothing can be committed or rolled back.
+TEST_F(TransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->Rollback();
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "aaa");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "aaa");
+
+  // Conflicts with previous GetForUpdate
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_NOK(s);
+
+  // The rejected Put above does not affect txn, which still commits.
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+}
+
+// Keys read with (Multi)GetForUpdate or written by one transaction cannot be
+// written or locked by another one.
+TEST_F(TransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  s = txn2->Put("2", "x");  // Conflicts with txn1's MultiGetForUpdate
+  ASSERT_NOK(s);
+
+  txn2->Rollback();
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("4", "x");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("4");  // conflict
+  ASSERT_NOK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  txn2->Rollback();
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options, read_options1, read_options2;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Test 2 transactions writing to the same key in multiple orders and
+  // with/without snapshots
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "1");
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "2");  // conflict
+  ASSERT_NOK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("1", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn_options.set_snapshot = true;
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "3");
+  ASSERT_OK(s);
+  s = txn2->Put("1", "4");  // conflict
+  ASSERT_NOK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "5");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "6");
+  ASSERT_NOK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "7");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->SetSnapshot();
+  s = txn2->Put("1", "8");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("8", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options);
+  txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "9");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "10");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "10");
+}
+
+// Untracked writes skip conflict checking but are still rolled back and
+// committed with the transaction.
+TEST_F(TransactionTest, UntrackedWrites) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  // Verify transaction rollback works for untracked keys.
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->PutUntracked("untracked", "0");
+  ASSERT_OK(s);
+  txn->Rollback();
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = db->Put(write_options, "untracked", "x");
+  ASSERT_OK(s);
+
+  // Untracked writes should succeed even though key was written after snapshot
+  s = txn->PutUntracked("untracked", "1");
+  ASSERT_OK(s);
+  s = txn->MergeUntracked("untracked", "2");
+  ASSERT_OK(s);
+  s = txn->DeleteUntracked("untracked");
+  ASSERT_OK(s);
+
+  // Conflict
+  s = txn->Put("untracked", "3");
+  ASSERT_NOK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+// Locks of an expired transaction can be taken by another transaction, and
+// the expired transaction fails to commit.
+TEST_F(TransactionTest, ExpiredTransaction) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 microseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  s = txn1->Put("Y", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should be able to write to X since txn1 has expired
+  s = txn2->Put("X", "2");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("2", value);
+
+  s = txn1->Put("Z", "1");
+  ASSERT_OK(s);
+
+  // txn1 should fail to commit since it is expired
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn1;
+  delete txn2;
+}
+
+// Rolling back a transaction releases its locks.
+TEST_F(TransactionTest, Rollback) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should not be able to write to X since txn1 has it locked
+  s = txn2->Put("X", "2");
+  ASSERT_TRUE(s.IsBusy());
+
+  txn1->Rollback();
+  delete txn1;
+
+  // txn2 should now be able to write to X
+  s = txn2->Put("X", "3");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn2;
+}
+
+// With max_num_locks = 3, at most three distinct keys can be locked at once.
+TEST_F(TransactionTest, LockLimitTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  delete db;
+
+  // Open DB with a lock limit of 3
+  txn_db_options.max_num_locks = 3;
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
+  // Create a txn and verify we can only lock up to 3 keys
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("X", "x");
+  ASSERT_OK(s);
+
+  s = txn->Put("Y", "y");
+  ASSERT_OK(s);
+
+  s = txn->Put("Z", "z");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn->Put("W", "w");
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->Put("X", "xx");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "W", &value);
+  ASSERT_TRUE(s.IsBusy());
+  s = txn->GetForUpdate(read_options, "V", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->GetForUpdate(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = txn->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  // lock limit reached
+  s = txn2->Put("X", "x");
+  ASSERT_TRUE(s.IsBusy());
+
+  // lock limit reached
+  s = txn2->Put("M", "m");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xx", value);
+
+  s = db->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Committing txn should release its locks and allow txn2 to proceed
+  s = txn2->Put("X", "x2");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("X");
+  ASSERT_OK(s);
+
+  s = txn2->Put("M", "m");
+  ASSERT_OK(s);
+
+  s = txn2->Put("Z", "z2");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn2->Delete("Y");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("z2", value);
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, IteratorTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Write some keys to the db
+  s = db->Put(write_options, "A", "a");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "G", "g");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "F", "f");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "c");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "D", "d");
+  ASSERT_OK(s);
+
+  Transaction*
txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + if (i == 2) { + // "C" was modified after txn's snapshot + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + s = txn->Commit(); + ASSERT_OK(s); + + delete iter; + delete txn; +} + +TEST_F(TransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->RollbackToSavePoint(); + + txn->SetSavePoint(); // 1 + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + txn->RollbackToSavePoint(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to 2 + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->RollbackToSavePoint(); // Rollback to beginning of txn + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = 
txn->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Put("F", "f"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + s = txn->Put("G", "g"); + ASSERT_OK(s); + + s = txn->Delete("F"); + ASSERT_OK(s); + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn->RollbackToSavePoint(); // Rollback to 3 + + s = txn->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_F(TransactionTest, TimeoutTest) { + WriteOptions write_options; + ReadOptions read_options; + string value; + Status s; + + delete db; + + // transaction writes have an infinite timeout, + // but we will override this when we start a txn + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = -1; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options0; + txn_options0.expiration = 100; // 
100ms + txn_options0.lock_timeout = 50; // txn timeout no longer infinite + Transaction* txn1 = db->BeginTransaction(write_options, txn_options0); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_NOK(s); // expired! + + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + delete db; + + // transaction writes have a 50ms timeout, + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = 50; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options; + txn_options.expiration = 100; // 100ms + txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_NOK(s); // expired!
+ + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + txn_options.expiration = 6000000; // 100 minutes + txn1 = db->BeginTransaction(write_options, txn_options); + + TransactionOptions txn_options2; + txn_options2.expiration = 10; // 10ms + Transaction* txn2 = db->BeginTransaction(write_options, txn_options2); + ASSERT_OK(s); + + s = txn2->Put("a", "2"); + ASSERT_OK(s); + + // txn1 has a lock timeout longer than txn2's expiration, so it will win + s = txn1->Delete("a"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + // txn2 should be timed out since txn1 waiting until its timeout expired. + s = txn2->Commit(); + ASSERT_TRUE(s.IsTimedOut()); + + delete txn1; + delete txn2; + txn_options.expiration = 6000000; // 100 minutes + txn1 = db->BeginTransaction(write_options, txn_options); + txn_options2.expiration = 100000000; + txn2 = db->BeginTransaction(write_options, txn_options2); + + s = txn1->Delete("asdf"); + ASSERT_OK(s); + + // txn2 has a smaller lock timeout than txn1's expiration, so it will time out + s = txn2->Delete("asdf"); + ASSERT_TRUE(s.IsBusy()); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("asdf", "asdf"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "asdf", &value); + ASSERT_OK(s); + ASSERT_EQ("asdf", value); + + delete txn1; + delete txn2; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc new file mode 100644 index 0000000000..086d650ae6 --- /dev/null +++ b/utilities/transactions/transaction_util.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "utilities/transactions/transaction_util.h" + +#include +#include +#include + +#include "db/db_impl.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/string_util.h" + +namespace rocksdb { + +Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl, + ColumnFamilyHandle* column_family, + const std::string& key, + SequenceNumber key_seq) { + Status result; + + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd); + + if (sv == nullptr) { + result = Status::Busy("Could not access column family " + + cfh->GetName()); + } + + if (result.ok()) { + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + result = CheckKey(db_impl, sv, earliest_seq, key_seq, key); + + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); + } + + return result; +} + +Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, + SequenceNumber key_seq, + const std::string& key) { + Status result; + + // Since it would be too slow to check the SST files, we will only use + // the memtables to check whether there have been any recent writes + // to this key after it was accessed in this transaction. But if the + // Memtables do not contain a long enough history, we must fail the + // transaction. + if (earliest_seq == kMaxSequenceNumber) { + // The age of this memtable is unknown. Cannot rely on it to check + // for recent writes. This error shouldn't happen often in practice as + // the + // Memtable should have a valid earliest sequence number except in some + // corner cases (such as error cases during recovery). 
+ result = Status::Busy( + "Transaction could not check for conflicts as the MemTable does not " + "contain a long enough history to check write at SequenceNumber", + ToString(key_seq)); + + } else if (key_seq < earliest_seq) { + // The age of this memtable is too new to use to check for recent + // writes. + char msg[255]; + snprintf(msg, sizeof(msg), + "Transaction could not check for conflicts for operation at " + "SequenceNumber %" PRIu64 + " as the MemTable only contains changes newer than SequenceNumber " + "%" PRIu64 + ". Increasing the value of the " + "max_write_buffer_number_to_maintain option could reduce the " + "frequency " + "of this error.", + key_seq, earliest_seq); + result = Status::Busy(msg); + } else { + SequenceNumber seq = kMaxSequenceNumber; + Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq); + if (!s.ok()) { + result = s; + } else if (seq != kMaxSequenceNumber && seq > key_seq) { + result = Status::Busy(); + } + } + + return result; +} + +Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, + TransactionKeyMap* key_map) { + Status result; + + for (auto& key_map_iter : *key_map) { + uint32_t cf_id = key_map_iter.first; + const auto& keys = key_map_iter.second; + + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id); + if (sv == nullptr) { + result = + Status::Busy("Could not access column family " + ToString(cf_id)); + break; + } + + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + // For each of the keys in this transaction, check to see if someone has + // written to this key since the start of the transaction.
+ for (const auto& key_iter : keys) { + const auto& key = key_iter.first; + const SequenceNumber key_seq = key_iter.second; + + result = CheckKey(db_impl, sv, earliest_seq, key_seq, key); + + if (!result.ok()) { + break; + } + } + + db_impl->ReturnAndCleanupSuperVersion(cf_id, sv); + + if (!result.ok()) { + break; + } + } + + return result; +} + +Status TransactionUtil::CopyFirstN(size_t num, WriteBatchWithIndex* batch, + WriteBatchWithIndex* new_batch, + DBImpl* db_impl) { + // Handler for iterating through batch and copying entries to new_batch + class Handler : public WriteBatch::Handler { + public: + WriteBatchWithIndex* batch; + const size_t limit; + DBImpl* db_impl; + size_t seen = 0; + std::unordered_map super_versions; + std::unordered_map handles; + + Handler(WriteBatchWithIndex* dest, size_t new_limit, DBImpl* db) + : batch(dest), limit(new_limit), db_impl(db) {} + + ~Handler() { + for (auto& iter : super_versions) { + db_impl->ReturnAndCleanupSuperVersionUnlocked(iter.first, iter.second); + } + } + + Status GetColumnFamily(uint32_t column_family_id, + ColumnFamilyHandle** cfh) { + // Need to look up ColumnFamilyHandle for this column family id. Since + // doing this requires grabbing a mutex, lets only do it once per column + // family and cache it. + // In order to ensure that the ColumnFamilyHandle is still valid, we need + // to hold the superversion. + const auto& iter = handles.find(column_family_id); + if (iter == handles.end()) { + // Don't have ColumnFamilyHandle cached, look it up from the db. 
+ SuperVersion* sv = + db_impl->GetAndRefSuperVersionUnlocked(column_family_id); + if (sv == nullptr) { + return Status::InvalidArgument( + "Could not find column family for ID " + + ToString(column_family_id)); + } + super_versions.insert({column_family_id, sv}); + + *cfh = db_impl->GetColumnFamilyHandleUnlocked(column_family_id); + if (*cfh == nullptr) { + return Status::InvalidArgument( + "Could not find column family handle for ID " + + ToString(column_family_id)); + } + handles.insert({column_family_id, *cfh}); + } else { + *cfh = iter->second; + } + + return Status::OK(); + } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Put(cfh, key, value); + } + seen++; + return s; + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Merge(cfh, key, value); + } + seen++; + return s; + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + if (seen >= limit) { + // Found the first N entries, return Aborted to stop the Iteration. + return Status::Aborted(); + } + ColumnFamilyHandle* cfh = nullptr; + Status s = GetColumnFamily(column_family_id, &cfh); + if (s.ok()) { + batch->Delete(cfh, key); + } + seen++; + return s; + } + + virtual void LogData(const Slice& blob) override { + if (seen < limit) { + batch->PutLogData(blob); + } + seen++; + } + }; + + // Iterating on this handler will add all keys in this batch into a new batch + // up to + // the limit. 
+ Handler handler(new_batch, num, db_impl); + Status s = batch->GetWriteBatch()->Iterate(&handler); + + if (s.IsAborted()) { + // Handler returns Aborted when it is done copying to stop the iteration. + s = Status::OK(); + } + + return s; +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h new file mode 100644 index 0000000000..21f69a022c --- /dev/null +++ b/utilities/transactions/transaction_util.h @@ -0,0 +1,65 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace rocksdb { + +using TransactionKeyMap = + std::unordered_map>; + +class DBImpl; +struct SuperVersion; +class WriteBatchWithIndex; + +class TransactionUtil { + public: + // Verifies there have been no writes to this key in the db since this + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + static Status CheckKeyForConflicts(DBImpl* db_impl, + ColumnFamilyHandle* column_family, + const std::string& key, + SequenceNumber key_seq); + + // For each key,SequenceNumber pair in the TransactionKeyMap, this function + // will verify there have been no writes to the key in the db since that + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + // + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. 
+ static Status CheckKeysForConflicts(DBImpl* db_impl, TransactionKeyMap* keys); + + // Copies the first num entries from batch into new_batch (including Put, + // Merge, Delete, and PutLogData). + // Returns non-OK on error. + static Status CopyFirstN(size_t num, WriteBatchWithIndex* batch, + WriteBatchWithIndex* new_batch, DBImpl* db_impl); + + private: + static Status CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, SequenceNumber key_seq, + const std::string& key); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 507aff2488..9308ba39bb 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -626,12 +626,15 @@ Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, switch (result) { case WriteBatchWithIndexInternal::Result::kFound: case WriteBatchWithIndexInternal::Result::kError: - return s; + // use returned status + break; case WriteBatchWithIndexInternal::Result::kDeleted: case WriteBatchWithIndexInternal::Result::kNotFound: - return Status::NotFound(); + s = Status::NotFound(); + break; case WriteBatchWithIndexInternal::Result::kMergeInProgress: - return Status::MergeInProgress(""); + s = Status::MergeInProgress(""); + break; default: assert(false); } @@ -659,8 +662,8 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, std::string batch_value; WriteBatchWithIndexInternal::Result result = WriteBatchWithIndexInternal::GetFromBatch( - options, this, column_family, key, &merge_context, &rep->comparator, - &batch_value, &s); + options, this, column_family, key, &merge_context, + &rep->comparator, &batch_value, &s); if (result == WriteBatchWithIndexInternal::Result::kFound) { value->assign(batch_value.data(), batch_value.size());