mirror of https://github.com/facebook/rocksdb.git
Add a TransactionOptions to enable tracking timestamp size info inside WriteBatch (#12864)
Summary:
In normal use cases, meta info like a column family's timestamp size is tracked at the transaction layer, so it's not necessary — and even detrimental — to track such info inside the internal WriteBatch, because it may enable anti-patterns such as bypassing Transaction write APIs and writing directly to its internal WriteBatch, like this:
9d0a754dc9/storage/rocksdb/ha_rocksdb.cc (L4949-L4950)
Setting this option to true will keep the aforementioned use case working until it's refactored out. This option exists only for that purpose and will be gradually deprecated after the aforementioned MyRocks use case is refactored.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12864
Test Plan: Added unit tests
Reviewed By: cbi42
Differential Revision: D60194094
Pulled By: jowlyzhang
fbshipit-source-id: 64a98822167e99aa7e4fa2a60085d44a5deaa45c
This commit is contained in:
parent
36b061a6c7
commit
719c96125c
|
@ -929,15 +929,19 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
|
@ -962,7 +966,7 @@ Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
|
||||
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts, const Slice& value) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -970,8 +974,12 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
|
||||
|
@ -1039,7 +1047,11 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
|||
}
|
||||
|
||||
if (ts_sz == 0) {
|
||||
return WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1246,20 +1258,24 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Delete(this, cf_id, key);
|
||||
s = WriteBatchInternal::Delete(this, cf_id, key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1267,8 +1283,12 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
has_key_with_ts_ = true;
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1313,7 +1333,11 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Delete(this, cf_id, key);
|
||||
s = WriteBatchInternal::Delete(this, cf_id, key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1361,20 +1385,24 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
||||
const Slice& key, const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1382,8 +1410,12 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::SingleDelete(WriteBatch* b,
|
||||
|
@ -1430,7 +1462,11 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1480,23 +1516,27 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(begin_key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
|
||||
return WriteBatchInternal::DeleteRange(
|
||||
this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
||||
const Slice& begin_key, const Slice& end_key,
|
||||
const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1505,9 +1545,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{begin_key, ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1554,7 +1598,11 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1608,21 +1656,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
|
||||
s = WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
|
||||
return WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts, const Slice& value) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1630,8 +1682,12 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1680,7 +1736,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
|
|
@ -323,6 +323,22 @@ struct TransactionOptions {
|
|||
// description. If a negative value is specified, then the default value from
|
||||
// TransactionDBOptions is used.
|
||||
int64_t write_batch_flush_threshold = -1;
|
||||
|
||||
// DO NOT USE.
|
||||
// This is only a temporary option dedicated for MyRocks that will soon be
|
||||
// removed.
|
||||
// In normal use cases, meta info like column family's timestamp size is
|
||||
// tracked at the transaction layer, so it's not necessary and even
|
||||
// detrimental to track such info inside the internal WriteBatch because it
|
||||
// may let anti-patterns like bypassing Transaction write APIs and directly
|
||||
// write to its internal `WriteBatch` retrieved like this:
|
||||
// https://github.com/facebook/mysql-5.6/blob/fb-mysql-8.0.32/storage/rocksdb/ha_rocksdb.cc#L4949-L4950
|
||||
// Setting this option to true will keep aforementioned use case continue to
|
||||
// work before it's refactored out.
|
||||
// When this flag is enabled, we also intentionally only track the timestamp
|
||||
// size in APIs that MyRocks currently are using, including Put, Merge, Delete
|
||||
// DeleteRange, SingleDelete.
|
||||
bool write_batch_track_timestamp_size = false;
|
||||
};
|
||||
|
||||
// The per-write optimizations that do not involve transactions. TransactionDB
|
||||
|
|
|
@ -437,6 +437,30 @@ class WriteBatch : public WriteBatchBase {
|
|||
Status UpdateTimestamps(const Slice& ts,
|
||||
std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
|
||||
|
||||
// TODO: remove these internal APIs after MyRocks refactor to not directly
|
||||
// write to a `WriteBatch` retrieved from `Transaction` via
|
||||
// `Transaction::GetWriteBatch`.
|
||||
|
||||
void SetTrackTimestampSize(bool track_timestamp_size) {
|
||||
track_timestamp_size_ = track_timestamp_size;
|
||||
}
|
||||
|
||||
inline void MaybeTrackTimestampSize(uint32_t column_family_id, size_t ts_sz) {
|
||||
if (!track_timestamp_size_) {
|
||||
return;
|
||||
}
|
||||
auto iter = cf_id_to_ts_sz_.find(column_family_id);
|
||||
if (iter == cf_id_to_ts_sz_.end()) {
|
||||
cf_id_to_ts_sz_.emplace(column_family_id, ts_sz);
|
||||
}
|
||||
}
|
||||
|
||||
// Return a mapping from column family id to timestamp size of all the column
|
||||
// families involved in this WriteBatch.
|
||||
const std::unordered_map<uint32_t, size_t>& GetColumnFamilyToTimestampSize() {
|
||||
return cf_id_to_ts_sz_;
|
||||
}
|
||||
|
||||
// Verify the per-key-value checksums of this write batch.
|
||||
// Corruption status will be returned if the verification fails.
|
||||
// If this write batch does not have per-key-value checksum,
|
||||
|
@ -511,6 +535,10 @@ class WriteBatch : public WriteBatchBase {
|
|||
|
||||
size_t default_cf_ts_sz_ = 0;
|
||||
|
||||
bool track_timestamp_size_ = false;
|
||||
|
||||
std::unordered_map<uint32_t, size_t> cf_id_to_ts_sz_;
|
||||
|
||||
protected:
|
||||
std::string rep_; // See comment in write_batch.cc for the format of rep_
|
||||
};
|
||||
|
|
|
@ -73,6 +73,8 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
|
|||
deadlock_detect_ = txn_options.deadlock_detect;
|
||||
deadlock_detect_depth_ = txn_options.deadlock_detect_depth;
|
||||
write_batch_.SetMaxBytes(txn_options.max_write_batch_size);
|
||||
write_batch_.GetWriteBatch()->SetTrackTimestampSize(
|
||||
txn_options.write_batch_track_timestamp_size);
|
||||
skip_concurrency_control_ = txn_options.skip_concurrency_control;
|
||||
|
||||
lock_timeout_ = txn_options.lock_timeout * 1000;
|
||||
|
@ -763,8 +765,16 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
|
|||
EncodeFixed64(commit_ts_buf, commit_timestamp_);
|
||||
Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
|
||||
|
||||
Status s =
|
||||
wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
|
||||
Status s = wb->UpdateTimestamps(
|
||||
commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t {
|
||||
// First search through timestamp info kept inside the WriteBatch
|
||||
// in case some writes bypassed the Transaction's write APIs.
|
||||
auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize();
|
||||
auto iter = cf_id_to_ts_sz.find(cf);
|
||||
if (iter != cf_id_to_ts_sz.end()) {
|
||||
size_t ts_sz = iter->second;
|
||||
return ts_sz;
|
||||
}
|
||||
auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf);
|
||||
if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) {
|
||||
return sizeof(kMaxTxnTimestamp);
|
||||
|
@ -840,16 +850,24 @@ Status WriteCommittedTxn::CommitInternal() {
|
|||
s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
|
||||
commit_ts);
|
||||
if (s.ok()) {
|
||||
s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
|
||||
if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
|
||||
cfs_with_ts_tracked_when_indexing_disabled_.end()) {
|
||||
return sizeof(kMaxTxnTimestamp);
|
||||
}
|
||||
const Comparator* ucmp =
|
||||
WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
|
||||
return ucmp ? ucmp->timestamp_size()
|
||||
: std::numeric_limits<size_t>::max();
|
||||
});
|
||||
s = wb->UpdateTimestamps(
|
||||
commit_ts, [wb, wbwi, this](uint32_t cf) -> size_t {
|
||||
// first search through timestamp info kept inside the WriteBatch
|
||||
// in case some writes bypassed the Transaction's write APIs.
|
||||
auto cf_id_to_ts_sz = wb->GetColumnFamilyToTimestampSize();
|
||||
auto iter = cf_id_to_ts_sz.find(cf);
|
||||
if (iter != cf_id_to_ts_sz.end()) {
|
||||
return iter->second;
|
||||
}
|
||||
if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
|
||||
cfs_with_ts_tracked_when_indexing_disabled_.end()) {
|
||||
return sizeof(kMaxTxnTimestamp);
|
||||
}
|
||||
const Comparator* ucmp =
|
||||
WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
|
||||
return ucmp ? ucmp->timestamp_size()
|
||||
: std::numeric_limits<size_t>::max();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -130,6 +130,128 @@ void CheckKeyValueTsWithIterator(
|
|||
}
|
||||
}
|
||||
|
||||
// This is an incorrect usage of this API, supporting this should be removed
|
||||
// after MyRocks remove this pattern in a refactor.
|
||||
TEST_P(WriteCommittedTxnWithTsTest, WritesBypassTransactionAPIs) {
|
||||
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
||||
ASSERT_OK(ReOpen());
|
||||
|
||||
const std::string test_cf_name = "test_cf";
|
||||
ColumnFamilyOptions cf_options;
|
||||
ColumnFamilyHandle* cfh = nullptr;
|
||||
assert(db);
|
||||
ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
|
||||
delete cfh;
|
||||
cfh = nullptr;
|
||||
|
||||
std::vector<ColumnFamilyDescriptor> cf_descs;
|
||||
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
||||
cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
|
||||
options.avoid_flush_during_shutdown = true;
|
||||
ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
|
||||
|
||||
// Write in each transaction a mixture of column families that enable
|
||||
// timestamp and disable timestamps.
|
||||
|
||||
TransactionOptions txn_opts;
|
||||
txn_opts.write_batch_track_timestamp_size = true;
|
||||
std::unique_ptr<Transaction> txn0(NewTxn(WriteOptions(), txn_opts));
|
||||
assert(txn0);
|
||||
ASSERT_OK(txn0->Put(handles_[0], "key1", "key1_val"));
|
||||
// Timestamp size info for writes like this can only be correctly tracked if
|
||||
// TransactionOptions.write_batch_track_timestamp_size is true.
|
||||
ASSERT_OK(txn0->GetWriteBatch()->GetWriteBatch()->Put(handles_[1], "foo",
|
||||
"foo_val"));
|
||||
ASSERT_OK(txn0->SetName("txn0"));
|
||||
ASSERT_OK(txn0->SetCommitTimestamp(2));
|
||||
ASSERT_OK(txn0->Prepare());
|
||||
ASSERT_OK(txn0->Commit());
|
||||
txn0.reset();
|
||||
|
||||
// For keys written from transactions that disable
|
||||
// `write_batch_track_timestamp_size`
|
||||
// The keys has incorrect behavior like:
|
||||
// *Cannot be found after commit: because transaction's UpdateTimestamp do not
|
||||
// have correct timestamp size when this write bypass transaction write APIs.
|
||||
// *Can be found again after DB restart recovers the write from WAL log:
|
||||
// because recovered transaction's UpdateTimestamp get correct timestamp size
|
||||
// info directly from VersionSet.
|
||||
// If there is a flush that persisted this transaction into sst files after
|
||||
// it's committed, the key will be forever corrupted.
|
||||
std::unique_ptr<Transaction> txn1(
|
||||
NewTxn(WriteOptions(), TransactionOptions()));
|
||||
assert(txn1);
|
||||
ASSERT_OK(txn1->Put(handles_[0], "key2", "key2_val"));
|
||||
// Writing a key with more than 8 bytes so that we can manifest the error as
|
||||
// a NotFound error instead of an issue during `WriteBatch::UpdateTimestamp`.
|
||||
ASSERT_OK(txn1->GetWriteBatch()->GetWriteBatch()->Put(
|
||||
handles_[1], "foobarbaz", "baz_val"));
|
||||
ASSERT_OK(txn1->SetName("txn1"));
|
||||
ASSERT_OK(txn1->SetCommitTimestamp(2));
|
||||
ASSERT_OK(txn1->Prepare());
|
||||
ASSERT_OK(txn1->Commit());
|
||||
txn1.reset();
|
||||
|
||||
ASSERT_OK(db->Flush(FlushOptions(), handles_[1]));
|
||||
|
||||
std::unique_ptr<Transaction> txn2(
|
||||
NewTxn(WriteOptions(), TransactionOptions()));
|
||||
assert(txn2);
|
||||
ASSERT_OK(txn2->Put(handles_[0], "key3", "key3_val"));
|
||||
ASSERT_OK(txn2->GetWriteBatch()->GetWriteBatch()->Put(
|
||||
handles_[1], "bazbazbaz", "bazbazbaz_val"));
|
||||
ASSERT_OK(txn2->SetCommitTimestamp(2));
|
||||
ASSERT_OK(txn2->SetName("txn2"));
|
||||
ASSERT_OK(txn2->Prepare());
|
||||
ASSERT_OK(txn2->Commit());
|
||||
txn2.reset();
|
||||
|
||||
std::unique_ptr<Transaction> txn3(
|
||||
NewTxn(WriteOptions(), TransactionOptions()));
|
||||
assert(txn3);
|
||||
std::string value;
|
||||
ReadOptions ropts;
|
||||
std::string read_ts;
|
||||
Slice timestamp = EncodeU64Ts(2, &read_ts);
|
||||
ropts.timestamp = ×tamp;
|
||||
ASSERT_OK(txn3->Get(ropts, handles_[0], "key1", &value));
|
||||
ASSERT_EQ("key1_val", value);
|
||||
ASSERT_OK(txn3->Get(ropts, handles_[0], "key2", &value));
|
||||
ASSERT_EQ("key2_val", value);
|
||||
ASSERT_OK(txn3->Get(ropts, handles_[0], "key3", &value));
|
||||
ASSERT_EQ("key3_val", value);
|
||||
txn3.reset();
|
||||
|
||||
std::unique_ptr<Transaction> txn4(
|
||||
NewTxn(WriteOptions(), TransactionOptions()));
|
||||
assert(txn4);
|
||||
ASSERT_OK(txn4->Get(ReadOptions(), handles_[1], "foo", &value));
|
||||
ASSERT_EQ("foo_val", value);
|
||||
// Incorrect behavior: committed keys cannot be found
|
||||
ASSERT_TRUE(
|
||||
txn4->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound());
|
||||
ASSERT_TRUE(
|
||||
txn4->Get(ReadOptions(), handles_[1], "bazbazbaz", &value).IsNotFound());
|
||||
txn4.reset();
|
||||
|
||||
ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
|
||||
std::unique_ptr<Transaction> txn5(
|
||||
NewTxn(WriteOptions(), TransactionOptions()));
|
||||
assert(txn5);
|
||||
ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "foo", &value));
|
||||
ASSERT_EQ("foo_val", value);
|
||||
// Incorrect behavior:
|
||||
// *unflushed key can be found after reopen replays the entries from WAL
|
||||
// (this is not suggesting using flushing as a workaround but to show a
|
||||
// possible misleading behavior)
|
||||
// *flushed key is forever corrupted.
|
||||
ASSERT_TRUE(
|
||||
txn5->Get(ReadOptions(), handles_[1], "foobarbaz", &value).IsNotFound());
|
||||
ASSERT_OK(txn5->Get(ReadOptions(), handles_[1], "bazbazbaz", &value));
|
||||
ASSERT_EQ("bazbazbaz_val", value);
|
||||
txn5.reset();
|
||||
}
|
||||
|
||||
TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) {
|
||||
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
||||
ASSERT_OK(ReOpenNoDelete());
|
||||
|
|
Loading…
Reference in New Issue