// rocksdb/util/udt_util.cc
//
// 465 lines
// 16 KiB
// C++
// Raw Normal View History

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "util/udt_util.h"
#include "db/dbformat.h"
#include "rocksdb/types.h"
#include "util/coding.h"
#include "util/write_batch_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// The action needed to make a key written under a recorded timestamp size
// usable by a column family running with a (possibly different) timestamp
// size.
enum class RecoveryType {
  // The key can be used as is.
  kNoop,
  // The running timestamp size is non-zero and differs from the recorded one.
  kUnrecoverable,
  // The running column family has no timestamp but a size was recorded.
  kStripTimestamp,
  // The running column family has a timestamp but no size was recorded.
  kPadTimestamp,
};

// Classifies the discrepancy between the timestamp size a column family is
// running with (`running_ts_sz`) and the one recorded for it
// (`recorded_ts_sz`, empty when nothing was recorded).
RecoveryType GetRecoveryType(const size_t running_ts_sz,
                             const std::optional<size_t>& recorded_ts_sz) {
  const bool has_record = recorded_ts_sz.has_value();
  if (running_ts_sz == 0) {
    // A column family id not recorded is equivalent to that column family
    // having zero timestamp size, so only a recorded size calls for stripping.
    return has_record ? RecoveryType::kStripTimestamp : RecoveryType::kNoop;
  }
  // From here on the running column family has a non-zero timestamp size.
  if (!has_record) {
    return RecoveryType::kPadTimestamp;
  }
  return (*recorded_ts_sz == running_ts_sz) ? RecoveryType::kNoop
                                            : RecoveryType::kUnrecoverable;
}
bool AllRunningColumnFamiliesConsistent(
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz) {
for (const auto& [cf_id, ts_sz] : running_ts_sz) {
auto record_it = record_ts_sz.find(cf_id);
RecoveryType recovery_type =
GetRecoveryType(ts_sz, record_it != record_ts_sz.end()
? std::optional<size_t>(record_it->second)
: std::nullopt);
if (recovery_type != RecoveryType::kNoop) {
return false;
}
}
return true;
}
// Checks every column family referenced by `batch` for a timestamp size
// discrepancy between `running_ts_sz` and `record_ts_sz`.
//
// Returns a non-OK status when collecting the column family ids fails, when
// any discrepancy is found in kVerifyConsistency mode, or when a discrepancy
// is unrecoverable. Otherwise returns OK and sets `*ts_need_recovery` to true
// if at least one column family needs reconciliation; the flag is never reset
// to false here.
Status CheckWriteBatchTimestampSizeConsistency(
    const WriteBatch* batch,
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz,
    TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) {
  std::vector<uint32_t> cf_ids_in_batch;
  Status s = CollectColumnFamilyIdsFromWriteBatch(*batch, &cf_ids_in_batch);
  if (!s.ok()) {
    return s;
  }
  for (const uint32_t cf_id : cf_ids_in_batch) {
    const auto running_pos = running_ts_sz.find(cf_id);
    if (running_pos == running_ts_sz.end()) {
      // Ignore dropped column family referred to in a WriteBatch regardless
      // of its consistency.
      continue;
    }
    std::optional<size_t> recorded;
    const auto record_pos = record_ts_sz.find(cf_id);
    if (record_pos != record_ts_sz.end()) {
      recorded = record_pos->second;
    }
    const RecoveryType kind = GetRecoveryType(running_pos->second, recorded);
    if (kind == RecoveryType::kNoop) {
      continue;
    }
    if (check_mode == TimestampSizeConsistencyMode::kVerifyConsistency) {
      return Status::InvalidArgument(
          "WriteBatch contains timestamp size inconsistency.");
    }
    if (kind == RecoveryType::kUnrecoverable) {
      return Status::InvalidArgument(
          "WriteBatch contains unrecoverable timestamp size inconsistency.");
    }
    // A single column family needing reconciliation marks the whole
    // WriteBatch as needing recovery and a rebuild.
    *ts_need_recovery = true;
  }
  return Status::OK();
}
// [Blame annotation, 2023-07-27 03:16:32 +00:00]
// Support switching on / off UDT together with in-Memtable-only feature
// (#11623)
//
// Summary: Add support to allow enabling / disabling user-defined timestamps
// feature for an existing column family in combination with the
// in-Memtable only feature. To do this, this PR includes:
// 1) Log the `persist_user_defined_timestamps` option per column family in
//    Manifest to facilitate detecting an attempt to enable / disable UDT.
//    This entry is enforced to be logged in the same VersionEdit as the user
//    comparator name entry.
// 2) User-defined timestamps related options are validated when re-opening a
//    column family, including user comparator name and the
//    `persist_user_defined_timestamps` flag. These types of settings and
//    settings changes are considered valid:
//    a) no user comparator change and no effective
//       `persist_user_defined_timestamp` flag change.
//    b) switch user comparator to enable UDT provided the immediately
//       effective `persist_user_defined_timestamps` flag is false.
//    c) switch user comparator to disable UDT provided that the before-change
//       `persist_user_defined_timestamps` is already false.
// 3) when an attempt to enable UDT is detected, we mark all its existing SST
//    files as "having no UDT" by marking its
//    `FileMetaData.user_defined_timestamps_persisted` flag to false and
//    handle their file boundaries `FileMetaData.smallest`,
//    `FileMetaData.largest` by padding a min timestamp.
// 4) while enabling / disabling UDT feature, timestamp size inconsistency in
//    existing WAL logs is handled to make it compatible with the running
//    user comparator.
//
// Pull Request resolved: https://github.com/facebook/rocksdb/pull/11623
//
// Test Plan:
// ```
// make all check
// ./db_with_timestamp_basic_test --gtest-filter="*EnableDisableUDT*"
// ./db_wal_test --gtest_filter="*EnableDisableUDT*"
// ```
//
// Reviewed By: ltamasi
// Differential Revision: D47636862
// Pulled By: jowlyzhang
// fbshipit-source-id: dcd19f67292da3c3cc9584c09ad00331c9ab9322
// Outcome of comparing a column family's new comparator against the name of
// its previous comparator (see CompareComparator).
enum class ToggleUDT {
// The comparator name is identical to the old one.
kUnchanged,
// The new name equals the old name with the ".u64ts" suffix appended.
kEnableUDT,
// The old name equals the new name with the ".u64ts" suffix appended.
kDisableUDT,
// Any other difference between the two names.
kInvalidChange,
};
// Compares the name of `new_comparator` with `old_comparator_name` and
// reports whether the change leaves the comparator unchanged, toggles the
// ".u64ts" suffix on or off, or is any other (invalid) change.
ToggleUDT CompareComparator(const Comparator* new_comparator,
                            const std::string& old_comparator_name) {
  static const char* kUDTSuffix = ".u64ts";
  static const Slice kSuffixSlice = kUDTSuffix;
  static const size_t kSuffixSize = 6;
  // Only consulted by the assertions below.
  [[maybe_unused]] const size_t new_ts_sz = new_comparator->timestamp_size();
  const Slice new_name(new_comparator->Name());
  const Slice old_name(old_comparator_name);

  // True when `longer` is exactly `shorter` followed by the UDT suffix.
  const auto is_suffixed_form_of = [](const Slice& longer,
                                      const Slice& shorter) {
    return longer.size() == shorter.size() + kSuffixSize &&
           longer.starts_with(shorter) && longer.ends_with(kSuffixSlice);
  };

  if (new_name.compare(old_name) == 0) {
    return ToggleUDT::kUnchanged;
  }
  if (is_suffixed_form_of(new_name, old_name)) {
    assert(new_ts_sz == 8);
    return ToggleUDT::kEnableUDT;
  }
  if (is_suffixed_form_of(old_name, new_name)) {
    assert(new_ts_sz == 0);
    return ToggleUDT::kDisableUDT;
  }
  return ToggleUDT::kInvalidChange;
}
} // namespace
// Builds a handler that re-creates a WriteBatch entry by entry into a fresh
// batch (`new_batch_`), using `running_ts_sz` and `record_ts_sz` to decide
// how each key must be adjusted. `seq_per_batch` and `batch_per_txn` are
// translated into the write-after-commit / write-before-prepare flags below.
TimestampRecoveryHandler::TimestampRecoveryHandler(
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
const UnorderedMap<uint32_t, size_t>& record_ts_sz, bool seq_per_batch,
bool batch_per_txn)
: running_ts_sz_(running_ts_sz),
record_ts_sz_(record_ts_sz),
// Write after commit currently uses one seq per key (instead of per
// batch). So seq_per_batch being false indicates write_after_commit
// approach.
write_after_commit_(!seq_per_batch),
// WriteUnprepared can write multiple WriteBatches per transaction, so
// batch_per_txn being false indicates write_before_prepare.
write_before_prepare_(!batch_per_txn),
// Destination batch that the handler callbacks append to.
new_batch_(new WriteBatch()),
handler_valid_(true),
// Flipped to true once any key is stripped or padded.
new_batch_diff_from_orig_batch_(false) {}
// Re-inserts a Put into the rebuilt batch after reconciling the key's
// timestamp with the running column family. Propagates any reconciliation
// failure unchanged.
Status TimestampRecoveryHandler::PutCF(uint32_t cf, const Slice& key,
                                       const Slice& value) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::Put(new_batch_.get(), cf, adjusted_key, value);
}
// [Blame annotation, 2024-05-24 03:26:57 +00:00]
// Add timestamp support in dump_wal/dump/idump (#12690)
//
// Summary: As titled. For dumping wal files, since a mapping from column
// family id to the user comparator object is needed to print the timestamp
// in human readable format, option `[--db=<db_path>]` is added to `dump_wal`
// command to allow the user to optionally open the DB as a read only
// instance and dump the wal file with better timestamp formatting.
//
// Pull Request resolved: https://github.com/facebook/rocksdb/pull/12690
//
// Test Plan: Manually tested
//
// dump_wal:
// [dump a wal file specified with --walfile]
// ```
// >> ./ldb --walfile=$TEST_DB/000004.log dump_wal --print_value
// >>1,1,28,13,PUT(0) : 0x666F6F0100000000000000 : 0x7631
// (Column family id: [0] contained in WAL are not opened in DB. Applied
// default hex formatting for user key. Specify --db=<db_path> to open DB for
// better user key formatting if it contains timestamp.)
// ```
// [dump with --db specified for better timestamp formatting]
// ```
// >> ./ldb --walfile=$TEST_DB/000004.log dump_wal --db=$TEST_DB --print_value
// >> 1,1,28,13,PUT(0) : 0x666F6F|timestamp:1 : 0x7631
// ```
//
// dump:
// [dump a file specified with --path]
// ```
// >>./ldb --path=/tmp/rocksdbtest-501/column_family_test_75359_17910784957761284041/000004.log dump
// Sequence,Count,ByteSize,Physical Offset,Key(s) : value
// 1,1,28,13,PUT(0) : 0x666F6F0100000000000000 : 0x7631
// (Column family id: [0] contained in WAL are not opened in DB. Applied
// default hex formatting for user key. Specify --db=<db_path> to open DB for
// better user key formatting if it contains timestamp.)
// ```
// [dump db specified with --db]
// ```
// >> ./ldb --db=/tmp/rocksdbtest-501/column_family_test_75359_17910784957761284041 dump
// >> foo|timestamp:1 ==> v1
// Keys in range: 1
// ```
//
// idump
// ```
// ./ldb --db=$TEST_DB idump
// 'foo|timestamp:1' seq:1, type:1 => v1
// Internal keys in range: 1
// ```
//
// Reviewed By: ltamasi
// Differential Revision: D57755382
// Pulled By: jowlyzhang
// fbshipit-source-id: a0a2ef80c92801cbf7bfccc64769c1191824362e
// Re-inserts a wide-column entity into the rebuilt batch after reconciling
// the key's timestamp. The serialized entity is deserialized into columns
// first; a deserialization failure is reported as Corruption carrying the
// hex dump of the entity.
Status TimestampRecoveryHandler::PutEntityCF(uint32_t cf, const Slice& key,
                                             const Slice& entity) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
          cf, key, &adjusted_key_storage, &adjusted_key);
      !s.ok()) {
    return s;
  }
  // Deserialize from a copy so `entity` stays intact for the error message.
  Slice serialized = entity;
  WideColumns columns;
  const bool parsed =
      WideColumnSerialization::Deserialize(serialized, columns).ok();
  if (!parsed) {
    return Status::Corruption("Unable to deserialize entity",
                              entity.ToString(/* hex */ true));
  }
  return WriteBatchInternal::PutEntity(new_batch_.get(), cf, adjusted_key,
                                       columns);
}
// Re-inserts a TimedPut (a Put carrying `write_time`) into the rebuilt batch
// after reconciling the key's timestamp. Propagates any reconciliation
// failure unchanged.
Status TimestampRecoveryHandler::TimedPutCF(uint32_t cf, const Slice& key,
                                            const Slice& value,
                                            uint64_t write_time) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::TimedPut(new_batch_.get(), cf, adjusted_key,
                                      value, write_time);
}
// Re-inserts a Delete into the rebuilt batch after reconciling the key's
// timestamp. Propagates any reconciliation failure unchanged.
Status TimestampRecoveryHandler::DeleteCF(uint32_t cf, const Slice& key) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::Delete(new_batch_.get(), cf, adjusted_key);
}
// Re-inserts a SingleDelete into the rebuilt batch after reconciling the
// key's timestamp. Propagates any reconciliation failure unchanged.
Status TimestampRecoveryHandler::SingleDeleteCF(uint32_t cf,
                                                const Slice& key) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::SingleDelete(new_batch_.get(), cf, adjusted_key);
}
// Re-inserts a DeleteRange into the rebuilt batch. Both range boundaries are
// reconciled independently, begin key first; the first failure is returned.
Status TimestampRecoveryHandler::DeleteRangeCF(uint32_t cf,
                                               const Slice& begin_key,
                                               const Slice& end_key) {
  std::string adjusted_begin_storage;
  std::string adjusted_end_storage;
  Slice adjusted_begin;
  Slice adjusted_end;
  if (Status s = ReconcileTimestampDiscrepancy(
          cf, begin_key, &adjusted_begin_storage, &adjusted_begin);
      !s.ok()) {
    return s;
  }
  if (Status s = ReconcileTimestampDiscrepancy(
          cf, end_key, &adjusted_end_storage, &adjusted_end);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::DeleteRange(new_batch_.get(), cf, adjusted_begin,
                                         adjusted_end);
}
// Re-inserts a Merge into the rebuilt batch after reconciling the key's
// timestamp. Propagates any reconciliation failure unchanged.
Status TimestampRecoveryHandler::MergeCF(uint32_t cf, const Slice& key,
                                         const Slice& value) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::Merge(new_batch_.get(), cf, adjusted_key, value);
}
// Re-inserts a blob index entry into the rebuilt batch after reconciling the
// key's timestamp. Propagates any reconciliation failure unchanged.
Status TimestampRecoveryHandler::PutBlobIndexCF(uint32_t cf, const Slice& key,
                                                const Slice& value) {
  std::string adjusted_key_storage;
  Slice adjusted_key;
  if (Status s = ReconcileTimestampDiscrepancy(cf, key, &adjusted_key_storage,
                                               &adjusted_key);
      !s.ok()) {
    return s;
  }
  return WriteBatchInternal::PutBlobIndex(new_batch_.get(), cf, adjusted_key,
                                          value);
}
// Handler callback for a begin-prepare marker.
//
// Returns InvalidArgument when `unprepare` is true, since timestamp setting
// changes are not handled for the write unprepared policy. Otherwise inserts
// a begin-prepare marker into the rebuilt batch and returns that result.
Status TimestampRecoveryHandler::MarkBeginPrepare(bool unprepare) {
  // Transaction policy change requires empty WAL and User-defined timestamp is
  // only supported for write committed txns.
  // WriteBatch::Iterate handles this based on
  // handler->WriteAfterCommit() and handler->WriteBeforePrepare().
  if (unprepare) {
    // Adjacent literals are concatenated verbatim, so the separating space
    // must be part of the first literal.
    return Status::InvalidArgument(
        "Handle user defined timestamp setting change is not supported for "
        "write unprepared policy. The WAL must be emptied.");
  }
  return WriteBatchInternal::InsertBeginPrepare(new_batch_.get(),
                                                write_after_commit_,
                                                /* unprepared_batch */ false);
}
// Copies an end-prepare marker for transaction `name` into the rebuilt batch.
Status TimestampRecoveryHandler::MarkEndPrepare(const Slice& name) {
  auto* const rebuilt_batch = new_batch_.get();
  return WriteBatchInternal::InsertEndPrepare(rebuilt_batch, name);
}
// Copies a commit marker for transaction `name` into the rebuilt batch.
Status TimestampRecoveryHandler::MarkCommit(const Slice& name) {
  auto* const rebuilt_batch = new_batch_.get();
  return WriteBatchInternal::MarkCommit(rebuilt_batch, name);
}
// Copies a commit-with-timestamp marker for transaction `name`, carrying
// `commit_ts` unchanged, into the rebuilt batch.
Status TimestampRecoveryHandler::MarkCommitWithTimestamp(
    const Slice& name, const Slice& commit_ts) {
  auto* const rebuilt_batch = new_batch_.get();
  return WriteBatchInternal::MarkCommitWithTimestamp(rebuilt_batch, name,
                                                     commit_ts);
}
// Copies a rollback marker for transaction `name` into the rebuilt batch.
Status TimestampRecoveryHandler::MarkRollback(const Slice& name) {
  auto* const rebuilt_batch = new_batch_.get();
  return WriteBatchInternal::MarkRollback(rebuilt_batch, name);
}
// Copies a noop marker into the rebuilt batch; the `empty_batch` hint is
// ignored.
Status TimestampRecoveryHandler::MarkNoop(bool /*empty_batch*/) {
  auto* const rebuilt_batch = new_batch_.get();
  return WriteBatchInternal::InsertNoop(rebuilt_batch);
}
// Produces in `*new_key` the form of `key` that matches the timestamp size
// the column family `cf` is currently running with.
//
// - cf not running, or no discrepancy: `*new_key` aliases `key`.
// - record has a timestamp but the running column family does not: the
//   recorded timestamp is stripped; `*new_key` aliases a prefix of `key`.
// - running column family has a timestamp but none was recorded: a minimum
//   timestamp is appended; the padded key is stored in `*new_key_buf` and
//   `*new_key` aliases that buffer.
//
// `*new_key` is therefore only valid while both `key` and `*new_key_buf` are
// alive. Returns InvalidArgument for an unrecoverable size mismatch and
// Corruption when a key is too short to contain the recorded timestamp.
Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
    uint32_t cf, const Slice& key, std::string* new_key_buf, Slice* new_key) {
  assert(handler_valid_);
  auto running_iter = running_ts_sz_.find(cf);
  if (running_iter == running_ts_sz_.end()) {
    // The column family referred to by the WriteBatch is no longer running.
    // Copy over the entry as is to the new WriteBatch.
    *new_key = key;
    return Status::OK();
  }
  size_t running_ts_sz = running_iter->second;
  auto record_iter = record_ts_sz_.find(cf);
  std::optional<size_t> record_ts_sz =
      record_iter != record_ts_sz_.end()
          ? std::optional<size_t>(record_iter->second)
          : std::nullopt;
  RecoveryType recovery_type = GetRecoveryType(running_ts_sz, record_ts_sz);
  switch (recovery_type) {
    case RecoveryType::kNoop:
      *new_key = key;
      break;
    case RecoveryType::kStripTimestamp:
      assert(record_ts_sz.has_value());
      // A key shorter than the timestamp it is supposed to carry cannot be
      // stripped; report it instead of shrinking the slice past its start.
      if (key.size() < *record_ts_sz) {
        return Status::Corruption(
            "User key is shorter than the recorded timestamp size.");
      }
      *new_key = StripTimestampFromUserKey(key, *record_ts_sz);
      new_batch_diff_from_orig_batch_ = true;
      break;
    case RecoveryType::kPadTimestamp:
      AppendKeyWithMinTimestamp(new_key_buf, key, running_ts_sz);
      *new_key = *new_key_buf;
      new_batch_diff_from_orig_batch_ = true;
      break;
    case RecoveryType::kUnrecoverable:
      return Status::InvalidArgument(
          "Unrecoverable timestamp size inconsistency encountered by "
          "TimestampRecoveryHandler.");
    default:
      assert(false);
  }
  return Status::OK();
}
// Verifies `batch` against the running and recorded timestamp sizes and,
// when reconciliation is needed and permitted by `check_mode`, rebuilds the
// batch into `*new_batch` with the original sequence number preserved.
//
// `*new_batch` is left untouched when nothing needs to change or when an
// error is returned.
Status HandleWriteBatchTimestampSizeDifference(
    const WriteBatch* batch,
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz,
    TimestampSizeConsistencyMode check_mode, bool seq_per_batch,
    bool batch_per_txn, std::unique_ptr<WriteBatch>* new_batch) {
  // Quick path to bypass checking the WriteBatch.
  if (AllRunningColumnFamiliesConsistent(running_ts_sz, record_ts_sz)) {
    return Status::OK();
  }

  bool rebuild_required = false;
  Status s = CheckWriteBatchTimestampSizeConsistency(
      batch, running_ts_sz, record_ts_sz, check_mode, &rebuild_required);
  if (!s.ok()) {
    return s;
  }
  if (!rebuild_required) {
    return Status::OK();
  }

  assert(new_batch);
  // Remember the sequence number so the rebuilt batch can carry it too.
  const SequenceNumber original_seq = WriteBatchInternal::Sequence(batch);
  TimestampRecoveryHandler rebuilder(running_ts_sz, record_ts_sz,
                                     seq_per_batch, batch_per_txn);
  s = batch->Iterate(&rebuilder);
  if (!s.ok()) {
    return s;
  }
  *new_batch = rebuilder.TransferNewBatch();
  WriteBatchInternal::SetSequence(new_batch->get(), original_seq);
  return Status::OK();
}
// [Blame annotation, 2023-07-27 03:16:32 +00:00]
// Support switching on / off UDT together with in-Memtable-only feature
// (#11623)
//
// Summary: Add support to allow enabling / disabling user-defined timestamps
// feature for an existing column family in combination with the
// in-Memtable only feature. To do this, this PR includes:
// 1) Log the `persist_user_defined_timestamps` option per column family in
//    Manifest to facilitate detecting an attempt to enable / disable UDT.
//    This entry is enforced to be logged in the same VersionEdit as the user
//    comparator name entry.
// 2) User-defined timestamps related options are validated when re-opening a
//    column family, including user comparator name and the
//    `persist_user_defined_timestamps` flag. These types of settings and
//    settings changes are considered valid:
//    a) no user comparator change and no effective
//       `persist_user_defined_timestamp` flag change.
//    b) switch user comparator to enable UDT provided the immediately
//       effective `persist_user_defined_timestamps` flag is false.
//    c) switch user comparator to disable UDT provided that the before-change
//       `persist_user_defined_timestamps` is already false.
// 3) when an attempt to enable UDT is detected, we mark all its existing SST
//    files as "having no UDT" by marking its
//    `FileMetaData.user_defined_timestamps_persisted` flag to false and
//    handle their file boundaries `FileMetaData.smallest`,
//    `FileMetaData.largest` by padding a min timestamp.
// 4) while enabling / disabling UDT feature, timestamp size inconsistency in
//    existing WAL logs is handled to make it compatible with the running
//    user comparator.
//
// Pull Request resolved: https://github.com/facebook/rocksdb/pull/11623
//
// Test Plan:
// ```
// make all check
// ./db_with_timestamp_basic_test --gtest-filter="*EnableDisableUDT*"
// ./db_wal_test --gtest_filter="*EnableDisableUDT*"
// ```
//
// Reviewed By: ltamasi
// Differential Revision: D47636862
// Pulled By: jowlyzhang
// fbshipit-source-id: dcd19f67292da3c3cc9584c09ad00331c9ab9322
// Validates a change of user-defined-timestamp settings for a column family
// being re-opened with `new_comparator` / `new_persist_udt`, given its
// previous `old_comparator_name` / `old_persist_udt`.
//
// Returns OK for an accepted combination, InvalidArgument otherwise. When
// the change enables UDT and is accepted, `*mark_sst_files_has_no_udt` is set
// to true; the flag is not written on any other path.
Status ValidateUserDefinedTimestampsOptions(
    const Comparator* new_comparator, const std::string& old_comparator_name,
    bool new_persist_udt, bool old_persist_udt,
    bool* mark_sst_files_has_no_udt) {
  const size_t new_ts_sz = new_comparator->timestamp_size();
  const ToggleUDT change =
      CompareComparator(new_comparator, old_comparator_name);
  switch (change) {
    case ToggleUDT::kUnchanged: {
      // The persist flag may only change when the comparator carries no
      // timestamp.
      const bool persist_flag_kept = (old_persist_udt == new_persist_udt);
      if (persist_flag_kept || new_ts_sz == 0) {
        return Status::OK();
      }
      return Status::InvalidArgument(
          "Cannot toggle the persist_user_defined_timestamps flag for a column "
          "family with user-defined timestamps feature enabled.");
    }
    case ToggleUDT::kEnableUDT:
      if (new_persist_udt) {
        return Status::InvalidArgument(
            "Cannot open a column family and enable user-defined timestamps "
            "feature without setting persist_user_defined_timestamps flag to "
            "false.");
      }
      *mark_sst_files_has_no_udt = true;
      return Status::OK();
    case ToggleUDT::kDisableUDT:
      if (old_persist_udt) {
        return Status::InvalidArgument(
            "Cannot open a column family and disable user-defined timestamps "
            "feature if its existing persist_user_defined_timestamps flag is "
            "not "
            "false.");
      }
      return Status::OK();
    case ToggleUDT::kInvalidChange:
      return Status::InvalidArgument(
          new_comparator->Name(),
          "does not match existing comparator " + old_comparator_name);
    default:
      break;
  }
  return Status::InvalidArgument(
      "Unsupported user defined timestamps settings change.");
}
// Decodes a 64-bit cutoff timestamp from `*cutoff_ts` via GetFixed64 and
// appends `cutoff + 1`, encoded via PutFixed64, to `*full_history_ts_low`.
// A decode failure is only caught by the assertion.
//
// NOTE(review): a cutoff of UINT64_MAX wraps to 0 here; confirm whether
// callers can pass it.
void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
                                        std::string* full_history_ts_low) {
  uint64_t decoded_cutoff = 0;
  [[maybe_unused]] const bool decoded_ok =
      GetFixed64(cutoff_ts, &decoded_cutoff);
  assert(decoded_ok);
  const uint64_t ts_low = decoded_cutoff + 1;
  PutFixed64(full_history_ts_low, ts_low);
}
// Converts optional range boundaries into boundaries usable with a
// timestamp size of `ts_sz`.
//
// A null `start` / `end` yields an empty optional for that side. With
// `ts_sz == 0` the boundaries are returned as is. Otherwise the boundary is
// written, with a timestamp appended, into `*start_with_ts` / `*end_with_ts`
// and the returned Slice refers to that string, so the strings must outlive
// the returned slices.
std::tuple<std::optional<Slice>, std::optional<Slice>>
MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
                          std::string* start_with_ts, std::string* end_with_ts,
                          bool exclusive_end) {
  const bool has_timestamp = (ts_sz != 0);
  std::optional<Slice> range_start;
  std::optional<Slice> range_end;

  if (start != nullptr) {
    if (has_timestamp) {
      // Maximum timestamp means including all keys with any timestamp for
      // start.
      AppendKeyWithMaxTimestamp(start_with_ts, *start, ts_sz);
      range_start = Slice(*start_with_ts);
    } else {
      range_start = *start;
    }
  }

  if (end != nullptr) {
    if (!has_timestamp) {
      range_end = *end;
    } else {
      if (exclusive_end) {
        // Append a maximum timestamp as the range limit is exclusive:
        // [start, end)
        AppendKeyWithMaxTimestamp(end_with_ts, *end, ts_sz);
      } else {
        // Append a minimum timestamp to end so the range limit is inclusive:
        // [start, end]
        AppendKeyWithMinTimestamp(end_with_ts, *end, ts_sz);
      }
      range_end = Slice(*end_with_ts);
    }
  }

  return std::make_tuple(range_start, range_end);
}
} // namespace ROCKSDB_NAMESPACE