mirror of https://github.com/facebook/rocksdb.git
419 lines
14 KiB
C++
419 lines
14 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
//
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "util/udt_util.h"
|
|
|
|
#include "db/dbformat.h"
|
|
#include "rocksdb/types.h"
|
|
#include "util/coding.h"
|
|
#include "util/write_batch_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace {
|
|
enum class RecoveryType {
|
|
kNoop,
|
|
kUnrecoverable,
|
|
kStripTimestamp,
|
|
kPadTimestamp,
|
|
};
|
|
|
|
RecoveryType GetRecoveryType(const size_t running_ts_sz,
|
|
const std::optional<size_t>& recorded_ts_sz) {
|
|
if (running_ts_sz == 0) {
|
|
if (!recorded_ts_sz.has_value()) {
|
|
// A column family id not recorded is equivalent to that column family has
|
|
// zero timestamp size.
|
|
return RecoveryType::kNoop;
|
|
}
|
|
return RecoveryType::kStripTimestamp;
|
|
}
|
|
|
|
assert(running_ts_sz != 0);
|
|
|
|
if (!recorded_ts_sz.has_value()) {
|
|
return RecoveryType::kPadTimestamp;
|
|
}
|
|
|
|
if (running_ts_sz != *recorded_ts_sz) {
|
|
return RecoveryType::kUnrecoverable;
|
|
}
|
|
|
|
return RecoveryType::kNoop;
|
|
}
|
|
|
|
bool AllRunningColumnFamiliesConsistent(
|
|
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
const UnorderedMap<uint32_t, size_t>& record_ts_sz) {
|
|
for (const auto& [cf_id, ts_sz] : running_ts_sz) {
|
|
auto record_it = record_ts_sz.find(cf_id);
|
|
RecoveryType recovery_type =
|
|
GetRecoveryType(ts_sz, record_it != record_ts_sz.end()
|
|
? std::optional<size_t>(record_it->second)
|
|
: std::nullopt);
|
|
if (recovery_type != RecoveryType::kNoop) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Status CheckWriteBatchTimestampSizeConsistency(
|
|
const WriteBatch* batch,
|
|
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
const UnorderedMap<uint32_t, size_t>& record_ts_sz,
|
|
TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) {
|
|
std::vector<uint32_t> column_family_ids;
|
|
Status status =
|
|
CollectColumnFamilyIdsFromWriteBatch(*batch, &column_family_ids);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
for (const auto& cf_id : column_family_ids) {
|
|
auto running_iter = running_ts_sz.find(cf_id);
|
|
if (running_iter == running_ts_sz.end()) {
|
|
// Ignore dropped column family referred to in a WriteBatch regardless of
|
|
// its consistency.
|
|
continue;
|
|
}
|
|
auto record_iter = record_ts_sz.find(cf_id);
|
|
RecoveryType recovery_type = GetRecoveryType(
|
|
running_iter->second, record_iter != record_ts_sz.end()
|
|
? std::optional<size_t>(record_iter->second)
|
|
: std::nullopt);
|
|
if (recovery_type != RecoveryType::kNoop) {
|
|
if (check_mode == TimestampSizeConsistencyMode::kVerifyConsistency) {
|
|
return Status::InvalidArgument(
|
|
"WriteBatch contains timestamp size inconsistency.");
|
|
}
|
|
|
|
if (recovery_type == RecoveryType::kUnrecoverable) {
|
|
return Status::InvalidArgument(
|
|
"WriteBatch contains unrecoverable timestamp size inconsistency.");
|
|
}
|
|
|
|
// If any column family needs reconciliation, it will mark the whole
|
|
// WriteBatch to need recovery and rebuilt.
|
|
*ts_need_recovery = true;
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
enum class ToggleUDT {
|
|
kUnchanged,
|
|
kEnableUDT,
|
|
kDisableUDT,
|
|
kInvalidChange,
|
|
};
|
|
|
|
ToggleUDT CompareComparator(const Comparator* new_comparator,
|
|
const std::string& old_comparator_name) {
|
|
static const char* kUDTSuffix = ".u64ts";
|
|
static const Slice kSuffixSlice = kUDTSuffix;
|
|
static const size_t kSuffixSize = 6;
|
|
size_t ts_sz = new_comparator->timestamp_size();
|
|
(void)ts_sz;
|
|
Slice new_ucmp_name(new_comparator->Name());
|
|
Slice old_ucmp_name(old_comparator_name);
|
|
if (new_ucmp_name.compare(old_ucmp_name) == 0) {
|
|
return ToggleUDT::kUnchanged;
|
|
}
|
|
if (new_ucmp_name.size() == old_ucmp_name.size() + kSuffixSize &&
|
|
new_ucmp_name.starts_with(old_ucmp_name) &&
|
|
new_ucmp_name.ends_with(kSuffixSlice)) {
|
|
assert(ts_sz == 8);
|
|
return ToggleUDT::kEnableUDT;
|
|
}
|
|
if (old_ucmp_name.size() == new_ucmp_name.size() + kSuffixSize &&
|
|
old_ucmp_name.starts_with(new_ucmp_name) &&
|
|
old_ucmp_name.ends_with(kSuffixSlice)) {
|
|
assert(ts_sz == 0);
|
|
return ToggleUDT::kDisableUDT;
|
|
}
|
|
return ToggleUDT::kInvalidChange;
|
|
}
|
|
} // namespace
|
|
|
|
TimestampRecoveryHandler::TimestampRecoveryHandler(
|
|
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
const UnorderedMap<uint32_t, size_t>& record_ts_sz)
|
|
: running_ts_sz_(running_ts_sz),
|
|
record_ts_sz_(record_ts_sz),
|
|
new_batch_(new WriteBatch()),
|
|
handler_valid_(true),
|
|
new_batch_diff_from_orig_batch_(false) {}
|
|
|
|
Status TimestampRecoveryHandler::PutCF(uint32_t cf, const Slice& key,
|
|
const Slice& value) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::Put(new_batch_.get(), cf, new_key, value);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::PutEntityCF(uint32_t cf, const Slice& key,
|
|
const Slice& entity) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status = TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
|
|
cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
Slice entity_copy = entity;
|
|
WideColumns columns;
|
|
if (!WideColumnSerialization::Deserialize(entity_copy, columns).ok()) {
|
|
return Status::Corruption("Unable to deserialize entity",
|
|
entity.ToString(/* hex */ true));
|
|
}
|
|
|
|
return WriteBatchInternal::PutEntity(new_batch_.get(), cf, new_key, columns);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::TimedPutCF(uint32_t cf, const Slice& key,
|
|
const Slice& value,
|
|
uint64_t write_time) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::TimedPut(new_batch_.get(), cf, new_key, value,
|
|
write_time);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::DeleteCF(uint32_t cf, const Slice& key) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::Delete(new_batch_.get(), cf, new_key);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::SingleDeleteCF(uint32_t cf, const Slice& key) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::SingleDelete(new_batch_.get(), cf, new_key);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::DeleteRangeCF(uint32_t cf,
|
|
const Slice& begin_key,
|
|
const Slice& end_key) {
|
|
std::string new_begin_key_buf;
|
|
Slice new_begin_key;
|
|
std::string new_end_key_buf;
|
|
Slice new_end_key;
|
|
Status status = ReconcileTimestampDiscrepancy(
|
|
cf, begin_key, &new_begin_key_buf, &new_begin_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
status = ReconcileTimestampDiscrepancy(cf, end_key, &new_end_key_buf,
|
|
&new_end_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::DeleteRange(new_batch_.get(), cf, new_begin_key,
|
|
new_end_key);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::MergeCF(uint32_t cf, const Slice& key,
|
|
const Slice& value) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::Merge(new_batch_.get(), cf, new_key, value);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::PutBlobIndexCF(uint32_t cf, const Slice& key,
|
|
const Slice& value) {
|
|
std::string new_key_buf;
|
|
Slice new_key;
|
|
Status status =
|
|
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
return WriteBatchInternal::PutBlobIndex(new_batch_.get(), cf, new_key, value);
|
|
}
|
|
|
|
Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
|
|
uint32_t cf, const Slice& key, std::string* new_key_buf, Slice* new_key) {
|
|
assert(handler_valid_);
|
|
auto running_iter = running_ts_sz_.find(cf);
|
|
if (running_iter == running_ts_sz_.end()) {
|
|
// The column family referred to by the WriteBatch is no longer running.
|
|
// Copy over the entry as is to the new WriteBatch.
|
|
*new_key = key;
|
|
return Status::OK();
|
|
}
|
|
size_t running_ts_sz = running_iter->second;
|
|
auto record_iter = record_ts_sz_.find(cf);
|
|
std::optional<size_t> record_ts_sz =
|
|
record_iter != record_ts_sz_.end()
|
|
? std::optional<size_t>(record_iter->second)
|
|
: std::nullopt;
|
|
RecoveryType recovery_type = GetRecoveryType(running_ts_sz, record_ts_sz);
|
|
|
|
switch (recovery_type) {
|
|
case RecoveryType::kNoop:
|
|
*new_key = key;
|
|
break;
|
|
case RecoveryType::kStripTimestamp:
|
|
assert(record_ts_sz.has_value());
|
|
*new_key = StripTimestampFromUserKey(key, *record_ts_sz);
|
|
new_batch_diff_from_orig_batch_ = true;
|
|
break;
|
|
case RecoveryType::kPadTimestamp:
|
|
AppendKeyWithMinTimestamp(new_key_buf, key, running_ts_sz);
|
|
*new_key = *new_key_buf;
|
|
new_batch_diff_from_orig_batch_ = true;
|
|
break;
|
|
case RecoveryType::kUnrecoverable:
|
|
return Status::InvalidArgument(
|
|
"Unrecoverable timestamp size inconsistency encountered by "
|
|
"TimestampRecoveryHandler.");
|
|
default:
|
|
assert(false);
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status HandleWriteBatchTimestampSizeDifference(
|
|
const WriteBatch* batch,
|
|
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
const UnorderedMap<uint32_t, size_t>& record_ts_sz,
|
|
TimestampSizeConsistencyMode check_mode,
|
|
std::unique_ptr<WriteBatch>* new_batch) {
|
|
// Quick path to bypass checking the WriteBatch.
|
|
if (AllRunningColumnFamiliesConsistent(running_ts_sz, record_ts_sz)) {
|
|
return Status::OK();
|
|
}
|
|
bool need_recovery = false;
|
|
Status status = CheckWriteBatchTimestampSizeConsistency(
|
|
batch, running_ts_sz, record_ts_sz, check_mode, &need_recovery);
|
|
if (!status.ok()) {
|
|
return status;
|
|
} else if (need_recovery) {
|
|
assert(new_batch);
|
|
SequenceNumber sequence = WriteBatchInternal::Sequence(batch);
|
|
TimestampRecoveryHandler recovery_handler(running_ts_sz, record_ts_sz);
|
|
status = batch->Iterate(&recovery_handler);
|
|
if (!status.ok()) {
|
|
return status;
|
|
} else {
|
|
*new_batch = recovery_handler.TransferNewBatch();
|
|
WriteBatchInternal::SetSequence(new_batch->get(), sequence);
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status ValidateUserDefinedTimestampsOptions(
|
|
const Comparator* new_comparator, const std::string& old_comparator_name,
|
|
bool new_persist_udt, bool old_persist_udt,
|
|
bool* mark_sst_files_has_no_udt) {
|
|
size_t ts_sz = new_comparator->timestamp_size();
|
|
ToggleUDT res = CompareComparator(new_comparator, old_comparator_name);
|
|
switch (res) {
|
|
case ToggleUDT::kUnchanged:
|
|
if (old_persist_udt == new_persist_udt) {
|
|
return Status::OK();
|
|
}
|
|
if (ts_sz == 0) {
|
|
return Status::OK();
|
|
}
|
|
return Status::InvalidArgument(
|
|
"Cannot toggle the persist_user_defined_timestamps flag for a column "
|
|
"family with user-defined timestamps feature enabled.");
|
|
case ToggleUDT::kEnableUDT:
|
|
if (!new_persist_udt) {
|
|
*mark_sst_files_has_no_udt = true;
|
|
return Status::OK();
|
|
}
|
|
return Status::InvalidArgument(
|
|
"Cannot open a column family and enable user-defined timestamps "
|
|
"feature without setting persist_user_defined_timestamps flag to "
|
|
"false.");
|
|
case ToggleUDT::kDisableUDT:
|
|
if (!old_persist_udt) {
|
|
return Status::OK();
|
|
}
|
|
return Status::InvalidArgument(
|
|
"Cannot open a column family and disable user-defined timestamps "
|
|
"feature if its existing persist_user_defined_timestamps flag is not "
|
|
"false.");
|
|
case ToggleUDT::kInvalidChange:
|
|
return Status::InvalidArgument(
|
|
new_comparator->Name(),
|
|
"does not match existing comparator " + old_comparator_name);
|
|
default:
|
|
break;
|
|
}
|
|
return Status::InvalidArgument(
|
|
"Unsupported user defined timestamps settings change.");
|
|
}
|
|
|
|
void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
|
|
std::string* full_history_ts_low) {
|
|
uint64_t cutoff_udt_ts = 0;
|
|
[[maybe_unused]] bool format_res = GetFixed64(cutoff_ts, &cutoff_udt_ts);
|
|
assert(format_res);
|
|
PutFixed64(full_history_ts_low, cutoff_udt_ts + 1);
|
|
}
|
|
|
|
std::tuple<std::optional<Slice>, std::optional<Slice>>
|
|
MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
|
|
std::string* start_with_ts, std::string* end_with_ts,
|
|
bool exclusive_end) {
|
|
std::optional<Slice> ret_start, ret_end;
|
|
if (start) {
|
|
if (ts_sz == 0) {
|
|
ret_start = *start;
|
|
} else {
|
|
// Maximum timestamp means including all keys with any timestamp for start
|
|
AppendKeyWithMaxTimestamp(start_with_ts, *start, ts_sz);
|
|
ret_start = Slice(*start_with_ts);
|
|
}
|
|
}
|
|
if (end) {
|
|
if (ts_sz == 0) {
|
|
ret_end = *end;
|
|
} else {
|
|
if (exclusive_end) {
|
|
// Append a maximum timestamp as the range limit is exclusive:
|
|
// [start, end)
|
|
AppendKeyWithMaxTimestamp(end_with_ts, *end, ts_sz);
|
|
} else {
|
|
// Append a minimum timestamp to end so the range limit is inclusive:
|
|
// [start, end]
|
|
AppendKeyWithMinTimestamp(end_with_ts, *end, ts_sz);
|
|
}
|
|
ret_end = Slice(*end_with_ts);
|
|
}
|
|
}
|
|
return std::make_tuple(ret_start, ret_end);
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|