rocksdb/db/dbformat.cc
Yu Zhang 071a146fa0 Add support for range deletion when user timestamps are not persisted (#12254)
Summary:
For the user defined timestamps in memtable only feature, some special handling for range deletion blocks are needed since both the key (start_key) and the value (end_key) of a range tombstone can contain user-defined timestamps. Handling for the key is taken care of in the same way as the other data blocks in the block based table. This PR adds the special handling needed for the value (end_key) part. This includes:

1) On the write path, when L0 SST files are first created from flush, user-defined timestamps are removed from an end key of a range tombstone. There are places where it's logically removed (replaced with a min timestamp) because there is still logic with the running comparator that expects a user key that contains timestamp. And in the block based builder, it is eventually physically removed before persisted in a block.

2) On the read path, when range deletion block is being read, we artificially pad a min timestamp to the end key of a range tombstone in `BlockBasedTableReader`.

3) For file boundary `FileMetaData.largest`, we artificially pad a max timestamp to it if it contains a range deletion sentinel. Anytime when range deletion end_key is used to update file boundaries, it's using max timestamp instead of the range tombstone's actual timestamp to mark it as an exclusive end. d69628e6ce/db/dbformat.h (L923-L935)
This max timestamp is removed when in memory `FileMetaData.largest` is persisted into Manifest, we pad it back when it's read from Manifest while handling related `VersionEdit` in `VersionEditHandler`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12254

Test Plan: Added unit test and enabled this feature combination's stress test.

Reviewed By: cbi42

Differential Revision: D52965527

Pulled By: jowlyzhang

fbshipit-source-id: e8315f8a2c5268e2ae0f7aec8012c266b86df985
2024-01-29 11:37:34 -08:00

268 lines
8.9 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include <cinttypes>
#include <cstdio>
#include "db/lookup_key.h"
#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
const std::string kDisableUserTimestamp;
EntryType GetEntryType(ValueType value_type) {
switch (value_type) {
case kTypeValue:
return kEntryPut;
case kTypeDeletion:
return kEntryDelete;
case kTypeDeletionWithTimestamp:
return kEntryDeleteWithTimestamp;
case kTypeSingleDeletion:
return kEntrySingleDelete;
case kTypeMerge:
return kEntryMerge;
case kTypeRangeDeletion:
return kEntryRangeDeletion;
case kTypeBlobIndex:
return kEntryBlobIndex;
case kTypeWideColumnEntity:
return kEntryWideColumnEntity;
default:
return kEntryOther;
}
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
void AppendInternalKeyWithDifferentTimestamp(std::string* result,
const ParsedInternalKey& key,
const Slice& ts) {
assert(key.user_key.size() >= ts.size());
result->append(key.user_key.data(), key.user_key.size() - ts.size());
result->append(ts.data(), ts.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key,
const Slice& ts) {
assert(key.size() >= ts.size());
result->append(key.data(), key.size() - ts.size());
result->append(ts.data(), ts.size());
}
void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
ValueType t) {
PutFixed64(result, PackSequenceAndType(s, t));
}
void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
result->append(key.data(), key.size());
result->append(kTsMin.data(), ts_sz);
}
void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
result->append(key.data(), key.size());
result->append(kTsMax.data(), ts_sz);
}
void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
result->append(key.data(), key.size() - ts_sz);
result->append(ts_sz, static_cast<unsigned char>(0));
}
void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
result->append(key.data(), key.size() - ts_sz);
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz < strlen(kTsMax)) {
result->append(kTsMax, ts_sz);
} else {
result->append(std::string(ts_sz, '\xff'));
}
}
void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
result->reserve(key.size() + ts_sz);
result->append(key.data(), user_key_size);
result->append(ts_sz, static_cast<unsigned char>(0));
result->append(key.data() + user_key_size, kNumInternalBytes);
}
void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
result->reserve(key.size() + ts_sz);
result->append(key.data(), user_key_size);
result->append(std::string(ts_sz, '\xff'));
result->append(key.data() + user_key_size, kNumInternalBytes);
}
void StripTimestampFromInternalKey(std::string* result, const Slice& key,
size_t ts_sz) {
assert(key.size() >= ts_sz + kNumInternalBytes);
result->reserve(key.size() - ts_sz);
result->append(key.data(), key.size() - kNumInternalBytes - ts_sz);
result->append(key.data() + key.size() - kNumInternalBytes,
kNumInternalBytes);
}
void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
const size_t key_sz = key.size();
assert(key_sz >= ts_sz + kNumInternalBytes);
result->reserve(key_sz);
result->append(key.data(), key_sz - kNumInternalBytes - ts_sz);
result->append(ts_sz, static_cast<unsigned char>(0));
result->append(key.data() + key_sz - kNumInternalBytes, kNumInternalBytes);
}
std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
std::string result = "'";
if (log_err_key) {
result += user_key.ToString(hex);
} else {
result += "<redacted>";
}
char buf[50];
snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
static_cast<int>(type));
result += buf;
return result;
}
std::string InternalKey::DebugString(bool hex) const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
result = parsed.DebugString(true /* log_err_key */, hex); // TODO
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_.Compare(a.user_key, b.user_key);
if (r == 0) {
if (a.sequence > b.sequence) {
r = -1;
} else if (a.sequence < b.sequence) {
r = +1;
} else if (a.type > b.type) {
r = -1;
} else if (a.type < b.type) {
r = +1;
}
}
return r;
}
int InternalKeyComparator::Compare(const Slice& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
if (r == 0) {
const uint64_t anum =
DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
const uint64_t bnum = (b.sequence << 8) | b.type;
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const Slice& b) const {
return -Compare(b, a);
}
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
const Slice* ts) {
size_t usize = _user_key.size();
size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
size_t needed = usize + ts_sz + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
} else {
dst = new char[needed];
}
start_ = dst;
// NOTE: We don't support users keys of more than 2GB :)
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
kstart_ = dst;
memcpy(dst, _user_key.data(), usize);
dst += usize;
if (nullptr != ts) {
memcpy(dst, ts->data(), ts_sz);
dst += ts_sz;
}
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;
}
void IterKey::EnlargeBuffer(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default
assert(key_size > buf_size_);
// Need to enlarge the buffer.
ResetBuffer();
buf_ = new char[key_size];
buf_size_ = key_size;
}
} // namespace ROCKSDB_NAMESPACE