rocksdb/db/dbformat.cc
Yu Zhang 1104eaa35e Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.

The life cycle of such an entry on the write/flush/compaction paths are:

1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`

2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
 when we have easy access to the seqno to time mapping.

3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.

On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.

Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.

2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.

3) Stress test coverage for the feature

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419

Test Plan: Added unit tests

Reviewed By: pdillinger

Differential Revision: D54920296

Pulled By: jowlyzhang

fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 15:44:55 -07:00

268 lines
8.9 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include <cinttypes>
#include <cstdio>
#include "db/lookup_key.h"
#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
const ValueType kValueTypeForSeek = kTypeValuePreferredSeqno;
const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
const std::string kDisableUserTimestamp;
EntryType GetEntryType(ValueType value_type) {
switch (value_type) {
case kTypeValue:
return kEntryPut;
case kTypeDeletion:
return kEntryDelete;
case kTypeDeletionWithTimestamp:
return kEntryDeleteWithTimestamp;
case kTypeSingleDeletion:
return kEntrySingleDelete;
case kTypeMerge:
return kEntryMerge;
case kTypeRangeDeletion:
return kEntryRangeDeletion;
case kTypeBlobIndex:
return kEntryBlobIndex;
case kTypeWideColumnEntity:
return kEntryWideColumnEntity;
default:
return kEntryOther;
}
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
void AppendInternalKeyWithDifferentTimestamp(std::string* result,
const ParsedInternalKey& key,
const Slice& ts) {
assert(key.user_key.size() >= ts.size());
result->append(key.user_key.data(), key.user_key.size() - ts.size());
result->append(ts.data(), ts.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key,
const Slice& ts) {
assert(key.size() >= ts.size());
result->append(key.data(), key.size() - ts.size());
result->append(ts.data(), ts.size());
}
void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
ValueType t) {
PutFixed64(result, PackSequenceAndType(s, t));
}
void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
result->append(key.data(), key.size());
result->append(kTsMin.data(), ts_sz);
}
void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
result->append(key.data(), key.size());
result->append(kTsMax.data(), ts_sz);
}
void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
result->append(key.data(), key.size() - ts_sz);
result->append(ts_sz, static_cast<unsigned char>(0));
}
void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
result->append(key.data(), key.size() - ts_sz);
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz < strlen(kTsMax)) {
result->append(kTsMax, ts_sz);
} else {
result->append(std::string(ts_sz, '\xff'));
}
}
void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
result->reserve(key.size() + ts_sz);
result->append(key.data(), user_key_size);
result->append(ts_sz, static_cast<unsigned char>(0));
result->append(key.data() + user_key_size, kNumInternalBytes);
}
void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
assert(ts_sz > 0);
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
result->reserve(key.size() + ts_sz);
result->append(key.data(), user_key_size);
result->append(std::string(ts_sz, '\xff'));
result->append(key.data() + user_key_size, kNumInternalBytes);
}
void StripTimestampFromInternalKey(std::string* result, const Slice& key,
size_t ts_sz) {
assert(key.size() >= ts_sz + kNumInternalBytes);
result->reserve(key.size() - ts_sz);
result->append(key.data(), key.size() - kNumInternalBytes - ts_sz);
result->append(key.data() + key.size() - kNumInternalBytes,
kNumInternalBytes);
}
void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
size_t ts_sz) {
const size_t key_sz = key.size();
assert(key_sz >= ts_sz + kNumInternalBytes);
result->reserve(key_sz);
result->append(key.data(), key_sz - kNumInternalBytes - ts_sz);
result->append(ts_sz, static_cast<unsigned char>(0));
result->append(key.data() + key_sz - kNumInternalBytes, kNumInternalBytes);
}
std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
std::string result = "'";
if (log_err_key) {
result += user_key.ToString(hex);
} else {
result += "<redacted>";
}
char buf[50];
snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
static_cast<int>(type));
result += buf;
return result;
}
std::string InternalKey::DebugString(bool hex) const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
result = parsed.DebugString(true /* log_err_key */, hex); // TODO
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_.Compare(a.user_key, b.user_key);
if (r == 0) {
if (a.sequence > b.sequence) {
r = -1;
} else if (a.sequence < b.sequence) {
r = +1;
} else if (a.type > b.type) {
r = -1;
} else if (a.type < b.type) {
r = +1;
}
}
return r;
}
int InternalKeyComparator::Compare(const Slice& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
if (r == 0) {
const uint64_t anum =
DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
const uint64_t bnum = (b.sequence << 8) | b.type;
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const Slice& b) const {
return -Compare(b, a);
}
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
const Slice* ts) {
size_t usize = _user_key.size();
size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
size_t needed = usize + ts_sz + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
} else {
dst = new char[needed];
}
start_ = dst;
// NOTE: We don't support users keys of more than 2GB :)
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
kstart_ = dst;
memcpy(dst, _user_key.data(), usize);
dst += usize;
if (nullptr != ts) {
memcpy(dst, ts->data(), ts_sz);
dst += ts_sz;
}
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;
}
void IterKey::EnlargeBuffer(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default
assert(key_size > buf_size_);
// Need to enlarge the buffer.
ResetBuffer();
buf_ = new char[key_size];
buf_size_ = key_size;
}
} // namespace ROCKSDB_NAMESPACE