mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-25 14:31:35 +00:00
15053f3ab4
Summary: Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a mininum timestamp, which for now is hard coded to be all zeros bytes. While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous test doesn't caught the bug because in those tests, the size of the stripped user-defined timestamps bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced. ```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000``` Compiled with DEBUG_LEVEL=0 Test vs. control runs simulaneous for better accuracy, units = ops/sec PR vs base: Round 1: 350652 vs 349055 Round 2: 365733 vs 364308 Round 3: 355681 vs 354475 Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557 Test Plan: New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`: `UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp `UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false. ``` make all check ./db_wal_test --gtest_filter="*WithTimestamp*" ./flush_job_test --gtest_filter="*WithTimestamp*" ./repair_test --gtest_filter="*WithTimestamp*" ./block_based_table_reader_test ``` Reviewed By: pdillinger Differential Revision: D47027664 Pulled By: jowlyzhang fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
243 lines
7.9 KiB
C++
243 lines
7.9 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
#include "db/dbformat.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <cinttypes>
|
|
|
|
#include "db/lookup_key.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "port/port.h"
|
|
#include "util/coding.h"
|
|
#include "util/string_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// kValueTypeForSeek defines the ValueType that should be passed when
|
|
// constructing a ParsedInternalKey object for seeking to a particular
|
|
// sequence number (since we sort sequence numbers in decreasing order
|
|
// and the value type is embedded as the low 8 bits in the sequence
|
|
// number in internal keys, we need to use the highest-numbered
|
|
// ValueType, not the lowest).
|
|
const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
|
|
const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
|
|
const std::string kDisableUserTimestamp("");
|
|
|
|
EntryType GetEntryType(ValueType value_type) {
|
|
switch (value_type) {
|
|
case kTypeValue:
|
|
return kEntryPut;
|
|
case kTypeDeletion:
|
|
return kEntryDelete;
|
|
case kTypeDeletionWithTimestamp:
|
|
return kEntryDeleteWithTimestamp;
|
|
case kTypeSingleDeletion:
|
|
return kEntrySingleDelete;
|
|
case kTypeMerge:
|
|
return kEntryMerge;
|
|
case kTypeRangeDeletion:
|
|
return kEntryRangeDeletion;
|
|
case kTypeBlobIndex:
|
|
return kEntryBlobIndex;
|
|
case kTypeWideColumnEntity:
|
|
return kEntryWideColumnEntity;
|
|
default:
|
|
return kEntryOther;
|
|
}
|
|
}
|
|
|
|
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
|
|
result->append(key.user_key.data(), key.user_key.size());
|
|
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
|
}
|
|
|
|
void AppendInternalKeyWithDifferentTimestamp(std::string* result,
|
|
const ParsedInternalKey& key,
|
|
const Slice& ts) {
|
|
assert(key.user_key.size() >= ts.size());
|
|
result->append(key.user_key.data(), key.user_key.size() - ts.size());
|
|
result->append(ts.data(), ts.size());
|
|
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
|
}
|
|
|
|
void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
|
|
ValueType t) {
|
|
PutFixed64(result, PackSequenceAndType(s, t));
|
|
}
|
|
|
|
void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
assert(ts_sz > 0);
|
|
const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
|
|
result->append(key.data(), key.size());
|
|
result->append(kTsMin.data(), ts_sz);
|
|
}
|
|
|
|
void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
assert(ts_sz > 0);
|
|
const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
|
|
result->append(key.data(), key.size());
|
|
result->append(kTsMax.data(), ts_sz);
|
|
}
|
|
|
|
void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
assert(ts_sz > 0);
|
|
result->append(key.data(), key.size() - ts_sz);
|
|
|
|
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
|
|
if (ts_sz < strlen(kTsMax)) {
|
|
result->append(kTsMax, ts_sz);
|
|
} else {
|
|
result->append(std::string(ts_sz, '\xff'));
|
|
}
|
|
}
|
|
|
|
void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
assert(ts_sz > 0);
|
|
size_t user_key_size = key.size() - kNumInternalBytes;
|
|
result->reserve(key.size() + ts_sz);
|
|
result->append(key.data(), user_key_size);
|
|
result->append(ts_sz, static_cast<unsigned char>(0));
|
|
result->append(key.data() + user_key_size, kNumInternalBytes);
|
|
}
|
|
|
|
void StripTimestampFromInternalKey(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
assert(key.size() >= ts_sz + kNumInternalBytes);
|
|
result->reserve(key.size() - ts_sz);
|
|
result->append(key.data(), key.size() - kNumInternalBytes - ts_sz);
|
|
result->append(key.data() + key.size() - kNumInternalBytes,
|
|
kNumInternalBytes);
|
|
}
|
|
|
|
void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
size_t ts_sz) {
|
|
const size_t key_sz = key.size();
|
|
assert(key_sz >= ts_sz + kNumInternalBytes);
|
|
result->reserve(key_sz);
|
|
result->append(key.data(), key_sz - kNumInternalBytes - ts_sz);
|
|
result->append(ts_sz, static_cast<unsigned char>(0));
|
|
result->append(key.data() + key_sz - kNumInternalBytes, kNumInternalBytes);
|
|
}
|
|
|
|
std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
|
|
std::string result = "'";
|
|
if (log_err_key) {
|
|
result += user_key.ToString(hex);
|
|
} else {
|
|
result += "<redacted>";
|
|
}
|
|
|
|
char buf[50];
|
|
snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
|
|
static_cast<int>(type));
|
|
|
|
result += buf;
|
|
return result;
|
|
}
|
|
|
|
std::string InternalKey::DebugString(bool hex) const {
|
|
std::string result;
|
|
ParsedInternalKey parsed;
|
|
if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
|
|
result = parsed.DebugString(true /* log_err_key */, hex); // TODO
|
|
} else {
|
|
result = "(bad)";
|
|
result.append(EscapeString(rep_));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
|
|
const ParsedInternalKey& b) const {
|
|
// Order by:
|
|
// increasing user key (according to user-supplied comparator)
|
|
// decreasing sequence number
|
|
// decreasing type (though sequence# should be enough to disambiguate)
|
|
int r = user_comparator_.Compare(a.user_key, b.user_key);
|
|
if (r == 0) {
|
|
if (a.sequence > b.sequence) {
|
|
r = -1;
|
|
} else if (a.sequence < b.sequence) {
|
|
r = +1;
|
|
} else if (a.type > b.type) {
|
|
r = -1;
|
|
} else if (a.type < b.type) {
|
|
r = +1;
|
|
}
|
|
}
|
|
return r;
|
|
}
|
|
|
|
int InternalKeyComparator::Compare(const Slice& a,
|
|
const ParsedInternalKey& b) const {
|
|
// Order by:
|
|
// increasing user key (according to user-supplied comparator)
|
|
// decreasing sequence number
|
|
// decreasing type (though sequence# should be enough to disambiguate)
|
|
int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
|
|
if (r == 0) {
|
|
const uint64_t anum =
|
|
DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
|
|
const uint64_t bnum = (b.sequence << 8) | b.type;
|
|
if (anum > bnum) {
|
|
r = -1;
|
|
} else if (anum < bnum) {
|
|
r = +1;
|
|
}
|
|
}
|
|
return r;
|
|
}
|
|
|
|
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
|
|
const Slice& b) const {
|
|
return -Compare(b, a);
|
|
}
|
|
|
|
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
|
|
const Slice* ts) {
|
|
size_t usize = _user_key.size();
|
|
size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
|
|
size_t needed = usize + ts_sz + 13; // A conservative estimate
|
|
char* dst;
|
|
if (needed <= sizeof(space_)) {
|
|
dst = space_;
|
|
} else {
|
|
dst = new char[needed];
|
|
}
|
|
start_ = dst;
|
|
// NOTE: We don't support users keys of more than 2GB :)
|
|
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
|
|
kstart_ = dst;
|
|
memcpy(dst, _user_key.data(), usize);
|
|
dst += usize;
|
|
if (nullptr != ts) {
|
|
memcpy(dst, ts->data(), ts_sz);
|
|
dst += ts_sz;
|
|
}
|
|
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
|
|
dst += 8;
|
|
end_ = dst;
|
|
}
|
|
|
|
void IterKey::EnlargeBuffer(size_t key_size) {
|
|
// If size is smaller than buffer size, continue using current buffer,
|
|
// or the static allocated one, as default
|
|
assert(key_size > buf_size_);
|
|
// Need to enlarge the buffer.
|
|
ResetBuffer();
|
|
buf_ = new char[key_size];
|
|
buf_size_ = key_size;
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|