mirror of https://github.com/facebook/rocksdb.git
Add some per key optimization for UDT in memtable only feature (#13031)
Summary: This PR adds some optimizations to the per-key handling of SST files for the user-defined timestamps (UDT) in Memtable only feature. CPU profiling shows that this part is a major contributor to the regression. The optimization saves some string construction/destruction/appending/copying, as well as vector operations like reserve/emplace_back. When iterating keys in a block, we need to copy some shared bytes from the previous key, put them together with the non-shared bytes, and find the right location to pad the min timestamp. Previously, we created a temporary local string buffer to first construct the key from its pieces, and then copied this local string's content into `IterKey`'s buffer. To avoid this local string and the extra copy, instead of piecing together the key in a local string first, we now just track all the pieces that make up the key in a reused Slice array, and then copy the pieces in order into `IterKey`'s buffer. Since the previous key must be kept intact while we are copying shared bytes from it, we added a secondary buffer to `IterKey` and alternate between the primary buffer and the secondary buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13031 Test Plan: Existing tests. Reviewed By: ltamasi Differential Revision: D63416531 Pulled By: jowlyzhang fbshipit-source-id: 9819b0e02301a2dbc90621b2fe4f651bc912113c
This commit is contained in:
parent
917e98ff9e
commit
32dd657bad
|
@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
|
||||||
|
|
||||||
void IterKey::EnlargeBuffer(size_t key_size) {
|
void IterKey::EnlargeBuffer(size_t key_size) {
|
||||||
// If size is smaller than buffer size, continue using current buffer,
|
// If size is smaller than buffer size, continue using current buffer,
|
||||||
// or the static allocated one, as default
|
// or the inline one, as default
|
||||||
assert(key_size > buf_size_);
|
assert(key_size > buf_size_);
|
||||||
// Need to enlarge the buffer.
|
// Need to enlarge the buffer.
|
||||||
ResetBuffer();
|
ResetBuffer();
|
||||||
buf_ = new char[key_size];
|
buf_ = new char[key_size];
|
||||||
buf_size_ = key_size;
|
buf_size_ = key_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
|
||||||
|
// If size is smaller than buffer size, continue using current buffer,
|
||||||
|
// or the inline one, as default
|
||||||
|
if (key_size <= secondary_buf_size_) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Need to enlarge the secondary buffer.
|
||||||
|
ResetSecondaryBuffer();
|
||||||
|
secondary_buf_ = new char[key_size];
|
||||||
|
secondary_buf_size_ = key_size;
|
||||||
|
}
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
177
db/dbformat.h
177
db/dbformat.h
|
@ -10,6 +10,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include <array>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -562,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
|
||||||
// allocation for smaller keys.
|
// allocation for smaller keys.
|
||||||
// 3. It tracks user key or internal key, and allow conversion between them.
|
// 3. It tracks user key or internal key, and allow conversion between them.
|
||||||
class IterKey {
|
class IterKey {
|
||||||
|
static constexpr size_t kInlineBufferSize = 39;
|
||||||
|
// This is only used by user-defined timestamps in MemTable only feature,
|
||||||
|
// which only supports uint64_t timestamps.
|
||||||
|
static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
|
||||||
|
|
||||||
public:
|
public:
|
||||||
IterKey()
|
IterKey()
|
||||||
: buf_(space_),
|
: buf_(space_),
|
||||||
key_(buf_),
|
key_(buf_),
|
||||||
key_size_(0),
|
key_size_(0),
|
||||||
buf_size_(sizeof(space_)),
|
buf_size_(kInlineBufferSize),
|
||||||
is_user_key_(true) {}
|
is_user_key_(true),
|
||||||
|
secondary_buf_(space_for_secondary_buf_),
|
||||||
|
secondary_buf_size_(kInlineBufferSize) {}
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
IterKey(const IterKey&) = delete;
|
IterKey(const IterKey&) = delete;
|
||||||
void operator=(const IterKey&) = delete;
|
void operator=(const IterKey&) = delete;
|
||||||
|
|
||||||
~IterKey() { ResetBuffer(); }
|
~IterKey() {
|
||||||
|
ResetBuffer();
|
||||||
|
ResetSecondaryBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
// The bool will be picked up by the next calls to SetKey
|
// The bool will be picked up by the next calls to SetKey
|
||||||
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
|
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
|
||||||
|
@ -641,13 +652,15 @@ class IterKey {
|
||||||
const char* non_shared_data,
|
const char* non_shared_data,
|
||||||
const size_t non_shared_len,
|
const size_t non_shared_len,
|
||||||
const size_t ts_sz) {
|
const size_t ts_sz) {
|
||||||
std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
|
// This function is only used by the UDT in memtable feature, which only
|
||||||
std::string key_with_ts;
|
// supports built-in comparators with uint64 timestamps.
|
||||||
std::vector<Slice> key_parts_with_ts;
|
assert(ts_sz == sizeof(uint64_t));
|
||||||
|
size_t next_key_slice_index = 0;
|
||||||
if (IsUserKey()) {
|
if (IsUserKey()) {
|
||||||
key_parts_with_ts = {Slice(key_, shared_len),
|
key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
|
||||||
Slice(non_shared_data, non_shared_len),
|
key_slices_[next_key_slice_index++] =
|
||||||
Slice(kTsMin)};
|
Slice(non_shared_data, non_shared_len);
|
||||||
|
key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
|
||||||
} else {
|
} else {
|
||||||
assert(shared_len + non_shared_len >= kNumInternalBytes);
|
assert(shared_len + non_shared_len >= kNumInternalBytes);
|
||||||
// Invariant: shared_user_key_len + shared_internal_bytes_len = shared_len
|
// Invariant: shared_user_key_len + shared_internal_bytes_len = shared_len
|
||||||
|
@ -664,30 +677,46 @@ class IterKey {
|
||||||
|
|
||||||
// One Slice among the three Slices will get split into two Slices, plus
|
// One Slice among the three Slices will get split into two Slices, plus
|
||||||
// a timestamp slice.
|
// a timestamp slice.
|
||||||
key_parts_with_ts.reserve(5);
|
|
||||||
bool ts_added = false;
|
bool ts_added = false;
|
||||||
// Add slice parts and find the right location to add the min timestamp.
|
// Add slice parts and find the right location to add the min timestamp.
|
||||||
MaybeAddKeyPartsWithTimestamp(
|
MaybeAddKeyPartsWithTimestamp(
|
||||||
key_, shared_user_key_len,
|
key_, shared_user_key_len,
|
||||||
shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
|
shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
|
||||||
shared_len + non_shared_len - kNumInternalBytes, kTsMin,
|
shared_len + non_shared_len - kNumInternalBytes, ts_sz,
|
||||||
key_parts_with_ts, &ts_added);
|
&next_key_slice_index, &ts_added);
|
||||||
MaybeAddKeyPartsWithTimestamp(
|
MaybeAddKeyPartsWithTimestamp(
|
||||||
key_ + user_key_len, shared_internal_bytes_len,
|
key_ + user_key_len, shared_internal_bytes_len,
|
||||||
non_shared_len < kNumInternalBytes,
|
non_shared_len < kNumInternalBytes,
|
||||||
shared_internal_bytes_len + non_shared_len - kNumInternalBytes,
|
shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
|
||||||
kTsMin, key_parts_with_ts, &ts_added);
|
&next_key_slice_index, &ts_added);
|
||||||
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
|
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
|
||||||
non_shared_len >= kNumInternalBytes,
|
non_shared_len >= kNumInternalBytes,
|
||||||
non_shared_len - kNumInternalBytes, kTsMin,
|
non_shared_len - kNumInternalBytes, ts_sz,
|
||||||
key_parts_with_ts, &ts_added);
|
&next_key_slice_index, &ts_added);
|
||||||
assert(ts_added);
|
assert(ts_added);
|
||||||
}
|
}
|
||||||
|
SetKeyImpl(next_key_slice_index,
|
||||||
|
/* total_bytes= */ shared_len + non_shared_len + ts_sz);
|
||||||
|
}
|
||||||
|
|
||||||
Slice new_key(SliceParts(&key_parts_with_ts.front(),
|
Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
|
||||||
static_cast<int>(key_parts_with_ts.size())),
|
// This function is only used by the UDT in memtable feature, which only
|
||||||
&key_with_ts);
|
// supports built-in comparators with uint64 timestamps.
|
||||||
SetKey(new_key);
|
assert(ts_sz == sizeof(uint64_t));
|
||||||
|
size_t num_key_slices = 0;
|
||||||
|
if (is_user_key_) {
|
||||||
|
key_slices_[0] = key;
|
||||||
|
key_slices_[1] = Slice(kTsMin, ts_sz);
|
||||||
|
num_key_slices = 2;
|
||||||
|
} else {
|
||||||
|
assert(key.size() >= kNumInternalBytes);
|
||||||
|
size_t user_key_size = key.size() - kNumInternalBytes;
|
||||||
|
key_slices_[0] = Slice(key.data(), user_key_size);
|
||||||
|
key_slices_[1] = Slice(kTsMin, ts_sz);
|
||||||
|
key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
|
||||||
|
num_key_slices = 3;
|
||||||
|
}
|
||||||
|
return SetKeyImpl(num_key_slices, key.size() + ts_sz);
|
||||||
}
|
}
|
||||||
|
|
||||||
Slice SetKey(const Slice& key, bool copy = true) {
|
Slice SetKey(const Slice& key, bool copy = true) {
|
||||||
|
@ -718,15 +747,6 @@ class IterKey {
|
||||||
return Slice(key_, key_n);
|
return Slice(key_, key_n);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy the key into IterKey own buf_
|
|
||||||
void OwnKey() {
|
|
||||||
assert(IsKeyPinned() == true);
|
|
||||||
|
|
||||||
Reserve(key_size_);
|
|
||||||
memcpy(buf_, key_, key_size_);
|
|
||||||
key_ = buf_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the sequence number in the internal key. Guarantees not to
|
// Update the sequence number in the internal key. Guarantees not to
|
||||||
// invalidate slices to the key (and the user key).
|
// invalidate slices to the key (and the user key).
|
||||||
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
|
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
|
||||||
|
@ -738,10 +758,15 @@ class IterKey {
|
||||||
ts->size());
|
ts->size());
|
||||||
}
|
}
|
||||||
uint64_t newval = (seq << 8) | t;
|
uint64_t newval = (seq << 8) | t;
|
||||||
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
|
if (key_ == buf_) {
|
||||||
|
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
|
||||||
|
} else {
|
||||||
|
assert(key_ == secondary_buf_);
|
||||||
|
EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IsKeyPinned() const { return (key_ != buf_); }
|
bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
|
||||||
|
|
||||||
// If `ts` is provided, user_key should not contain timestamp,
|
// If `ts` is provided, user_key should not contain timestamp,
|
||||||
// and `ts` is appended after user_key.
|
// and `ts` is appended after user_key.
|
||||||
|
@ -806,8 +831,24 @@ class IterKey {
|
||||||
const char* key_;
|
const char* key_;
|
||||||
size_t key_size_;
|
size_t key_size_;
|
||||||
size_t buf_size_;
|
size_t buf_size_;
|
||||||
char space_[39]; // Avoid allocation for short keys
|
char space_[kInlineBufferSize]; // Avoid allocation for short keys
|
||||||
bool is_user_key_;
|
bool is_user_key_;
|
||||||
|
// Below variables are only used by user-defined timestamps in MemTable only
|
||||||
|
// feature for iterating keys in an index block or a data block.
|
||||||
|
//
|
||||||
|
// We will alternate between buf_ and secondary_buf_ to hold the key. key_
|
||||||
|
// will be modified in accordance to point to the right one. This is to avoid
|
||||||
|
// an extra copy when we need to copy some shared bytes from previous key
|
||||||
|
// (delta encoding), and we need to pad a min timestamp at the right location.
|
||||||
|
char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
|
||||||
|
// short keys
|
||||||
|
char* secondary_buf_;
|
||||||
|
size_t secondary_buf_size_;
|
||||||
|
// Used to track the pieces that together make the whole key. We then copy
|
||||||
|
// these pieces in order either into buf_ or secondary_buf_ depending on where
|
||||||
|
// the previous key is held.
|
||||||
|
std::array<Slice, 5> key_slices_;
|
||||||
|
// End of variables used by user-defined timestamps in MemTable only feature.
|
||||||
|
|
||||||
Slice SetKeyImpl(const Slice& key, bool copy) {
|
Slice SetKeyImpl(const Slice& key, bool copy) {
|
||||||
size_t size = key.size();
|
size_t size = key.size();
|
||||||
|
@ -824,18 +865,64 @@ class IterKey {
|
||||||
return Slice(key_, key_size_);
|
return Slice(key_, key_size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
|
||||||
|
assert(num_key_slices <= 5);
|
||||||
|
char* buf_start = nullptr;
|
||||||
|
if (key_ == buf_) {
|
||||||
|
// If the previous key is in buf_, we copy key_slices_ in order into
|
||||||
|
// secondary_buf_.
|
||||||
|
EnlargeSecondaryBufferIfNeeded(total_bytes);
|
||||||
|
buf_start = secondary_buf_;
|
||||||
|
key_ = secondary_buf_;
|
||||||
|
} else {
|
||||||
|
// Copy key_slices_ in order into buf_.
|
||||||
|
EnlargeBufferIfNeeded(total_bytes);
|
||||||
|
buf_start = buf_;
|
||||||
|
key_ = buf_;
|
||||||
|
}
|
||||||
|
#ifndef NDEBUG
|
||||||
|
size_t actual_total_bytes = 0;
|
||||||
|
#endif // NDEBUG
|
||||||
|
for (size_t i = 0; i < num_key_slices; i++) {
|
||||||
|
size_t key_slice_size = key_slices_[i].size();
|
||||||
|
memcpy(buf_start, key_slices_[i].data(), key_slice_size);
|
||||||
|
buf_start += key_slice_size;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
actual_total_bytes += key_slice_size;
|
||||||
|
#endif // NDEBUG
|
||||||
|
}
|
||||||
|
#ifndef NDEBUG
|
||||||
|
assert(actual_total_bytes == total_bytes);
|
||||||
|
#endif // NDEBUG
|
||||||
|
key_size_ = total_bytes;
|
||||||
|
return Slice(key_, key_size_);
|
||||||
|
}
|
||||||
|
|
||||||
void ResetBuffer() {
|
void ResetBuffer() {
|
||||||
|
if (key_ == buf_) {
|
||||||
|
key_size_ = 0;
|
||||||
|
}
|
||||||
if (buf_ != space_) {
|
if (buf_ != space_) {
|
||||||
delete[] buf_;
|
delete[] buf_;
|
||||||
buf_ = space_;
|
buf_ = space_;
|
||||||
}
|
}
|
||||||
buf_size_ = sizeof(space_);
|
buf_size_ = kInlineBufferSize;
|
||||||
key_size_ = 0;
|
}
|
||||||
|
|
||||||
|
void ResetSecondaryBuffer() {
|
||||||
|
if (key_ == secondary_buf_) {
|
||||||
|
key_size_ = 0;
|
||||||
|
}
|
||||||
|
if (secondary_buf_ != space_for_secondary_buf_) {
|
||||||
|
delete[] secondary_buf_;
|
||||||
|
secondary_buf_ = space_for_secondary_buf_;
|
||||||
|
}
|
||||||
|
secondary_buf_size_ = kInlineBufferSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enlarge the buffer size if needed based on key_size.
|
// Enlarge the buffer size if needed based on key_size.
|
||||||
// By default, static allocated buffer is used. Once there is a key
|
// By default, inline buffer is used. Once there is a key
|
||||||
// larger than the static allocated buffer, another buffer is dynamically
|
// larger than the inline buffer, another buffer is dynamically
|
||||||
// allocated, until a larger key buffer is requested. In that case, we
|
// allocated, until a larger key buffer is requested. In that case, we
|
||||||
// reallocate buffer and delete the old one.
|
// reallocate buffer and delete the old one.
|
||||||
void EnlargeBufferIfNeeded(size_t key_size) {
|
void EnlargeBufferIfNeeded(size_t key_size) {
|
||||||
|
@ -846,23 +933,27 @@ class IterKey {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void EnlargeSecondaryBufferIfNeeded(size_t key_size);
|
||||||
|
|
||||||
void EnlargeBuffer(size_t key_size);
|
void EnlargeBuffer(size_t key_size);
|
||||||
|
|
||||||
void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
|
void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
|
||||||
const size_t slice_sz, bool add_timestamp,
|
const size_t slice_sz, bool add_timestamp,
|
||||||
const size_t left_sz,
|
const size_t left_sz, const size_t ts_sz,
|
||||||
const std::string& min_timestamp,
|
size_t* next_key_slice_idx,
|
||||||
std::vector<Slice>& key_parts,
|
|
||||||
bool* ts_added) {
|
bool* ts_added) {
|
||||||
|
assert(next_key_slice_idx);
|
||||||
if (add_timestamp && !*ts_added) {
|
if (add_timestamp && !*ts_added) {
|
||||||
assert(slice_sz >= left_sz);
|
assert(slice_sz >= left_sz);
|
||||||
key_parts.emplace_back(slice_data, left_sz);
|
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
|
||||||
key_parts.emplace_back(min_timestamp);
|
key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
|
||||||
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz);
|
key_slices_[(*next_key_slice_idx)++] =
|
||||||
|
Slice(slice_data + left_sz, slice_sz - left_sz);
|
||||||
*ts_added = true;
|
*ts_added = true;
|
||||||
} else {
|
} else {
|
||||||
key_parts.emplace_back(slice_data, slice_sz);
|
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
|
||||||
}
|
}
|
||||||
|
assert(*next_key_slice_idx <= 5);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -575,13 +575,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
|
||||||
|
|
||||||
void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
|
void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
|
||||||
if (pad_min_timestamp_) {
|
if (pad_min_timestamp_) {
|
||||||
std::string buf;
|
raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
|
||||||
if (raw_key_.IsUserKey()) {
|
|
||||||
AppendKeyWithMinTimestamp(&buf, key, ts_sz_);
|
|
||||||
} else {
|
|
||||||
PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_);
|
|
||||||
}
|
|
||||||
raw_key_.SetKey(buf, true /* copy */);
|
|
||||||
} else {
|
} else {
|
||||||
raw_key_.SetKey(key, false /* copy */);
|
raw_key_.SetKey(key, false /* copy */);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue