Add some per key optimization for UDT in memtable only feature (#13031)

Summary:
This PR adds some optimizations for the per-key handling of SST files for the user-defined timestamps in Memtable only feature. CPU profiling shows this part is a big culprit for the regression. This optimization saves some string construction/destruction/appending/copying, as well as vector operations like reserve/emplace_back.

When iterating keys in a block, we need to copy some shared bytes from the previous key, put them together with the non-shared bytes, and find the right location to pad the min timestamp. Previously, we created a temporary local string buffer to first construct the key from its pieces, and then copied this local string's content into `IterKey`'s buffer. To avoid having this local string and the extra copy, instead of piecing together the key in a local string first, we just track all the pieces that make up the key in a reused Slice array, and then copy the pieces in order into `IterKey`'s buffer. Since the previous key should be kept intact while we are copying some shared bytes from it, we added a secondary buffer in `IterKey` and alternate between the primary buffer and the secondary buffer.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13031

Test Plan: Existing tests.

Reviewed By: ltamasi

Differential Revision: D63416531

Pulled By: jowlyzhang

fbshipit-source-id: 9819b0e02301a2dbc90621b2fe4f651bc912113c
This commit is contained in:
Yu Zhang 2024-10-03 17:57:50 -07:00 committed by Facebook GitHub Bot
parent 917e98ff9e
commit 32dd657bad
3 changed files with 148 additions and 51 deletions

View File

@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
void IterKey::EnlargeBuffer(size_t key_size) { void IterKey::EnlargeBuffer(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer, // If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default // or the inline one, as default
assert(key_size > buf_size_); assert(key_size > buf_size_);
// Need to enlarge the buffer. // Need to enlarge the buffer.
ResetBuffer(); ResetBuffer();
buf_ = new char[key_size]; buf_ = new char[key_size];
buf_size_ = key_size; buf_size_ = key_size;
} }
void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
  // Keep using the current secondary buffer (the inline one or a previous
  // heap allocation) when it is already large enough for the requested size.
  if (key_size > secondary_buf_size_) {
    // Too small: release the old secondary buffer, then allocate a larger
    // one and record its capacity.
    ResetSecondaryBuffer();
    secondary_buf_ = new char[key_size];
    secondary_buf_size_ = key_size;
  }
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -10,6 +10,7 @@
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
#include <array>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <string> #include <string>
@ -562,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
// allocation for smaller keys. // allocation for smaller keys.
// 3. It tracks user key or internal key, and allow conversion between them. // 3. It tracks user key or internal key, and allow conversion between them.
class IterKey { class IterKey {
static constexpr size_t kInlineBufferSize = 39;
// This is only used by user-defined timestamps in MemTable only feature,
// which only supports uint64_t timestamps.
static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
public: public:
IterKey() IterKey()
: buf_(space_), : buf_(space_),
key_(buf_), key_(buf_),
key_size_(0), key_size_(0),
buf_size_(sizeof(space_)), buf_size_(kInlineBufferSize),
is_user_key_(true) {} is_user_key_(true),
secondary_buf_(space_for_secondary_buf_),
secondary_buf_size_(kInlineBufferSize) {}
// No copying allowed // No copying allowed
IterKey(const IterKey&) = delete; IterKey(const IterKey&) = delete;
void operator=(const IterKey&) = delete; void operator=(const IterKey&) = delete;
~IterKey() { ResetBuffer(); } ~IterKey() {
ResetBuffer();
ResetSecondaryBuffer();
}
// The bool will be picked up by the next calls to SetKey // The bool will be picked up by the next calls to SetKey
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; } void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
@ -641,13 +652,15 @@ class IterKey {
const char* non_shared_data, const char* non_shared_data,
const size_t non_shared_len, const size_t non_shared_len,
const size_t ts_sz) { const size_t ts_sz) {
std::string kTsMin(ts_sz, static_cast<unsigned char>(0)); // This function is only used by the UDT in memtable feature, which only
std::string key_with_ts; // support built in comparators with uint64 timestamps.
std::vector<Slice> key_parts_with_ts; assert(ts_sz == sizeof(uint64_t));
size_t next_key_slice_index = 0;
if (IsUserKey()) { if (IsUserKey()) {
key_parts_with_ts = {Slice(key_, shared_len), key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
Slice(non_shared_data, non_shared_len), key_slices_[next_key_slice_index++] =
Slice(kTsMin)}; Slice(non_shared_data, non_shared_len);
key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
} else { } else {
assert(shared_len + non_shared_len >= kNumInternalBytes); assert(shared_len + non_shared_len >= kNumInternalBytes);
// Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len // Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len
@ -664,30 +677,46 @@ class IterKey {
// One Slice among the three Slices will get split into two Slices, plus // One Slice among the three Slices will get split into two Slices, plus
// a timestamp slice. // a timestamp slice.
key_parts_with_ts.reserve(5);
bool ts_added = false; bool ts_added = false;
// Add slice parts and find the right location to add the min timestamp. // Add slice parts and find the right location to add the min timestamp.
MaybeAddKeyPartsWithTimestamp( MaybeAddKeyPartsWithTimestamp(
key_, shared_user_key_len, key_, shared_user_key_len,
shared_internal_bytes_len + non_shared_len < kNumInternalBytes, shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
shared_len + non_shared_len - kNumInternalBytes, kTsMin, shared_len + non_shared_len - kNumInternalBytes, ts_sz,
key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp( MaybeAddKeyPartsWithTimestamp(
key_ + user_key_len, shared_internal_bytes_len, key_ + user_key_len, shared_internal_bytes_len,
non_shared_len < kNumInternalBytes, non_shared_len < kNumInternalBytes,
shared_internal_bytes_len + non_shared_len - kNumInternalBytes, shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
kTsMin, key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len, MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
non_shared_len >= kNumInternalBytes, non_shared_len >= kNumInternalBytes,
non_shared_len - kNumInternalBytes, kTsMin, non_shared_len - kNumInternalBytes, ts_sz,
key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
assert(ts_added); assert(ts_added);
} }
SetKeyImpl(next_key_slice_index,
/* total_bytes= */ shared_len + non_shared_len + ts_sz);
}
Slice new_key(SliceParts(&key_parts_with_ts.front(), Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
static_cast<int>(key_parts_with_ts.size())), // This function is only used by the UDT in memtable feature, which only
&key_with_ts); // support built in comparators with uint64 timestamps.
SetKey(new_key); assert(ts_sz == sizeof(uint64_t));
size_t num_key_slices = 0;
if (is_user_key_) {
key_slices_[0] = key;
key_slices_[1] = Slice(kTsMin, ts_sz);
num_key_slices = 2;
} else {
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
key_slices_[0] = Slice(key.data(), user_key_size);
key_slices_[1] = Slice(kTsMin, ts_sz);
key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
num_key_slices = 3;
}
return SetKeyImpl(num_key_slices, key.size() + ts_sz);
} }
Slice SetKey(const Slice& key, bool copy = true) { Slice SetKey(const Slice& key, bool copy = true) {
@ -718,15 +747,6 @@ class IterKey {
return Slice(key_, key_n); return Slice(key_, key_n);
} }
// Copy the key into IterKey own buf_
void OwnKey() {
assert(IsKeyPinned() == true);
Reserve(key_size_);
memcpy(buf_, key_, key_size_);
key_ = buf_;
}
// Update the sequence number in the internal key. Guarantees not to // Update the sequence number in the internal key. Guarantees not to
// invalidate slices to the key (and the user key). // invalidate slices to the key (and the user key).
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
@ -738,10 +758,15 @@ class IterKey {
ts->size()); ts->size());
} }
uint64_t newval = (seq << 8) | t; uint64_t newval = (seq << 8) | t;
if (key_ == buf_) {
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
} else {
assert(key_ == secondary_buf_);
EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
}
} }
bool IsKeyPinned() const { return (key_ != buf_); } bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
// If `ts` is provided, user_key should not contain timestamp, // If `ts` is provided, user_key should not contain timestamp,
// and `ts` is appended after user_key. // and `ts` is appended after user_key.
@ -806,8 +831,24 @@ class IterKey {
const char* key_; const char* key_;
size_t key_size_; size_t key_size_;
size_t buf_size_; size_t buf_size_;
char space_[39]; // Avoid allocation for short keys char space_[kInlineBufferSize]; // Avoid allocation for short keys
bool is_user_key_; bool is_user_key_;
// Below variables are only used by user-defined timestamps in MemTable only
// feature for iterating keys in an index block or a data block.
//
// We will alternate between buf_ and secondary_buf_ to hold the key. key_
// will be modified in accordance to point to the right one. This is to avoid
// an extra copy when we need to copy some shared bytes from previous key
// (delta encoding), and we need to pad a min timestamp at the right location.
char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
// short keys
char* secondary_buf_;
size_t secondary_buf_size_;
// Use to track the pieces that together make the whole key. We then copy
// these pieces in order either into buf_ or secondary_buf_ depending on where
// the previous key is held.
std::array<Slice, 5> key_slices_;
// End of variables used by user-defined timestamps in MemTable only feature.
Slice SetKeyImpl(const Slice& key, bool copy) { Slice SetKeyImpl(const Slice& key, bool copy) {
size_t size = key.size(); size_t size = key.size();
@ -824,18 +865,64 @@ class IterKey {
return Slice(key_, key_size_); return Slice(key_, key_size_);
} }
// Assembles the key from the first `num_key_slices` entries of key_slices_
// by copying them, in order, into whichever buffer does NOT currently hold
// the key. Alternating between buf_ and secondary_buf_ keeps the previous
// key's bytes intact while slices referencing it (delta-encoded shared
// prefixes) are being copied. `total_bytes` must equal the sum of the slice
// sizes; returns a Slice over the newly assembled key.
Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
assert(num_key_slices <= 5);
char* buf_start = nullptr;
if (key_ == buf_) {
// If the previous key is in buf_, we copy key_slices_ in order into
// secondary_buf_.
EnlargeSecondaryBufferIfNeeded(total_bytes);
buf_start = secondary_buf_;
key_ = secondary_buf_;
} else {
// Copy key_slices_ in order into buf_.
EnlargeBufferIfNeeded(total_bytes);
buf_start = buf_;
key_ = buf_;
}
// Debug-only running total to verify the caller's `total_bytes` matches
// the actual number of bytes copied from the slices.
#ifndef NDEBUG
size_t actual_total_bytes = 0;
#endif // NDEBUG
for (size_t i = 0; i < num_key_slices; i++) {
size_t key_slice_size = key_slices_[i].size();
memcpy(buf_start, key_slices_[i].data(), key_slice_size);
buf_start += key_slice_size;
#ifndef NDEBUG
actual_total_bytes += key_slice_size;
#endif // NDEBUG
}
#ifndef NDEBUG
assert(actual_total_bytes == total_bytes);
#endif // NDEBUG
key_size_ = total_bytes;
return Slice(key_, key_size_);
}
void ResetBuffer() { void ResetBuffer() {
if (key_ == buf_) {
key_size_ = 0;
}
if (buf_ != space_) { if (buf_ != space_) {
delete[] buf_; delete[] buf_;
buf_ = space_; buf_ = space_;
} }
buf_size_ = sizeof(space_); buf_size_ = kInlineBufferSize;
}
void ResetSecondaryBuffer() {
if (key_ == secondary_buf_) {
key_size_ = 0; key_size_ = 0;
} }
if (secondary_buf_ != space_for_secondary_buf_) {
delete[] secondary_buf_;
secondary_buf_ = space_for_secondary_buf_;
}
secondary_buf_size_ = kInlineBufferSize;
}
// Enlarge the buffer size if needed based on key_size. // Enlarge the buffer size if needed based on key_size.
// By default, static allocated buffer is used. Once there is a key // By default, inline buffer is used. Once there is a key
// larger than the static allocated buffer, another buffer is dynamically // larger than the inline buffer, another buffer is dynamically
// allocated, until a larger key buffer is requested. In that case, we // allocated, until a larger key buffer is requested. In that case, we
// reallocate buffer and delete the old one. // reallocate buffer and delete the old one.
void EnlargeBufferIfNeeded(size_t key_size) { void EnlargeBufferIfNeeded(size_t key_size) {
@ -846,23 +933,27 @@ class IterKey {
} }
} }
void EnlargeSecondaryBufferIfNeeded(size_t key_size);
void EnlargeBuffer(size_t key_size); void EnlargeBuffer(size_t key_size);
void MaybeAddKeyPartsWithTimestamp(const char* slice_data, void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
const size_t slice_sz, bool add_timestamp, const size_t slice_sz, bool add_timestamp,
const size_t left_sz, const size_t left_sz, const size_t ts_sz,
const std::string& min_timestamp, size_t* next_key_slice_idx,
std::vector<Slice>& key_parts,
bool* ts_added) { bool* ts_added) {
assert(next_key_slice_idx);
if (add_timestamp && !*ts_added) { if (add_timestamp && !*ts_added) {
assert(slice_sz >= left_sz); assert(slice_sz >= left_sz);
key_parts.emplace_back(slice_data, left_sz); key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
key_parts.emplace_back(min_timestamp); key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz); key_slices_[(*next_key_slice_idx)++] =
Slice(slice_data + left_sz, slice_sz - left_sz);
*ts_added = true; *ts_added = true;
} else { } else {
key_parts.emplace_back(slice_data, slice_sz); key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
} }
assert(*next_key_slice_idx <= 5);
} }
}; };

View File

@ -575,13 +575,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
if (pad_min_timestamp_) { if (pad_min_timestamp_) {
std::string buf; raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
if (raw_key_.IsUserKey()) {
AppendKeyWithMinTimestamp(&buf, key, ts_sz_);
} else {
PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_);
}
raw_key_.SetKey(buf, true /* copy */);
} else { } else {
raw_key_.SetKey(key, false /* copy */); raw_key_.SetKey(key, false /* copy */);
} }