mirror of https://github.com/facebook/rocksdb.git
Add some per key optimization for UDT in memtable only feature (#13031)
Summary: This PR added some optimizations for the per-key handling for SST files for the user-defined timestamps in Memtable only feature. CPU profiling shows this part is a big culprit for regression. This optimization saves some string construction/destruction/appending/copying, as well as vector operations like reserve/emplace_back. When iterating keys in a block, we need to copy some shared bytes from the previous key, put them together with the non shared bytes, and find the right location to pad the min timestamp. Previously, we created a tmp local string buffer to first construct the key from its pieces, and then copied this local string's content into `IterKey`'s buffer. To avoid having this local string and to avoid this extra copy, instead of piecing together the key in a local string first, we just track all the pieces that make this key in a reused Slice array, and then copy the pieces in order into `IterKey`'s buffer. Since the previous key should be kept intact while we are copying some shared bytes from it, we added a secondary buffer in `IterKey` and alternate between the primary buffer and the secondary buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13031 Test Plan: Existing tests. Reviewed By: ltamasi Differential Revision: D63416531 Pulled By: jowlyzhang fbshipit-source-id: 9819b0e02301a2dbc90621b2fe4f651bc912113c
This commit is contained in:
parent
917e98ff9e
commit
32dd657bad
|
@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
|
|||
|
||||
// Allocate a fresh, larger primary buffer of exactly key_size bytes.
// Callers only reach this when the current buffer (heap-allocated or the
// inline one used by default) is too small to hold the key.
void IterKey::EnlargeBuffer(size_t key_size) {
  // If size were smaller than the buffer size, the caller would keep using
  // the current buffer, so we must actually need more room here.
  assert(key_size > buf_size_);
  // Release the old allocation first, then grab the bigger one.
  ResetBuffer();
  buf_ = new char[key_size];
  buf_size_ = key_size;
}
|
||||
|
||||
// Ensure the secondary buffer can hold key_size bytes, growing it only when
// the current secondary buffer (heap-allocated or the inline one used by
// default) is too small.
void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
  if (key_size <= secondary_buf_size_) {
    // Current secondary buffer is already large enough; keep using it.
    return;
  }
  // Need a bigger secondary buffer: release the old one and reallocate.
  ResetSecondaryBuffer();
  secondary_buf_ = new char[key_size];
  secondary_buf_size_ = key_size;
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
177
db/dbformat.h
177
db/dbformat.h
|
@ -10,6 +10,7 @@
|
|||
#pragma once
|
||||
#include <stdio.h>
|
||||
|
||||
#include <array>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
@ -562,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
|
|||
// allocation for smaller keys.
|
||||
// 3. It tracks user key or internal key, and allow conversion between them.
|
||||
class IterKey {
|
||||
static constexpr size_t kInlineBufferSize = 39;
|
||||
// This is only used by user-defined timestamps in MemTable only feature,
|
||||
// which only supports uint64_t timestamps.
|
||||
static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
|
||||
|
||||
public:
|
||||
IterKey()
|
||||
: buf_(space_),
|
||||
key_(buf_),
|
||||
key_size_(0),
|
||||
buf_size_(sizeof(space_)),
|
||||
is_user_key_(true) {}
|
||||
buf_size_(kInlineBufferSize),
|
||||
is_user_key_(true),
|
||||
secondary_buf_(space_for_secondary_buf_),
|
||||
secondary_buf_size_(kInlineBufferSize) {}
|
||||
// No copying allowed
|
||||
IterKey(const IterKey&) = delete;
|
||||
void operator=(const IterKey&) = delete;
|
||||
|
||||
~IterKey() { ResetBuffer(); }
|
||||
~IterKey() {
|
||||
ResetBuffer();
|
||||
ResetSecondaryBuffer();
|
||||
}
|
||||
|
||||
// The bool will be picked up by the next calls to SetKey
|
||||
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
|
||||
|
@ -641,13 +652,15 @@ class IterKey {
|
|||
const char* non_shared_data,
|
||||
const size_t non_shared_len,
|
||||
const size_t ts_sz) {
|
||||
std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
|
||||
std::string key_with_ts;
|
||||
std::vector<Slice> key_parts_with_ts;
|
||||
// This function is only used by the UDT in memtable feature, which only
|
||||
// support built in comparators with uint64 timestamps.
|
||||
assert(ts_sz == sizeof(uint64_t));
|
||||
size_t next_key_slice_index = 0;
|
||||
if (IsUserKey()) {
|
||||
key_parts_with_ts = {Slice(key_, shared_len),
|
||||
Slice(non_shared_data, non_shared_len),
|
||||
Slice(kTsMin)};
|
||||
key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
|
||||
key_slices_[next_key_slice_index++] =
|
||||
Slice(non_shared_data, non_shared_len);
|
||||
key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
|
||||
} else {
|
||||
assert(shared_len + non_shared_len >= kNumInternalBytes);
|
||||
// Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len
|
||||
|
@ -664,30 +677,46 @@ class IterKey {
|
|||
|
||||
// One Slice among the three Slices will get split into two Slices, plus
|
||||
// a timestamp slice.
|
||||
key_parts_with_ts.reserve(5);
|
||||
bool ts_added = false;
|
||||
// Add slice parts and find the right location to add the min timestamp.
|
||||
MaybeAddKeyPartsWithTimestamp(
|
||||
key_, shared_user_key_len,
|
||||
shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
|
||||
shared_len + non_shared_len - kNumInternalBytes, kTsMin,
|
||||
key_parts_with_ts, &ts_added);
|
||||
shared_len + non_shared_len - kNumInternalBytes, ts_sz,
|
||||
&next_key_slice_index, &ts_added);
|
||||
MaybeAddKeyPartsWithTimestamp(
|
||||
key_ + user_key_len, shared_internal_bytes_len,
|
||||
non_shared_len < kNumInternalBytes,
|
||||
shared_internal_bytes_len + non_shared_len - kNumInternalBytes,
|
||||
kTsMin, key_parts_with_ts, &ts_added);
|
||||
shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
|
||||
&next_key_slice_index, &ts_added);
|
||||
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
|
||||
non_shared_len >= kNumInternalBytes,
|
||||
non_shared_len - kNumInternalBytes, kTsMin,
|
||||
key_parts_with_ts, &ts_added);
|
||||
non_shared_len - kNumInternalBytes, ts_sz,
|
||||
&next_key_slice_index, &ts_added);
|
||||
assert(ts_added);
|
||||
}
|
||||
SetKeyImpl(next_key_slice_index,
|
||||
/* total_bytes= */ shared_len + non_shared_len + ts_sz);
|
||||
}
|
||||
|
||||
Slice new_key(SliceParts(&key_parts_with_ts.front(),
|
||||
static_cast<int>(key_parts_with_ts.size())),
|
||||
&key_with_ts);
|
||||
SetKey(new_key);
|
||||
// Set the tracked key to `key` with a minimum (all-zero) timestamp of
// ts_sz bytes padded in at the correct position: appended after a user
// key, or inserted between the user-key portion and the kNumInternalBytes
// footer of an internal key. Returns a Slice over the assembled key.
Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
  // This function is only used by the UDT in memtable feature, which only
  // supports built-in comparators with uint64 timestamps.
  assert(ts_sz == sizeof(uint64_t));
  size_t num_key_slices = 0;
  if (is_user_key_) {
    // Layout: user key | min timestamp
    key_slices_[0] = key;
    key_slices_[1] = Slice(kTsMin, ts_sz);
    num_key_slices = 2;
  } else {
    // Layout: user key | min timestamp | internal bytes footer
    assert(key.size() >= kNumInternalBytes);
    const size_t user_key_size = key.size() - kNumInternalBytes;
    key_slices_[0] = Slice(key.data(), user_key_size);
    key_slices_[1] = Slice(kTsMin, ts_sz);
    key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
    num_key_slices = 3;
  }
  return SetKeyImpl(num_key_slices, key.size() + ts_sz);
}
|
||||
|
||||
Slice SetKey(const Slice& key, bool copy = true) {
|
||||
|
@ -718,15 +747,6 @@ class IterKey {
|
|||
return Slice(key_, key_n);
|
||||
}
|
||||
|
||||
// Copy the key into IterKey own buf_
|
||||
void OwnKey() {
|
||||
assert(IsKeyPinned() == true);
|
||||
|
||||
Reserve(key_size_);
|
||||
memcpy(buf_, key_, key_size_);
|
||||
key_ = buf_;
|
||||
}
|
||||
|
||||
// Update the sequence number in the internal key. Guarantees not to
|
||||
// invalidate slices to the key (and the user key).
|
||||
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
|
||||
|
@ -738,10 +758,15 @@ class IterKey {
|
|||
ts->size());
|
||||
}
|
||||
uint64_t newval = (seq << 8) | t;
|
||||
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
|
||||
if (key_ == buf_) {
|
||||
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
|
||||
} else {
|
||||
assert(key_ == secondary_buf_);
|
||||
EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
|
||||
}
|
||||
}
|
||||
|
||||
bool IsKeyPinned() const { return (key_ != buf_); }
|
||||
bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
|
||||
|
||||
// If `ts` is provided, user_key should not contain timestamp,
|
||||
// and `ts` is appended after user_key.
|
||||
|
@ -806,8 +831,24 @@ class IterKey {
|
|||
const char* key_;
|
||||
size_t key_size_;
|
||||
size_t buf_size_;
|
||||
char space_[39]; // Avoid allocation for short keys
|
||||
char space_[kInlineBufferSize]; // Avoid allocation for short keys
|
||||
bool is_user_key_;
|
||||
// Below variables are only used by user-defined timestamps in MemTable only
|
||||
// feature for iterating keys in an index block or a data block.
|
||||
//
|
||||
// We will alternate between buf_ and secondary_buf_ to hold the key. key_
|
||||
// will be modified in accordance to point to the right one. This is to avoid
|
||||
// an extra copy when we need to copy some shared bytes from previous key
|
||||
// (delta encoding), and we need to pad a min timestamp at the right location.
|
||||
char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
|
||||
// short keys
|
||||
char* secondary_buf_;
|
||||
size_t secondary_buf_size_;
|
||||
// Use to track the pieces that together make the whole key. We then copy
|
||||
// these pieces in order either into buf_ or secondary_buf_ depending on where
|
||||
// the previous key is held.
|
||||
std::array<Slice, 5> key_slices_;
|
||||
// End of variables used by user-defined timestamps in MemTable only feature.
|
||||
|
||||
Slice SetKeyImpl(const Slice& key, bool copy) {
|
||||
size_t size = key.size();
|
||||
|
@ -824,18 +865,64 @@ class IterKey {
|
|||
return Slice(key_, key_size_);
|
||||
}
|
||||
|
||||
// Assemble the first num_key_slices entries of key_slices_ into whichever
// buffer does NOT currently hold the key, then point key_ at it.
// Alternating between buf_ and secondary_buf_ keeps the previous key intact
// while the new key (which may share bytes with it) is being built.
// total_bytes must equal the combined size of the copied slices.
Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
  assert(num_key_slices <= 5);
  char* dst = nullptr;
  if (key_ == buf_) {
    // Previous key lives in buf_, so build the new key in secondary_buf_.
    EnlargeSecondaryBufferIfNeeded(total_bytes);
    dst = secondary_buf_;
    key_ = secondary_buf_;
  } else {
    // Previous key is elsewhere; build the new key in buf_.
    EnlargeBufferIfNeeded(total_bytes);
    dst = buf_;
    key_ = buf_;
  }
#ifndef NDEBUG
  size_t bytes_copied = 0;
#endif  // NDEBUG
  for (size_t idx = 0; idx < num_key_slices; ++idx) {
    const size_t part_size = key_slices_[idx].size();
    memcpy(dst, key_slices_[idx].data(), part_size);
    dst += part_size;
#ifndef NDEBUG
    bytes_copied += part_size;
#endif  // NDEBUG
  }
#ifndef NDEBUG
  // Sanity-check that the caller's byte count matches the slices.
  assert(bytes_copied == total_bytes);
#endif  // NDEBUG
  key_size_ = total_bytes;
  return Slice(key_, key_size_);
}
|
||||
|
||||
void ResetBuffer() {
|
||||
if (key_ == buf_) {
|
||||
key_size_ = 0;
|
||||
}
|
||||
if (buf_ != space_) {
|
||||
delete[] buf_;
|
||||
buf_ = space_;
|
||||
}
|
||||
buf_size_ = sizeof(space_);
|
||||
key_size_ = 0;
|
||||
buf_size_ = kInlineBufferSize;
|
||||
}
|
||||
|
||||
void ResetSecondaryBuffer() {
|
||||
if (key_ == secondary_buf_) {
|
||||
key_size_ = 0;
|
||||
}
|
||||
if (secondary_buf_ != space_for_secondary_buf_) {
|
||||
delete[] secondary_buf_;
|
||||
secondary_buf_ = space_for_secondary_buf_;
|
||||
}
|
||||
secondary_buf_size_ = kInlineBufferSize;
|
||||
}
|
||||
|
||||
// Enlarge the buffer size if needed based on key_size.
|
||||
// By default, static allocated buffer is used. Once there is a key
|
||||
// larger than the static allocated buffer, another buffer is dynamically
|
||||
// By default, inline buffer is used. Once there is a key
|
||||
// larger than the inline buffer, another buffer is dynamically
|
||||
// allocated, until a larger key buffer is requested. In that case, we
|
||||
// reallocate buffer and delete the old one.
|
||||
void EnlargeBufferIfNeeded(size_t key_size) {
|
||||
|
@ -846,23 +933,27 @@ class IterKey {
|
|||
}
|
||||
}
|
||||
|
||||
void EnlargeSecondaryBufferIfNeeded(size_t key_size);
|
||||
|
||||
void EnlargeBuffer(size_t key_size);
|
||||
|
||||
void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
|
||||
const size_t slice_sz, bool add_timestamp,
|
||||
const size_t left_sz,
|
||||
const std::string& min_timestamp,
|
||||
std::vector<Slice>& key_parts,
|
||||
const size_t left_sz, const size_t ts_sz,
|
||||
size_t* next_key_slice_idx,
|
||||
bool* ts_added) {
|
||||
assert(next_key_slice_idx);
|
||||
if (add_timestamp && !*ts_added) {
|
||||
assert(slice_sz >= left_sz);
|
||||
key_parts.emplace_back(slice_data, left_sz);
|
||||
key_parts.emplace_back(min_timestamp);
|
||||
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz);
|
||||
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
|
||||
key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
|
||||
key_slices_[(*next_key_slice_idx)++] =
|
||||
Slice(slice_data + left_sz, slice_sz - left_sz);
|
||||
*ts_added = true;
|
||||
} else {
|
||||
key_parts.emplace_back(slice_data, slice_sz);
|
||||
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
|
||||
}
|
||||
assert(*next_key_slice_idx <= 5);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -575,13 +575,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
|
|||
|
||||
void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
|
||||
if (pad_min_timestamp_) {
|
||||
std::string buf;
|
||||
if (raw_key_.IsUserKey()) {
|
||||
AppendKeyWithMinTimestamp(&buf, key, ts_sz_);
|
||||
} else {
|
||||
PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_);
|
||||
}
|
||||
raw_key_.SetKey(buf, true /* copy */);
|
||||
raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
|
||||
} else {
|
||||
raw_key_.SetKey(key, false /* copy */);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue