Fix filter partition size logic (#12904)

Summary:
Was checking for exactly (==) the desired number of entries added to a filter partition, but the combination of whole key and prefix filtering can add more than one entry per table internal key, so the count can step over the target without ever matching it. This could lead to unnecessarily large filter partitions, which could affect performance and block cache fairness.
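
To illustrate the failure mode, here is a minimal standalone sketch (not RocksDB code; the counter and threshold names are made up for the example). When each internal key contributes two filter entries, one for the whole key and one for its prefix, an equality check can step over the threshold and never fire, while `>=` catches it:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Threshold derived from the configured partition size; it need not be a
  // multiple of the per-key entry count.
  const uint64_t entries_per_partition = 7;
  uint64_t entries_in_partition = 0;
  bool cut_requested_with_eq = false;
  bool cut_requested_with_ge = false;

  for (int key = 0; key < 10; ++key) {
    // Whole-key filtering plus a prefix extractor: two entries per key.
    entries_in_partition += 2;
    if (entries_in_partition == entries_per_partition) {
      cut_requested_with_eq = true;  // never fires: 2, 4, 6, 8, ... skips 7
    }
    if (entries_in_partition >= entries_per_partition) {
      cut_requested_with_ge = true;  // fires once the threshold is reached or passed
    }
  }
  std::cout << "== requested a cut: " << cut_requested_with_eq << "\n"
            << ">= requested a cut: " << cut_requested_with_ge << "\n";
  return 0;
}
```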

Also (only somewhat related because of other work in progress):
* Some variable renaming and a new assertion in BlockBasedTableBuilder, to add some clarity.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12904

Test Plan:
If you add assertion logic to the base revision checking that the partition cut is requested whenever `keys_added_to_partition_ >= keys_per_partition_`, it fails on a number of db_bloom_filter_test tests. However, such an assertion in the revised code would be essentially redundant with the new logic.

If I added a regression test for this, it would be tricky and fragile, so I don't think it's important enough to chase and maintain.  (Open to suggestions / input.)

Reviewed By: jowlyzhang

Differential Revision: D60557827

Pulled By: pdillinger

fbshipit-source-id: 77a56097d540da6e7851941a26d26ced2d944373
Commit: 9d5c8c89a1 (parent: 2e8a1a14ef)
Author: Peter Dillinger, 2024-08-02 14:49:02 -07:00 (committed by Facebook GitHub Bot)
2 changed files with 22 additions and 19 deletions


@@ -296,7 +296,7 @@ struct BlockBasedTableBuilder::Rep {
   std::string index_separator_scratch;
   PartitionedIndexBuilder* p_index_builder_ = nullptr;
-  std::string last_key;
+  std::string last_ikey;  // Internal key or empty (unset)
   const Slice* first_key_in_next_block = nullptr;
   CompressionType compression_type;
   uint64_t sample_for_compression;

@@ -998,24 +998,24 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() {
   delete rep_;
 }
 
-void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
   Rep* r = rep_;
   assert(rep_->state != Rep::State::kClosed);
   if (!ok()) {
     return;
   }
-  ValueType value_type = ExtractValueType(key);
+  ValueType value_type = ExtractValueType(ikey);
   if (IsValueType(value_type)) {
 #ifndef NDEBUG
     if (r->props.num_entries > r->props.num_range_deletions) {
-      assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
+      assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
     }
 #endif  // !NDEBUG
 
-    auto should_flush = r->flush_block_policy->Update(key, value);
+    auto should_flush = r->flush_block_policy->Update(ikey, value);
     if (should_flush) {
       assert(!r->data_block.empty());
-      r->first_key_in_next_block = &key;
+      r->first_key_in_next_block = &ikey;
       Flush();
       if (r->state == Rep::State::kBuffered) {
         bool exceeds_buffer_limit =

@@ -1050,7 +1050,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
         if (r->IsParallelCompressionEnabled()) {
           r->pc_rep->curr_block_keys->Clear();
         } else {
-          r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle,
+          r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
+                                          r->pending_handle,
                                           &r->index_separator_scratch);
         }
       }

@@ -1060,27 +1061,28 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
     // builder after being added to index builder.
     if (r->state == Rep::State::kUnbuffered) {
       if (r->IsParallelCompressionEnabled()) {
-        r->pc_rep->curr_block_keys->PushBack(key);
+        r->pc_rep->curr_block_keys->PushBack(ikey);
       } else {
         if (r->filter_builder != nullptr) {
           r->filter_builder->Add(
-              ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
+              ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz));
         }
       }
     }
 
-    r->data_block.AddWithLastKey(key, value, r->last_key);
-    r->last_key.assign(key.data(), key.size());
+    r->data_block.AddWithLastKey(ikey, value, r->last_ikey);
+    r->last_ikey.assign(ikey.data(), ikey.size());
+    assert(!r->last_ikey.empty());
     if (r->state == Rep::State::kBuffered) {
       // Buffered keys will be replayed from data_block_buffers during
       // `Finish()` once compression dictionary has been finalized.
     } else {
       if (!r->IsParallelCompressionEnabled()) {
-        r->index_builder->OnKeyAdded(key);
+        r->index_builder->OnKeyAdded(ikey);
       }
     }
     // TODO offset passed in is not accurate for parallel compression case
-    NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+    NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
                                       r->table_properties_collectors,
                                       r->ioptions.logger);

@@ -1094,9 +1096,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
     if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) {
       persisted_end = StripTimestampFromUserKey(value, r->ts_sz);
     }
-    r->range_del_block.Add(key, persisted_end);
+    r->range_del_block.Add(ikey, persisted_end);
     // TODO offset passed in is not accurate for parallel compression case
-    NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+    NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
                                       r->table_properties_collectors,
                                       r->ioptions.logger);
   } else {

@@ -1108,7 +1110,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
   }
 
   r->props.num_entries++;
-  r->props.raw_key_size += key.size();
+  r->props.raw_key_size += ikey.size();
   if (!r->persist_user_defined_timestamps) {
     r->props.raw_key_size -= r->ts_sz;
   }

@@ -2028,7 +2030,7 @@ Status BlockBasedTableBuilder::Finish() {
      // block, we will finish writing all index entries first.
      if (ok() && !empty_data_block) {
        r->index_builder->AddIndexEntry(
-           r->last_key, nullptr /* no next data block */, r->pending_handle,
+           r->last_ikey, nullptr /* no next data block */, r->pending_handle,
            &r->index_separator_scratch);
      }
    }


@@ -76,8 +76,9 @@ PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {
 
 void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
     const Slice* next_key) {
-  // Use == to send the request only once
-  if (keys_added_to_partition_ == keys_per_partition_) {
+  // (NOTE: Can't just use ==, because keys_added_to_partition_ might be
+  // incremented by more than one.)
+  if (keys_added_to_partition_ >= keys_per_partition_) {
     // Currently only index builder is in charge of cutting a partition. We keep
     // requesting until it is granted.
     p_index_builder_->RequestPartitionCut();
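
For context, a table configuration along these lines is what makes the filter builder add both a whole-key entry and a prefix entry for each key (a sketch with illustrative option values, not taken from this PR):

```cpp
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/slice_transform.h>
#include <rocksdb/table.h>

rocksdb::Options MakePartitionedFilterOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Partitioned Bloom filters, cut into partitions by the partitioned index.
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  table_options.partition_filters = true;
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  // Whole-key filtering stays enabled (the default)...
  table_options.whole_key_filtering = true;

  rocksdb::Options options;
  // ...and a prefix extractor is also configured, so a single internal key
  // can contribute two filter entries: the whole user key and its prefix.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```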