Fix filter partition size logic (#12904)

Summary:
The partition size check compared the number of entries added to a filter against the desired count with ==, but the combination of whole key and prefix filtering can add more than one entry per table internal key, so the counter could skip past the threshold without ever matching it exactly. This could lead to unnecessarily large filter partitions, which could affect performance and block cache fairness.

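For illustration, here is a minimal standalone sketch (not the RocksDB code; the names and numbers are made up) of how an equality check gets stepped over when some keys add both a whole-key and a prefix entry, so the partition is never cut:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const uint64_t keys_per_partition = 10;  // hypothetical cut target
  // Filter entries added per internal key: 2 when the key adds both a
  // whole-key entry and a new prefix entry, otherwise 1.
  const std::vector<uint64_t> entries_per_key = {2, 2, 1, 2, 2, 2, 1, 2, 2, 2};

  uint64_t added = 0;
  uint64_t cut_at_eq = 0;  // 0 = cut never requested
  uint64_t cut_at_ge = 0;
  for (uint64_t n : entries_per_key) {
    added += n;
    // Old check: only an exact match requests a cut, so jumping from 9 to 11
    // misses the target and the partition keeps growing.
    if (cut_at_eq == 0 && added == keys_per_partition) {
      cut_at_eq = added;
    }
    // Fixed check: reaching or exceeding the target requests a cut.
    if (cut_at_ge == 0 && added >= keys_per_partition) {
      cut_at_ge = added;
    }
  }
  std::cout << "== check requested cut at: " << cut_at_eq << " entries\n";  // 0
  std::cout << ">= check requested cut at: " << cut_at_ge << " entries\n";  // 11
  return 0;
}
```
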
Also (only somewhat related because of other work in progress):
* Some variable renaming and a new assertion in BlockBasedTableBuilder, to add some clarity.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12904

Test Plan:
If you add assertion logic to the base revision checking that the partition cut is requested whenever `keys_added_to_partition_ >= keys_per_partition_`, it fails on a number of db_bloom_filter_test tests. However, such an assertion in the revised code would be essentially redundant with the new logic.

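The failure mode behind that hypothetical assertion can be approximated in a self-contained sketch (not the actual test or builder code; the locals below merely stand in for the builder's members, and the numbers are made up):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t keys_per_partition_ = 5;  // hypothetical odd target
  uint64_t keys_added_to_partition_ = 0;
  bool cut_requested = false;

  for (int key = 0; key < 3; ++key) {
    // Base-revision check, evaluated before this key's entries are added.
    if (keys_added_to_partition_ == keys_per_partition_) {
      cut_requested = true;  // never reached: the counter steps 2, 4, 6
    }
    keys_added_to_partition_ += 2;  // whole-key entry + prefix entry
  }

  // The assertion from the test plan: once the counter reaches the target, a
  // cut must have been requested. With the old == check it does not hold.
  bool assertion_holds =
      keys_added_to_partition_ < keys_per_partition_ || cut_requested;
  std::cout << "assertion holds: " << assertion_holds << "\n";  // prints 0
  return 0;
}
```
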
If I added a regression test for this, it would be tricky and fragile, so I don't think it's important enough to chase and maintain.  (Open to suggestions / input.)

Reviewed By: jowlyzhang

Differential Revision: D60557827

Pulled By: pdillinger

fbshipit-source-id: 77a56097d540da6e7851941a26d26ced2d944373
Author: Peter Dillinger, 2024-08-02 14:49:02 -07:00 (committed by Facebook GitHub Bot)
Commit: 9d5c8c89a1 (parent: 2e8a1a14ef)
2 changed files with 22 additions and 19 deletions


@@ -296,7 +296,7 @@ struct BlockBasedTableBuilder::Rep {
std::string index_separator_scratch;
PartitionedIndexBuilder* p_index_builder_ = nullptr;
- std::string last_key;
+ std::string last_ikey;  // Internal key or empty (unset)
const Slice* first_key_in_next_block = nullptr;
CompressionType compression_type;
uint64_t sample_for_compression;
@@ -998,24 +998,24 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() {
delete rep_;
}
- void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
Rep* r = rep_;
assert(rep_->state != Rep::State::kClosed);
if (!ok()) {
return;
}
- ValueType value_type = ExtractValueType(key);
+ ValueType value_type = ExtractValueType(ikey);
if (IsValueType(value_type)) {
#ifndef NDEBUG
if (r->props.num_entries > r->props.num_range_deletions) {
- assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
+ assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
}
#endif // !NDEBUG
- auto should_flush = r->flush_block_policy->Update(key, value);
+ auto should_flush = r->flush_block_policy->Update(ikey, value);
if (should_flush) {
assert(!r->data_block.empty());
- r->first_key_in_next_block = &key;
+ r->first_key_in_next_block = &ikey;
Flush();
if (r->state == Rep::State::kBuffered) {
bool exceeds_buffer_limit =
@@ -1050,7 +1050,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->IsParallelCompressionEnabled()) {
r->pc_rep->curr_block_keys->Clear();
} else {
- r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle,
+ r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
+ r->pending_handle,
&r->index_separator_scratch);
}
}
@@ -1060,27 +1061,28 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// builder after being added to index builder.
if (r->state == Rep::State::kUnbuffered) {
if (r->IsParallelCompressionEnabled()) {
- r->pc_rep->curr_block_keys->PushBack(key);
+ r->pc_rep->curr_block_keys->PushBack(ikey);
} else {
if (r->filter_builder != nullptr) {
r->filter_builder->Add(
- ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
+ ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz));
}
}
}
- r->data_block.AddWithLastKey(key, value, r->last_key);
- r->last_key.assign(key.data(), key.size());
+ r->data_block.AddWithLastKey(ikey, value, r->last_ikey);
+ r->last_ikey.assign(ikey.data(), ikey.size());
+ assert(!r->last_ikey.empty());
if (r->state == Rep::State::kBuffered) {
// Buffered keys will be replayed from data_block_buffers during
// `Finish()` once compression dictionary has been finalized.
} else {
if (!r->IsParallelCompressionEnabled()) {
- r->index_builder->OnKeyAdded(key);
+ r->index_builder->OnKeyAdded(ikey);
}
}
// TODO offset passed in is not accurate for parallel compression case
- NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+ NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors,
r->ioptions.logger);
@@ -1094,9 +1096,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) {
persisted_end = StripTimestampFromUserKey(value, r->ts_sz);
}
- r->range_del_block.Add(key, persisted_end);
+ r->range_del_block.Add(ikey, persisted_end);
// TODO offset passed in is not accurate for parallel compression case
- NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+ NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors,
r->ioptions.logger);
} else {
@@ -1108,7 +1110,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
}
r->props.num_entries++;
- r->props.raw_key_size += key.size();
+ r->props.raw_key_size += ikey.size();
if (!r->persist_user_defined_timestamps) {
r->props.raw_key_size -= r->ts_sz;
}
@@ -2028,7 +2030,7 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries first.
if (ok() && !empty_data_block) {
r->index_builder->AddIndexEntry(
- r->last_key, nullptr /* no next data block */, r->pending_handle,
+ r->last_ikey, nullptr /* no next data block */, r->pending_handle,
&r->index_separator_scratch);
}
}


@@ -76,8 +76,9 @@ PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
const Slice* next_key) {
- // Use == to send the request only once
- if (keys_added_to_partition_ == keys_per_partition_) {
+ // (NOTE: Can't just use ==, because keys_added_to_partition_ might be
+ //  incremented by more than one.)
+ if (keys_added_to_partition_ >= keys_per_partition_) {
// Currently only index builder is in charge of cutting a partition. We keep
// requesting until it is granted.
p_index_builder_->RequestPartitionCut();