Update/clarify required properties for prefix extractors (#10245)

Summary: Most of the properties listed as required for prefix extractors are not really required but offer some conveniences. This updates API comments to clarify actual requirements, and adds tests to demonstrate how previously presumed requirements can be safely violated. This might seem like a useless exercise, but this relaxing of requirements would be needed if we generalize prefixes to group keys not just at the byte level but also based on bits or arbitrary value ranges. For applications without a "natural" prefix size, having only byte-level granularity often means one prefix size to the next differs in magnitude by a factor of 256. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10245 Test Plan: Tests added, also covering missing Iterator cases from https://github.com/facebook/rocksdb/issues/10244 Reviewed By: bjlemaire Differential Revision: D37371559 Pulled By: pdillinger fbshipit-source-id: ab2dd719992eea7656e9042cf8542393e02fa244
2022-06-28 16:08:30 -07:00 · 2022-06-28 16:08:30 -07:00 · d96febeeaa
parent ca81b80d83
commit d96febeeaa
2 changed files with 405 additions and 11 deletions
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@ -3113,6 +3113,382 @@ TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
  it.reset();
 }

+namespace {
+class BackwardBytewiseComparator : public Comparator {
+ public:
+  const char* Name() const override { return "BackwardBytewiseComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
+    const char* a_end = a.data() + a.size();
+    const char* b_end = b.data() + b.size();
+    for (int i = -1; i >= min_size_neg; --i) {
+      if (a_end[i] != b_end[i]) {
+        if (static_cast<unsigned char>(a_end[i]) <
+            static_cast<unsigned char>(b_end[i])) {
+          return -1;
+        } else {
+          return 1;
+        }
+      }
+    }
+    return static_cast<int>(a.size()) - static_cast<int>(b.size());
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+const BackwardBytewiseComparator kBackwardBytewiseComparator{};
+
+class FixedSuffix4Transform : public SliceTransform {
+  const char* Name() const override { return "FixedSuffixTransform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data() + src.size() - 4, 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 4; }
+};
+
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+  if (sst) {
+    return {
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
+  } else {
+    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+    return {hit + miss, miss};
+  }
+}
+
+std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
+                                               uint64_t useful) {
+  return {checked, useful};
+}
+}  // namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * key.starts_with(prefix(key))
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = &kBackwardBytewiseComparator;
+  options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("321aaaa", "val1"));
+  ASSERT_OK(Put("112aaaa", "val2"));
+  ASSERT_OK(Put("009aaaa", "val3"));
+  ASSERT_OK(Put("baa", "val4"));  // out of domain
+  ASSERT_OK(Put("321abaa", "val5"));
+  ASSERT_OK(Put("zzz", "val6"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("999aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999abaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "abaa"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999acaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "acaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("zzzz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "baa"), 3);
+      if (flushed) {  // TODO: fix memtable case
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * Compare(prefix(key), key) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("aaaa123", "val1"));
+  ASSERT_OK(Put("aaaa211", "val2"));
+  ASSERT_OK(Put("aaaa900", "val3"));
+  ASSERT_OK(Put("aab", "val4"));  // out of domain
+  ASSERT_OK(Put("aaba123", "val5"));
+  ASSERT_OK(Put("qqqq123", "val7"));
+  ASSERT_OK(Put("qqqq", "val8"));
+  ASSERT_OK(Put("zzz", "val8"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    } else {
+      // TODO: why needed?
+      get_perf_context()->bloom_memtable_hit_count = 0;
+      get_perf_context()->bloom_memtable_miss_count = 0;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("aaaa000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does work as upper bound
+      Slice ub("aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does not work here as seek key
+      Slice ub("aaaa500");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaba000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaba999"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaca000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaca999"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("aaaz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "zzz"), 5);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    }
+    {
+      // Note: prefix does work here as seek key, but only finds key equal
+      // to prefix (others with same prefix are less)
+      read_options.auto_prefix_mode = false;
+      read_options.iterate_upper_bound = nullptr;
+      read_options.prefix_same_as_start = true;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "qqqq"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+  }
+}
+
+namespace {
+// A weird comparator that in combination with NonIdempotentFixed4Transform
+// breaks an old axiom of prefix filtering.
+class WeirdComparator : public Comparator {
+ public:
+  const char* Name() const override { return "WeirdComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    bool a_in = a.size() >= 5;
+    bool b_in = b.size() >= 5;
+    if (a_in != b_in) {
+      // Order keys after prefixes
+      return a_in - b_in;
+    }
+    if (a_in) {
+      return BytewiseComparator()->Compare(a, b);
+    } else {
+      // Different ordering on the prefixes
+      return ReverseBytewiseComparator()->Compare(a, b);
+    }
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+const WeirdComparator kWeirdComparator{};
+
+// Non-idempotentent because prefix is always 4 bytes, but this is
+// out-of-domain for keys to be assigned prefixes (>= 5 bytes)
+class NonIdempotentFixed4Transform : public SliceTransform {
+  const char* Name() const override { return "NonIdempotentFixed4Transform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 5; }
+};
+}  // namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// two of the old obsolete, unnecessary axioms of prefix extraction:
+// * prefix(prefix(key)) == prefix(key)
+// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  for (auto weird_comparator : {false, true}) {
+    if (weird_comparator) {
+      options.comparator = &kWeirdComparator;
+    }
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("aaaa123", "val1"));
+    ASSERT_OK(Put("aaaa211", "val2"));
+    ASSERT_OK(Put("aaaa900", "val3"));
+    ASSERT_OK(Put("aab", "val4"));  // out of domain
+    ASSERT_OK(Put("aaba123", "val5"));
+    ASSERT_OK(Put("qqqq123", "val7"));
+    ASSERT_OK(Put("qqqq", "val8"));  // out of domain
+    ASSERT_OK(Put("zzzz", "val8"));  // out of domain
+
+    for (auto flushed : {false, true}) {
+      SCOPED_TRACE("flushed=" + std::to_string(flushed));
+      if (flushed) {
+        ASSERT_OK(Flush());
+      }
+      ReadOptions read_options;
+      if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+        read_options.auto_prefix_mode = true;
+      } else {
+        // TODO: why needed?
+        get_perf_context()->bloom_memtable_hit_count = 0;
+        get_perf_context()->bloom_memtable_miss_count = 0;
+      }
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      {
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        // Note: the count works with weird_comparator because "aaaa" is
+        // ordered as the last of the prefixes
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("aaba9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaba0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        Slice ub("aaca9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaca0"), 0);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+      }
+      {
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
 #endif  // ROCKSDB_LITE

 }  // namespace ROCKSDB_NAMESPACE
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -240,18 +240,36 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
  // Dynamically changeable through SetOptions() API
  int level0_file_num_compaction_trigger = 4;

-  // If non-nullptr, use the specified function to determine the
-  // prefixes for keys.  These prefixes will be placed in the filter.
-  // Depending on the workload, this can reduce the number of read-IOP
-  // cost for scans when a prefix is passed via ReadOptions to
-  // db.NewIterator().  For prefix filtering to work properly,
-  // "prefix_extractor" and "comparator" must be such that the following
-  // properties hold:
+  // If non-nullptr, use the specified function to put keys in contiguous
+  // groups called "prefixes". These prefixes are used to place one
+  // representative entry for the group into the Bloom filter
+  // rather than an entry for each key (see whole_key_filtering).
+  // Under certain conditions, this enables optimizing some range queries
+  // (Iterators) in addition to some point lookups (Get/MultiGet).
  //
-  // 1) key.starts_with(prefix(key))
-  // 2) Compare(prefix(key), key) <= 0.
-  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
-  // 4) prefix(prefix(key)) == prefix(key)
+  // Together `prefix_extractor` and `comparator` must satisfy one essential
+  // property for valid prefix filtering of range queries:
+  //   If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+  //      InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+  //   Then InDomain(k2) and prefix(k2) == prefix(k1)
+  //
+  // In other words, all keys with the same prefix must be in a contiguous
+  // group by comparator order, and cannot be interrupted by keys with no
+  // prefix ("out of domain"). (This makes it valid to conclude that no
+  // entries within some bounds are present if the upper and lower bounds
+  // have a common prefix and no entries with that same prefix are present.)
+  //
+  // Some other properties are recommended but not strictly required. Under
+  // most sensible comparators, the following will need to hold true to
+  // satisfy the essential property above:
+  // * "Prefix is a prefix": key.starts_with(prefix(key))
+  // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+  //   Compare(prefix(k1), prefix(k2)) <= 0
+  //
+  // The next two properties ensure that seeking to a prefix allows
+  // enumerating all entries with that prefix:
+  // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+  // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;