Update/clarify required properties for prefix extractors (#10245)

Summary:
Most of the properties listed as required for prefix extractors
are not really required but offer some conveniences. This updates API
comments to clarify actual requirements, and adds tests to demonstrate
how previously presumed requirements can be safely violated.

This might seem like a useless exercise, but this relaxing of requirements
would be needed if we generalize prefixes to group keys not just at the
byte level but also based on bits or arbitrary value ranges. For
applications without a "natural" prefix size, having only byte-level
granularity often means one prefix size to the next differs in magnitude
by a factor of 256.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10245

Test Plan: Tests added, also covering missing Iterator cases from https://github.com/facebook/rocksdb/issues/10244

Reviewed By: bjlemaire

Differential Revision: D37371559

Pulled By: pdillinger

fbshipit-source-id: ab2dd719992eea7656e9042cf8542393e02fa244
This commit is contained in:
Peter Dillinger 2022-06-28 16:08:30 -07:00 committed by Facebook GitHub Bot
parent ca81b80d83
commit d96febeeaa
2 changed files with 405 additions and 11 deletions

View File

@ -3113,6 +3113,382 @@ TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
it.reset();
}
namespace {
class BackwardBytewiseComparator : public Comparator {
public:
const char* Name() const override { return "BackwardBytewiseComparator"; }
int Compare(const Slice& a, const Slice& b) const override {
int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
const char* a_end = a.data() + a.size();
const char* b_end = b.data() + b.size();
for (int i = -1; i >= min_size_neg; --i) {
if (a_end[i] != b_end[i]) {
if (static_cast<unsigned char>(a_end[i]) <
static_cast<unsigned char>(b_end[i])) {
return -1;
} else {
return 1;
}
}
}
return static_cast<int>(a.size()) - static_cast<int>(b.size());
}
void FindShortestSeparator(std::string* /*start*/,
const Slice& /*limit*/) const override {}
void FindShortSuccessor(std::string* /*key*/) const override {}
};
const BackwardBytewiseComparator kBackwardBytewiseComparator{};
class FixedSuffix4Transform : public SliceTransform {
const char* Name() const override { return "FixedSuffixTransform"; }
Slice Transform(const Slice& src) const override {
return Slice(src.data() + src.size() - 4, 4);
}
bool InDomain(const Slice& src) const override { return src.size() >= 4; }
};
std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
if (sst) {
return {
options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
} else {
auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
return {hit + miss, miss};
}
}
std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
uint64_t useful) {
return {checked, useful};
}
} // namespace
// This uses a prefix_extractor + comparator combination that violates
// one of the old obsolete, unnecessary axioms of prefix extraction:
// * key.starts_with(prefix(key))
// This axiom is not really needed, and we validate that here.
TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
bbto.whole_key_filtering = false;
Options options = CurrentOptions();
options.comparator = &kBackwardBytewiseComparator;
options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
options.memtable_prefix_bloom_size_ratio = 0.1;
options.statistics = CreateDBStatistics();
DestroyAndReopen(options);
ASSERT_OK(Put("321aaaa", "val1"));
ASSERT_OK(Put("112aaaa", "val2"));
ASSERT_OK(Put("009aaaa", "val3"));
ASSERT_OK(Put("baa", "val4")); // out of domain
ASSERT_OK(Put("321abaa", "val5"));
ASSERT_OK(Put("zzz", "val6")); // out of domain
for (auto flushed : {false, true}) {
SCOPED_TRACE("flushed=" + std::to_string(flushed));
if (flushed) {
ASSERT_OK(Flush());
}
ReadOptions read_options;
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
}
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
{
Slice ub("999aaaa");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa"), 3);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
Slice ub("999abaa");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "abaa"), 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
Slice ub("999acaa");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "acaa"), 0);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
}
{
Slice ub("zzzz");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "baa"), 3);
if (flushed) { // TODO: fix memtable case
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
}
}
}
// This uses a prefix_extractor + comparator combination that violates
// one of the old obsolete, unnecessary axioms of prefix extraction:
// * Compare(prefix(key), key) <= 0
// This axiom is not really needed, and we validate that here.
TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
bbto.whole_key_filtering = false;
Options options = CurrentOptions();
options.comparator = ReverseBytewiseComparator();
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
options.memtable_prefix_bloom_size_ratio = 0.1;
options.statistics = CreateDBStatistics();
DestroyAndReopen(options);
ASSERT_OK(Put("aaaa123", "val1"));
ASSERT_OK(Put("aaaa211", "val2"));
ASSERT_OK(Put("aaaa900", "val3"));
ASSERT_OK(Put("aab", "val4")); // out of domain
ASSERT_OK(Put("aaba123", "val5"));
ASSERT_OK(Put("qqqq123", "val7"));
ASSERT_OK(Put("qqqq", "val8"));
ASSERT_OK(Put("zzz", "val8")); // out of domain
for (auto flushed : {false, true}) {
SCOPED_TRACE("flushed=" + std::to_string(flushed));
if (flushed) {
ASSERT_OK(Flush());
}
ReadOptions read_options;
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
// TODO: why needed?
get_perf_context()->bloom_memtable_hit_count = 0;
get_perf_context()->bloom_memtable_miss_count = 0;
}
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
{
Slice ub("aaaa000");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
// Note: prefix does work as upper bound
Slice ub("aaaa");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
// Note: prefix does not work here as seek key
Slice ub("aaaa500");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa"), 0);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
Slice ub("aaba000");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaba999"), 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
Slice ub("aaca000");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaca999"), 0);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
}
{
Slice ub("aaaz");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "zzz"), 5);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
{
// Note: prefix does work here as seek key, but only finds key equal
// to prefix (others with same prefix are less)
read_options.auto_prefix_mode = false;
read_options.iterate_upper_bound = nullptr;
read_options.prefix_same_as_start = true;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "qqqq"), 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
}
}
namespace {
// A weird comparator that in combination with NonIdempotentFixed4Transform
// breaks an old axiom of prefix filtering.
class WeirdComparator : public Comparator {
public:
const char* Name() const override { return "WeirdComparator"; }
int Compare(const Slice& a, const Slice& b) const override {
bool a_in = a.size() >= 5;
bool b_in = b.size() >= 5;
if (a_in != b_in) {
// Order keys after prefixes
return a_in - b_in;
}
if (a_in) {
return BytewiseComparator()->Compare(a, b);
} else {
// Different ordering on the prefixes
return ReverseBytewiseComparator()->Compare(a, b);
}
}
void FindShortestSeparator(std::string* /*start*/,
const Slice& /*limit*/) const override {}
void FindShortSuccessor(std::string* /*key*/) const override {}
};
const WeirdComparator kWeirdComparator{};
// Non-idempotentent because prefix is always 4 bytes, but this is
// out-of-domain for keys to be assigned prefixes (>= 5 bytes)
class NonIdempotentFixed4Transform : public SliceTransform {
const char* Name() const override { return "NonIdempotentFixed4Transform"; }
Slice Transform(const Slice& src) const override {
return Slice(src.data(), 4);
}
bool InDomain(const Slice& src) const override { return src.size() >= 5; }
};
} // namespace
// This uses a prefix_extractor + comparator combination that violates
// two of the old obsolete, unnecessary axioms of prefix extraction:
// * prefix(prefix(key)) == prefix(key)
// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
// This axiom is not really needed, and we validate that here.
TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
bbto.whole_key_filtering = false;
Options options = CurrentOptions();
options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
options.memtable_prefix_bloom_size_ratio = 0.1;
options.statistics = CreateDBStatistics();
for (auto weird_comparator : {false, true}) {
if (weird_comparator) {
options.comparator = &kWeirdComparator;
}
DestroyAndReopen(options);
ASSERT_OK(Put("aaaa123", "val1"));
ASSERT_OK(Put("aaaa211", "val2"));
ASSERT_OK(Put("aaaa900", "val3"));
ASSERT_OK(Put("aab", "val4")); // out of domain
ASSERT_OK(Put("aaba123", "val5"));
ASSERT_OK(Put("qqqq123", "val7"));
ASSERT_OK(Put("qqqq", "val8")); // out of domain
ASSERT_OK(Put("zzzz", "val8")); // out of domain
for (auto flushed : {false, true}) {
SCOPED_TRACE("flushed=" + std::to_string(flushed));
if (flushed) {
ASSERT_OK(Flush());
}
ReadOptions read_options;
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
// TODO: why needed?
get_perf_context()->bloom_memtable_hit_count = 0;
get_perf_context()->bloom_memtable_miss_count = 0;
}
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
{
Slice ub("aaaa999");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
// Note: prefix as seek key is not bloom-optimized
// Note: the count works with weird_comparator because "aaaa" is
// ordered as the last of the prefixes
Slice ub("aaaa999");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaaa"), 3);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
{
Slice ub("aaba9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaba0"), 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
Slice ub("aaca9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aaca0"), 0);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
}
{
Slice ub("qqqq9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
}
{
// Note: prefix as seek key is not bloom-optimized
Slice ub("qqqq9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
{
// Note: prefix as seek key is not bloom-optimized
Slice ub("zzzz9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
{
Slice ub("zzzz9");
read_options.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
}
}
}
}
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE

View File

@ -240,18 +240,36 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// Dynamically changeable through SetOptions() API
int level0_file_num_compaction_trigger = 4;
// If non-nullptr, use the specified function to determine the
// prefixes for keys. These prefixes will be placed in the filter.
// Depending on the workload, this can reduce the number of read-IOP
// cost for scans when a prefix is passed via ReadOptions to
// db.NewIterator(). For prefix filtering to work properly,
// "prefix_extractor" and "comparator" must be such that the following
// properties hold:
// If non-nullptr, use the specified function to put keys in contiguous
// groups called "prefixes". These prefixes are used to place one
// representative entry for the group into the Bloom filter
// rather than an entry for each key (see whole_key_filtering).
// Under certain conditions, this enables optimizing some range queries
// (Iterators) in addition to some point lookups (Get/MultiGet).
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
// 4) prefix(prefix(key)) == prefix(key)
// Together `prefix_extractor` and `comparator` must satisfy one essential
// property for valid prefix filtering of range queries:
// If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
// InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
// Then InDomain(k2) and prefix(k2) == prefix(k1)
//
// In other words, all keys with the same prefix must be in a contiguous
// group by comparator order, and cannot be interrupted by keys with no
// prefix ("out of domain"). (This makes it valid to conclude that no
// entries within some bounds are present if the upper and lower bounds
// have a common prefix and no entries with that same prefix are present.)
//
// Some other properties are recommended but not strictly required. Under
// most sensible comparators, the following will need to hold true to
// satisfy the essential property above:
// * "Prefix is a prefix": key.starts_with(prefix(key))
// * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
// Compare(prefix(k1), prefix(k2)) <= 0
//
// The next two properties ensure that seeking to a prefix allows
// enumerating all entries with that prefix:
// * "Prefix starts the group": Compare(prefix(key), key) <= 0
// * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
//
// Default: nullptr
std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;