Fix auto_prefix_mode performance with partitioned filters (#10012)

Summary:
Essentially moved the RangeMayExist implementation from
FullFilterBlockReader into FilterBlockReaderCommon so that it applies to
partitioned filters as well. (The function is not called for the
block-based filter case.) RangeMayExist is essentially a series of checks
around a possible PrefixMayExist, and I'm confident those checks should
be the same for partitioned as for full filters. (I think it's likely
that bugs remain in those checks, but this change is overall a simplifying
one.)

Added auto_prefix_mode support to db_bench

Other small fixes as well

Fixes https://github.com/facebook/rocksdb/issues/10003

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10012

Test Plan:
Expanded the unit test that uses statistics to check for the filter
optimization; it fails without the production code changes here

Performance: populate two DBs with
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters
```

Observe no measurable change in non-partitioned performance
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 15 runs] : 11798 (± 331) ops/sec
After: seekrandom [AVG 15 runs] : 11724 (± 315) ops/sec

Observe big improvement with partitioned (also supported by bloom use statistics)
```
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 12 runs] : 2942 (± 57) ops/sec
After: seekrandom [AVG 12 runs] : 7489 (± 184) ops/sec

Reviewed By: siying

Differential Revision: D36469796

Pulled By: pdillinger

fbshipit-source-id: bcf1e2a68d347b32adb2b27384f945434e7a266d
This commit is contained in:
Peter Dillinger 2022-05-19 13:09:03 -07:00 committed by Facebook GitHub Bot
parent c6d326d3d7
commit 280b9f371a
7 changed files with 188 additions and 183 deletions

View File

@ -6309,115 +6309,118 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
#ifndef ROCKSDB_LITE
TEST_F(DBTest2, AutoPrefixMode1) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.statistics = CreateDBStatistics();
do {
// create a DB with block prefix index
Options options = CurrentOptions();
BlockBasedTableOptions table_options =
*options.table_factory->GetOptions<BlockBasedTableOptions>();
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.statistics = CreateDBStatistics();
Reopen(options);
Reopen(options);
Random rnd(301);
std::string large_value = rnd.RandomString(500);
Random rnd(301);
std::string large_value = rnd.RandomString(500);
ASSERT_OK(Put("a1", large_value));
ASSERT_OK(Put("x1", large_value));
ASSERT_OK(Put("y1", large_value));
ASSERT_OK(Flush());
ASSERT_OK(Put("a1", large_value));
ASSERT_OK(Put("x1", large_value));
ASSERT_OK(Put("y1", large_value));
ASSERT_OK(Flush());
ReadOptions ro;
ro.total_order_seek = false;
ro.auto_prefix_mode = true;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ReadOptions ro;
ro.total_order_seek = false;
ro.auto_prefix_mode = true;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
std::string ub_str = "b9";
Slice ub(ub_str);
ro.iterate_upper_bound = &ub;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "z";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "c";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
// The same queries without recreating iterator
{
ub_str = "b9";
ub = Slice(ub_str);
std::string ub_str = "b9";
Slice ub(ub_str);
ro.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "z";
ub = Slice(ub_str);
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "c";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
// The same queries without recreating iterator
{
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekForPrev("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
ub_str = "zz";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekToLast();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("y1", iterator->key().ToString());
ub_str = "z";
ub = Slice(ub_str);
iterator->SeekToFirst();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
}
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "c";
ub = Slice(ub_str);
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekForPrev("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "zz";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekToLast();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("y1", iterator->key().ToString());
iterator->SeekToFirst();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
}
} while (ChangeOptions(kSkipPlainTable));
}
class RenameCurrentTest : public DBTestBase,

View File

@ -188,16 +188,7 @@ class FilterBlockReader {
const Slice* const const_ikey_ptr,
bool* filter_checked, bool need_upper_bound_check,
bool no_io,
BlockCacheLookupContext* lookup_context) {
if (need_upper_bound_check) {
return true;
}
*filter_checked = true;
Slice prefix = prefix_extractor->Transform(user_key_without_ts);
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io,
const_ikey_ptr, /* get_context */ nullptr,
lookup_context);
}
BlockCacheLookupContext* lookup_context) = 0;
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -94,6 +94,64 @@ size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage()
: 0;
}
// Decides whether keys starting at `user_key_without_ts` may exist, using the
// prefix filter when it is applicable.
//
// `*filter_checked` is always written: it is set to true only when
// PrefixMayMatch() is actually consulted, and to false on every path that
// returns true without consulting it.
//
// Returns true (without consulting the filter) when there is no
// `prefix_extractor`, when the key is outside the extractor's domain, or when
// `need_upper_bound_check` is set and IsFilterCompatible() rejects
// `iterate_upper_bound`. Otherwise returns the result of PrefixMayMatch() for
// the key's prefix.
template <typename TBlocklike>
bool FilterBlockReaderCommon<TBlocklike>::RangeMayExist(
    const Slice* iterate_upper_bound, const Slice& user_key_without_ts,
    const SliceTransform* prefix_extractor, const Comparator* comparator,
    const Slice* const const_ikey_ptr, bool* filter_checked,
    bool need_upper_bound_check, bool no_io,
    BlockCacheLookupContext* lookup_context) {
  // No prefix can be formed for this key, so the filter is not applicable.
  const bool key_has_prefix =
      prefix_extractor != nullptr &&
      prefix_extractor->InDomain(user_key_without_ts);
  if (!key_has_prefix) {
    *filter_checked = false;
    return true;
  }

  const Slice key_prefix = prefix_extractor->Transform(user_key_without_ts);

  // The compatibility check only runs when the caller asked for it
  // (short-circuit keeps IsFilterCompatible() from being called otherwise).
  const bool filter_usable =
      !need_upper_bound_check ||
      IsFilterCompatible(iterate_upper_bound, key_prefix, comparator);
  *filter_checked = filter_usable;
  if (!filter_usable) {
    return true;
  }

  return PrefixMayMatch(key_prefix, prefix_extractor, kNotValid, no_io,
                        const_ikey_ptr, /* get_context */ nullptr,
                        lookup_context);
}
// Reports whether the table's prefix filter can be used for a scan that
// starts at a key with prefix `prefix` and is limited by
// `iterate_upper_bound`.
//
// Try to reuse the bloom filter in the SST table if prefix_extractor in
// mutable_cf_options has changed. If range [user_key, upper_bound) all
// share the same prefix then we may still be able to use the bloom filter.
//
// Returns false when there is no upper bound, when the table has no prefix
// extractor, or when the upper bound is outside that extractor's domain.
template <typename TBlocklike>
bool FilterBlockReaderCommon<TBlocklike>::IsFilterCompatible(
    const Slice* iterate_upper_bound, const Slice& prefix,
    const Comparator* comparator) const {
  const SliceTransform* const table_extractor = table_prefix_extractor();
  if (iterate_upper_bound == nullptr || !table_extractor) {
    return false;
  }
  if (!table_extractor->InDomain(*iterate_upper_bound)) {
    return false;
  }

  // First check: user_key and upper_bound share the same prefix.
  const Slice bound_prefix = table_extractor->Transform(*iterate_upper_bound);
  if (comparator->CompareWithoutTimestamp(prefix, false, bound_prefix,
                                          false) == 0) {
    return true;
  }

  // Second check: user_key's prefix is the immediate predecessor of
  // upper_bound and has the same length. If so, all keys in the range
  // [user_key, upper_bound) share the same prefix. upper_bound must also be
  // full length to ensure correctness.
  return full_length_enabled_ &&
         iterate_upper_bound->size() == prefix_extractor_full_length_ &&
         comparator->IsSameLengthImmediateSuccessor(prefix,
                                                    *iterate_upper_bound);
}
// Explicitly instantiate templates for both "blocklike" types we use.
// This makes it possible to keep the template definitions in the .cc file.
template class FilterBlockReaderCommon<BlockContents>;

View File

@ -26,8 +26,20 @@ class FilterBlockReaderCommon : public FilterBlockReader {
CachableEntry<TBlocklike>&& filter_block)
: table_(t), filter_block_(std::move(filter_block)) {
assert(table_);
const SliceTransform* const prefix_extractor = table_prefix_extractor();
if (prefix_extractor) {
full_length_enabled_ =
prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
}
}
bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
const SliceTransform* prefix_extractor,
const Comparator* comparator,
const Slice* const const_ikey_ptr, bool* filter_checked,
bool need_upper_bound_check, bool no_io,
BlockCacheLookupContext* lookup_context) override;
protected:
static Status ReadFilterBlock(const BlockBasedTable* table,
FilePrefetchBuffer* prefetch_buffer,
@ -47,9 +59,15 @@ class FilterBlockReaderCommon : public FilterBlockReader {
size_t ApproximateFilterBlockMemoryUsage() const;
private:
bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
const Comparator* comparator) const;
private:
const BlockBasedTable* table_;
CachableEntry<TBlocklike> filter_block_;
size_t prefix_extractor_full_length_ = 0;
bool full_length_enabled_ = false;
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -120,11 +120,6 @@ FullFilterBlockReader::FullFilterBlockReader(
const BlockBasedTable* t,
CachableEntry<ParsedFullFilterBlock>&& filter_block)
: FilterBlockReaderCommon(t, std::move(filter_block)) {
const SliceTransform* const prefix_extractor = table_prefix_extractor();
if (prefix_extractor) {
full_length_enabled_ =
prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
}
}
bool FullFilterBlockReader::KeyMayMatch(
@ -306,60 +301,4 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
return usage;
}
bool FullFilterBlockReader::RangeMayExist(
const Slice* iterate_upper_bound, const Slice& user_key_without_ts,
const SliceTransform* prefix_extractor, const Comparator* comparator,
const Slice* const const_ikey_ptr, bool* filter_checked,
bool need_upper_bound_check, bool no_io,
BlockCacheLookupContext* lookup_context) {
if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) {
*filter_checked = false;
return true;
}
Slice prefix = prefix_extractor->Transform(user_key_without_ts);
if (need_upper_bound_check &&
!IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
*filter_checked = false;
return true;
} else {
*filter_checked = true;
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io,
const_ikey_ptr, /* get_context */ nullptr,
lookup_context);
}
}
bool FullFilterBlockReader::IsFilterCompatible(
const Slice* iterate_upper_bound, const Slice& prefix,
const Comparator* comparator) const {
// Try to reuse the bloom filter in the SST table if prefix_extractor in
// mutable_cf_options has changed. If range [user_key, upper_bound) all
// share the same prefix then we may still be able to use the bloom filter.
const SliceTransform* const prefix_extractor = table_prefix_extractor();
if (iterate_upper_bound != nullptr && prefix_extractor) {
if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
return false;
}
Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
// first check if user_key and upper_bound all share the same prefix
if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform,
false) != 0) {
// second check if user_key's prefix is the immediate predecessor of
// upper_bound and have the same length. If so, we know for sure all
// keys in the range [user_key, upper_bound) share the same prefix.
// Also need to make sure upper_bound are full length to ensure
// correctness
if (!full_length_enabled_ ||
iterate_upper_bound->size() != prefix_extractor_full_length_ ||
!comparator->IsSameLengthImmediateSuccessor(prefix,
*iterate_upper_bound)) {
return false;
}
}
return true;
} else {
return false;
}
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -131,25 +131,12 @@ class FullFilterBlockReader
uint64_t block_offset, const bool no_io,
BlockCacheLookupContext* lookup_context) override;
size_t ApproximateMemoryUsage() const override;
bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
const SliceTransform* prefix_extractor,
const Comparator* comparator,
const Slice* const const_ikey_ptr, bool* filter_checked,
bool need_upper_bound_check, bool no_io,
BlockCacheLookupContext* lookup_context) override;
private:
bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
BlockCacheLookupContext* lookup_context) const;
void MayMatch(MultiGetRange* range, bool no_io,
const SliceTransform* prefix_extractor,
BlockCacheLookupContext* lookup_context) const;
bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
const Comparator* comparator) const;
private:
bool full_length_enabled_;
size_t prefix_extractor_full_length_;
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -318,6 +318,8 @@ DEFINE_bool(reverse_iterator, false,
"When true use Prev rather than Next for iterators that do "
"Seek and then Next");
DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
DEFINE_int64(max_scan_distance, 0,
"Used to define iterate_upper_bound (or iterate_lower_bound "
"if FLAGS_reverse_iterator is set to true) when value is nonzero");
@ -2563,7 +2565,7 @@ class Benchmark {
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<Cache> compressed_cache_;
const SliceTransform* prefix_extractor_;
std::shared_ptr<const SliceTransform> prefix_extractor_;
DBWithColumnFamilies db_;
std::vector<DBWithColumnFamilies> multi_dbs_;
int64_t num_;
@ -2966,7 +2968,9 @@ class Benchmark {
Benchmark()
: cache_(NewCache(FLAGS_cache_size)),
compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
prefix_extractor_(FLAGS_prefix_size != 0
? NewFixedPrefixTransform(FLAGS_prefix_size)
: nullptr),
num_(FLAGS_num),
key_size_(FLAGS_key_size),
user_timestamp_size_(FLAGS_user_timestamp_size),
@ -3057,7 +3061,6 @@ class Benchmark {
~Benchmark() {
DeleteDBs();
delete prefix_extractor_;
if (cache_.get() != nullptr) {
// Clear cache reference first
open_options_.write_buffer_manager.reset();
@ -4008,10 +4011,7 @@ class Benchmark {
FLAGS_fifo_compaction_allow_compaction);
options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
#endif // ROCKSDB_LITE
if (FLAGS_prefix_size != 0) {
options.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size));
}
options.prefix_extractor = prefix_extractor_;
if (FLAGS_use_uint64_comparator) {
options.comparator = test::Uint64Comparator();
if (FLAGS_key_size != 8) {
@ -6508,6 +6508,7 @@ class Benchmark {
}
}
}
options.auto_prefix_mode = FLAGS_auto_prefix_mode;
std::unique_ptr<const char[]> key_guard;
Slice key = AllocateKey(&key_guard);
@ -6537,6 +6538,14 @@ class Benchmark {
&upper_bound);
options.iterate_upper_bound = &upper_bound;
}
} else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
!FLAGS_reverse_iterator) {
// Set upper bound to next prefix
auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
mutable_upper_bound[prefix_size_ - 1]++;
upper_bound = Slice(upper_bound.data(), prefix_size_);
options.iterate_upper_bound = &upper_bound;
}
// Pick a Iterator to use