mirror of https://github.com/facebook/rocksdb.git
Fix auto_prefix_mode performance with partitioned filters (#10012)
Summary: Essentially moved the RangeMayExist implementation from FullFilterBlockReader into FilterBlockReaderCommon so that it applies to partitioned filters as well. (The function is not called for the block-based filter case.) RangeMayExist is essentially a series of checks around a possible PrefixMayMatch, and I'm confident those checks should be the same for partitioned as for full filters. (I think it's likely that bugs remain in those checks, but this change is overall a simplifying one.)

Added auto_prefix_mode support to db_bench.

Other small fixes as well.

Fixes https://github.com/facebook/rocksdb/issues/10003

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10012

Test Plan: Expanded the unit test that uses statistics to check for filter optimization; it fails without the production code changes here.

Performance: populate two DBs with
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters
```

Observe no measurable change in non-partitioned performance:
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 15 runs] : 11798 (± 331) ops/sec
After: seekrandom [AVG 15 runs] : 11724 (± 315) ops/sec

Observe a big improvement with partitioned filters (also supported by bloom use statistics):
```
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 12 runs] : 2942 (± 57) ops/sec
After: seekrandom [AVG 12 runs] : 7489 (± 184) ops/sec

Reviewed By: siying
Differential Revision: D36469796
Pulled By: pdillinger
fbshipit-source-id: bcf1e2a68d347b32adb2b27384f945434e7a266d
This commit is contained in:
parent
c6d326d3d7
commit
280b9f371a
|
@ -6309,9 +6309,11 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
|
|||
|
||||
#ifndef ROCKSDB_LITE
|
||||
TEST_F(DBTest2, AutoPrefixMode1) {
|
||||
do {
|
||||
// create a DB with block prefix index
|
||||
BlockBasedTableOptions table_options;
|
||||
Options options = CurrentOptions();
|
||||
BlockBasedTableOptions table_options =
|
||||
*options.table_factory->GetOptions<BlockBasedTableOptions>();
|
||||
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
||||
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
||||
|
@ -6418,6 +6420,7 @@ TEST_F(DBTest2, AutoPrefixMode1) {
|
|||
ASSERT_TRUE(iterator->Valid());
|
||||
ASSERT_EQ("a1", iterator->key().ToString());
|
||||
}
|
||||
} while (ChangeOptions(kSkipPlainTable));
|
||||
}
|
||||
|
||||
class RenameCurrentTest : public DBTestBase,
|
||||
|
|
|
@ -188,16 +188,7 @@ class FilterBlockReader {
|
|||
const Slice* const const_ikey_ptr,
|
||||
bool* filter_checked, bool need_upper_bound_check,
|
||||
bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) {
|
||||
if (need_upper_bound_check) {
|
||||
return true;
|
||||
}
|
||||
*filter_checked = true;
|
||||
Slice prefix = prefix_extractor->Transform(user_key_without_ts);
|
||||
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io,
|
||||
const_ikey_ptr, /* get_context */ nullptr,
|
||||
lookup_context);
|
||||
}
|
||||
BlockCacheLookupContext* lookup_context) = 0;
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -94,6 +94,64 @@ size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage()
|
|||
: 0;
|
||||
}
|
||||
|
||||
template <typename TBlocklike>
|
||||
bool FilterBlockReaderCommon<TBlocklike>::RangeMayExist(
|
||||
const Slice* iterate_upper_bound, const Slice& user_key_without_ts,
|
||||
const SliceTransform* prefix_extractor, const Comparator* comparator,
|
||||
const Slice* const const_ikey_ptr, bool* filter_checked,
|
||||
bool need_upper_bound_check, bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) {
|
||||
if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) {
|
||||
*filter_checked = false;
|
||||
return true;
|
||||
}
|
||||
Slice prefix = prefix_extractor->Transform(user_key_without_ts);
|
||||
if (need_upper_bound_check &&
|
||||
!IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
|
||||
*filter_checked = false;
|
||||
return true;
|
||||
} else {
|
||||
*filter_checked = true;
|
||||
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io,
|
||||
const_ikey_ptr, /* get_context */ nullptr,
|
||||
lookup_context);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TBlocklike>
|
||||
bool FilterBlockReaderCommon<TBlocklike>::IsFilterCompatible(
|
||||
const Slice* iterate_upper_bound, const Slice& prefix,
|
||||
const Comparator* comparator) const {
|
||||
// Try to reuse the bloom filter in the SST table if prefix_extractor in
|
||||
// mutable_cf_options has changed. If range [user_key, upper_bound) all
|
||||
// share the same prefix then we may still be able to use the bloom filter.
|
||||
const SliceTransform* const prefix_extractor = table_prefix_extractor();
|
||||
if (iterate_upper_bound != nullptr && prefix_extractor) {
|
||||
if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
|
||||
return false;
|
||||
}
|
||||
Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
|
||||
// first check if user_key and upper_bound all share the same prefix
|
||||
if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform,
|
||||
false) != 0) {
|
||||
// second check if user_key's prefix is the immediate predecessor of
|
||||
// upper_bound and have the same length. If so, we know for sure all
|
||||
// keys in the range [user_key, upper_bound) share the same prefix.
|
||||
// Also need to make sure upper_bound are full length to ensure
|
||||
// correctness
|
||||
if (!full_length_enabled_ ||
|
||||
iterate_upper_bound->size() != prefix_extractor_full_length_ ||
|
||||
!comparator->IsSameLengthImmediateSuccessor(prefix,
|
||||
*iterate_upper_bound)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Explicitly instantiate templates for both "blocklike" types we use.
|
||||
// This makes it possible to keep the template definitions in the .cc file.
|
||||
template class FilterBlockReaderCommon<BlockContents>;
|
||||
|
|
|
@ -26,7 +26,19 @@ class FilterBlockReaderCommon : public FilterBlockReader {
|
|||
CachableEntry<TBlocklike>&& filter_block)
|
||||
: table_(t), filter_block_(std::move(filter_block)) {
|
||||
assert(table_);
|
||||
const SliceTransform* const prefix_extractor = table_prefix_extractor();
|
||||
if (prefix_extractor) {
|
||||
full_length_enabled_ =
|
||||
prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
|
||||
}
|
||||
}
|
||||
|
||||
bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
|
||||
const SliceTransform* prefix_extractor,
|
||||
const Comparator* comparator,
|
||||
const Slice* const const_ikey_ptr, bool* filter_checked,
|
||||
bool need_upper_bound_check, bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) override;
|
||||
|
||||
protected:
|
||||
static Status ReadFilterBlock(const BlockBasedTable* table,
|
||||
|
@ -47,9 +59,15 @@ class FilterBlockReaderCommon : public FilterBlockReader {
|
|||
|
||||
size_t ApproximateFilterBlockMemoryUsage() const;
|
||||
|
||||
private:
|
||||
bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
|
||||
const Comparator* comparator) const;
|
||||
|
||||
private:
|
||||
const BlockBasedTable* table_;
|
||||
CachableEntry<TBlocklike> filter_block_;
|
||||
size_t prefix_extractor_full_length_ = 0;
|
||||
bool full_length_enabled_ = false;
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -120,11 +120,6 @@ FullFilterBlockReader::FullFilterBlockReader(
|
|||
const BlockBasedTable* t,
|
||||
CachableEntry<ParsedFullFilterBlock>&& filter_block)
|
||||
: FilterBlockReaderCommon(t, std::move(filter_block)) {
|
||||
const SliceTransform* const prefix_extractor = table_prefix_extractor();
|
||||
if (prefix_extractor) {
|
||||
full_length_enabled_ =
|
||||
prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
|
||||
}
|
||||
}
|
||||
|
||||
bool FullFilterBlockReader::KeyMayMatch(
|
||||
|
@ -306,60 +301,4 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
|
|||
return usage;
|
||||
}
|
||||
|
||||
bool FullFilterBlockReader::RangeMayExist(
|
||||
const Slice* iterate_upper_bound, const Slice& user_key_without_ts,
|
||||
const SliceTransform* prefix_extractor, const Comparator* comparator,
|
||||
const Slice* const const_ikey_ptr, bool* filter_checked,
|
||||
bool need_upper_bound_check, bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) {
|
||||
if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) {
|
||||
*filter_checked = false;
|
||||
return true;
|
||||
}
|
||||
Slice prefix = prefix_extractor->Transform(user_key_without_ts);
|
||||
if (need_upper_bound_check &&
|
||||
!IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
|
||||
*filter_checked = false;
|
||||
return true;
|
||||
} else {
|
||||
*filter_checked = true;
|
||||
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io,
|
||||
const_ikey_ptr, /* get_context */ nullptr,
|
||||
lookup_context);
|
||||
}
|
||||
}
|
||||
|
||||
bool FullFilterBlockReader::IsFilterCompatible(
|
||||
const Slice* iterate_upper_bound, const Slice& prefix,
|
||||
const Comparator* comparator) const {
|
||||
// Try to reuse the bloom filter in the SST table if prefix_extractor in
|
||||
// mutable_cf_options has changed. If range [user_key, upper_bound) all
|
||||
// share the same prefix then we may still be able to use the bloom filter.
|
||||
const SliceTransform* const prefix_extractor = table_prefix_extractor();
|
||||
if (iterate_upper_bound != nullptr && prefix_extractor) {
|
||||
if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
|
||||
return false;
|
||||
}
|
||||
Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
|
||||
// first check if user_key and upper_bound all share the same prefix
|
||||
if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform,
|
||||
false) != 0) {
|
||||
// second check if user_key's prefix is the immediate predecessor of
|
||||
// upper_bound and have the same length. If so, we know for sure all
|
||||
// keys in the range [user_key, upper_bound) share the same prefix.
|
||||
// Also need to make sure upper_bound are full length to ensure
|
||||
// correctness
|
||||
if (!full_length_enabled_ ||
|
||||
iterate_upper_bound->size() != prefix_extractor_full_length_ ||
|
||||
!comparator->IsSameLengthImmediateSuccessor(prefix,
|
||||
*iterate_upper_bound)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -131,25 +131,12 @@ class FullFilterBlockReader
|
|||
uint64_t block_offset, const bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) override;
|
||||
size_t ApproximateMemoryUsage() const override;
|
||||
bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
|
||||
const SliceTransform* prefix_extractor,
|
||||
const Comparator* comparator,
|
||||
const Slice* const const_ikey_ptr, bool* filter_checked,
|
||||
bool need_upper_bound_check, bool no_io,
|
||||
BlockCacheLookupContext* lookup_context) override;
|
||||
|
||||
private:
|
||||
bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
|
||||
BlockCacheLookupContext* lookup_context) const;
|
||||
void MayMatch(MultiGetRange* range, bool no_io,
|
||||
const SliceTransform* prefix_extractor,
|
||||
BlockCacheLookupContext* lookup_context) const;
|
||||
bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
|
||||
const Comparator* comparator) const;
|
||||
|
||||
private:
|
||||
bool full_length_enabled_;
|
||||
size_t prefix_extractor_full_length_;
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -318,6 +318,8 @@ DEFINE_bool(reverse_iterator, false,
|
|||
"When true use Prev rather than Next for iterators that do "
|
||||
"Seek and then Next");
|
||||
|
||||
DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
|
||||
|
||||
DEFINE_int64(max_scan_distance, 0,
|
||||
"Used to define iterate_upper_bound (or iterate_lower_bound "
|
||||
"if FLAGS_reverse_iterator is set to true) when value is nonzero");
|
||||
|
@ -2563,7 +2565,7 @@ class Benchmark {
|
|||
private:
|
||||
std::shared_ptr<Cache> cache_;
|
||||
std::shared_ptr<Cache> compressed_cache_;
|
||||
const SliceTransform* prefix_extractor_;
|
||||
std::shared_ptr<const SliceTransform> prefix_extractor_;
|
||||
DBWithColumnFamilies db_;
|
||||
std::vector<DBWithColumnFamilies> multi_dbs_;
|
||||
int64_t num_;
|
||||
|
@ -2966,7 +2968,9 @@ class Benchmark {
|
|||
Benchmark()
|
||||
: cache_(NewCache(FLAGS_cache_size)),
|
||||
compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
|
||||
prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
|
||||
prefix_extractor_(FLAGS_prefix_size != 0
|
||||
? NewFixedPrefixTransform(FLAGS_prefix_size)
|
||||
: nullptr),
|
||||
num_(FLAGS_num),
|
||||
key_size_(FLAGS_key_size),
|
||||
user_timestamp_size_(FLAGS_user_timestamp_size),
|
||||
|
@ -3057,7 +3061,6 @@ class Benchmark {
|
|||
|
||||
~Benchmark() {
|
||||
DeleteDBs();
|
||||
delete prefix_extractor_;
|
||||
if (cache_.get() != nullptr) {
|
||||
// Clear cache reference first
|
||||
open_options_.write_buffer_manager.reset();
|
||||
|
@ -4008,10 +4011,7 @@ class Benchmark {
|
|||
FLAGS_fifo_compaction_allow_compaction);
|
||||
options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
|
||||
#endif // ROCKSDB_LITE
|
||||
if (FLAGS_prefix_size != 0) {
|
||||
options.prefix_extractor.reset(
|
||||
NewFixedPrefixTransform(FLAGS_prefix_size));
|
||||
}
|
||||
options.prefix_extractor = prefix_extractor_;
|
||||
if (FLAGS_use_uint64_comparator) {
|
||||
options.comparator = test::Uint64Comparator();
|
||||
if (FLAGS_key_size != 8) {
|
||||
|
@ -6508,6 +6508,7 @@ class Benchmark {
|
|||
}
|
||||
}
|
||||
}
|
||||
options.auto_prefix_mode = FLAGS_auto_prefix_mode;
|
||||
|
||||
std::unique_ptr<const char[]> key_guard;
|
||||
Slice key = AllocateKey(&key_guard);
|
||||
|
@ -6537,6 +6538,14 @@ class Benchmark {
|
|||
&upper_bound);
|
||||
options.iterate_upper_bound = &upper_bound;
|
||||
}
|
||||
} else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
|
||||
!FLAGS_reverse_iterator) {
|
||||
// Set upper bound to next prefix
|
||||
auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
|
||||
std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
|
||||
mutable_upper_bound[prefix_size_ - 1]++;
|
||||
upper_bound = Slice(upper_bound.data(), prefix_size_);
|
||||
options.iterate_upper_bound = &upper_bound;
|
||||
}
|
||||
|
||||
// Pick a Iterator to use
|
||||
|
|
Loading…
Reference in New Issue