rocksdb/util/bloom_test.cc

301 lines
7.1 KiB
C++
Raw Normal View History

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
2014-05-09 15:34:18 +00:00
#ifndef GFLAGS
#include <cstdio>
int main() {
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
return 1;
}
#else
#include <gflags/gflags.h>
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
2014-09-08 17:37:05 +00:00
#include <vector>
#include "rocksdb/filter_policy.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
2014-09-08 17:37:05 +00:00
#include "util/arena.h"
2014-05-09 15:34:18 +00:00
using GFLAGS::ParseCommandLineFlags;
DEFINE_int32(bits_per_key, 10, "");
namespace rocksdb {
static const int kVerbose = 1;
static Slice Key(int i, char* buffer) {
memcpy(buffer, &i, sizeof(i));
return Slice(buffer, sizeof(i));
}
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
2014-09-08 17:37:05 +00:00
static int NextLength(int length) {
if (length < 10) {
length += 1;
} else if (length < 100) {
length += 10;
} else if (length < 1000) {
length += 100;
} else {
length += 1000;
}
return length;
}
class BloomTest {
private:
const FilterPolicy* policy_;
std::string filter_;
std::vector<std::string> keys_;
public:
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
2014-09-08 17:37:05 +00:00
BloomTest() : policy_(
NewBloomFilterPolicy(FLAGS_bits_per_key)) {}
~BloomTest() {
delete policy_;
}
void Reset() {
keys_.clear();
filter_.clear();
}
void Add(const Slice& s) {
keys_.push_back(s.ToString());
}
void Build() {
std::vector<Slice> key_slices;
for (size_t i = 0; i < keys_.size(); i++) {
key_slices.push_back(Slice(keys_[i]));
}
filter_.clear();
policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()),
&filter_);
keys_.clear();
if (kVerbose >= 2) DumpFilter();
}
size_t FilterSize() const {
return filter_.size();
}
void DumpFilter() {
fprintf(stderr, "F(");
for (size_t i = 0; i+1 < filter_.size(); i++) {
const unsigned int c = static_cast<unsigned int>(filter_[i]);
for (int j = 0; j < 8; j++) {
fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
}
}
fprintf(stderr, ")\n");
}
bool Matches(const Slice& s) {
if (!keys_.empty()) {
Build();
}
return policy_->KeyMayMatch(s, filter_);
}
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
}
};
TEST(BloomTest, EmptyFilter) {
ASSERT_TRUE(! Matches("hello"));
ASSERT_TRUE(! Matches("world"));
}
TEST(BloomTest, Small) {
Add("hello");
Add("world");
ASSERT_TRUE(Matches("hello"));
ASSERT_TRUE(Matches("world"));
ASSERT_TRUE(! Matches("x"));
ASSERT_TRUE(! Matches("foo"));
}
TEST(BloomTest, VaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
for (int length = 1; length <= 10000; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
}
Build();
ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
double rate = FalsePositiveRate();
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
}
ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often
else good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
// Different bits-per-byte
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
2014-09-08 17:37:05 +00:00
class FullBloomTest {
private:
const FilterPolicy* policy_;
std::unique_ptr<FilterBitsBuilder> bits_builder_;
std::unique_ptr<FilterBitsReader> bits_reader_;
std::unique_ptr<const char[]> buf_;
size_t filter_size_;
public:
FullBloomTest() :
policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)),
filter_size_(0) {
Reset();
}
~FullBloomTest() {
delete policy_;
}
void Reset() {
bits_builder_.reset(policy_->GetFilterBitsBuilder());
bits_reader_.reset(nullptr);
buf_.reset(nullptr);
filter_size_ = 0;
}
void Add(const Slice& s) {
bits_builder_->AddKey(s);
}
void Build() {
Slice filter = bits_builder_->Finish(&buf_);
bits_reader_.reset(policy_->GetFilterBitsReader(filter));
filter_size_ = filter.size();
}
size_t FilterSize() const {
return filter_size_;
}
bool Matches(const Slice& s) {
if (bits_reader_ == nullptr) {
Build();
}
return bits_reader_->MayMatch(s);
}
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
}
};
TEST(FullBloomTest, FullEmptyFilter) {
// Empty filter is not match, at this level
ASSERT_TRUE(!Matches("hello"));
ASSERT_TRUE(!Matches("world"));
}
TEST(FullBloomTest, FullSmall) {
Add("hello");
Add("world");
ASSERT_TRUE(Matches("hello"));
ASSERT_TRUE(Matches("world"));
ASSERT_TRUE(!Matches("x"));
ASSERT_TRUE(!Matches("foo"));
}
TEST(FullBloomTest, FullVaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
for (int length = 1; length <= 10000; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
}
Build();
ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length;
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
double rate = FalsePositiveRate();
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
}
ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125)
mediocre_filters++; // Allowed, but not too often
else
good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
} // namespace rocksdb
int main(int argc, char** argv) {
2014-05-09 15:34:18 +00:00
ParseCommandLineFlags(&argc, &argv, true);
return rocksdb::test::RunAllTests();
}
2014-05-09 15:34:18 +00:00
#endif // GFLAGS