Cut filter partition based on metadata_block_size

Summary:
Currently metadata_block_size controls only index partition size. With this patch a partition is cut after any of index or filter partitions reaches metadata_block_size.
Closes https://github.com/facebook/rocksdb/pull/2452

Differential Revision: D5275651

Pulled By: maysamyabandeh

fbshipit-source-id: 5057e4424b4c8902043782e6bf8c38f0c4f25160
This commit is contained in:
Maysam Yabandeh 2017-07-02 10:36:10 -07:00 committed by Facebook Github Bot
parent f4ae1bab02
commit 45b9bb0331
10 changed files with 209 additions and 59 deletions

View file

@ -20,8 +20,10 @@
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
namespace rocksdb {
@ -41,6 +43,16 @@ class FilterBitsBuilder {
// The return value of this function would be the filter bits,
// The ownership of actual data is set to buf
virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
// Calculate num of entries fit into a space.
virtual int CalculateNumEntry(const uint32_t space) {
#ifndef ROCKSDB_LITE
throw std::runtime_error("CalculateNumEntry not Implemented");
#else
abort();
#endif
return 0;
}
};
// A class that checks if a key can be in filter

View file

@ -76,10 +76,18 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
} else {
if (table_opt.partition_filters) {
assert(p_index_builder != nullptr);
// Since after partition cut request from filter builder it takes time
// until index builder actully cuts the partition, we take the lower bound
// as partition size.
assert(table_opt.block_size_deviation <= 100);
auto partition_size =
(const uint32_t)(table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation));
partition_size = std::max(partition_size, (const uint32_t)1);
return new PartitionedFilterBlockBuilder(
opt.prefix_extractor, table_opt.whole_key_filtering,
filter_bits_builder, table_opt.index_block_restart_interval,
p_index_builder);
p_index_builder, partition_size);
} else {
return new FullFilterBlockBuilder(opt.prefix_extractor,
table_opt.whole_key_filtering,

View file

@ -0,0 +1,73 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "rocksdb/filter_policy.h"
namespace rocksdb {
class Slice;
class FullFilterBitsBuilder : public FilterBitsBuilder {
public:
explicit FullFilterBitsBuilder(const size_t bits_per_key,
const size_t num_probes);
~FullFilterBitsBuilder();
virtual void AddKey(const Slice& key) override;
// Create a filter that for hashes [0, n-1], the filter is allocated here
// When creating filter, it is ensured that
// total_bits = num_lines * CACHE_LINE_SIZE * 8
// dst len is >= 5, 1 for num_probes, 4 for num_lines
// Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
// +----------------------------------------------------------------+
// | filter data with length total_bits/8 |
// +----------------------------------------------------------------+
// | |
// | ... |
// | |
// +----------------------------------------------------------------+
// | ... | num_probes : 1 byte | num_lines : 4 bytes |
// +----------------------------------------------------------------+
virtual Slice Finish(std::unique_ptr<const char[]>* buf) override;
// Calculate num of entries fit into a space.
virtual int CalculateNumEntry(const uint32_t space) override;
// Calculate space for new filter. This is reverse of CalculateNumEntry.
uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
private:
size_t bits_per_key_;
size_t num_probes_;
std::vector<uint32_t> hash_entries_;
// Get totalbits that optimized for cpu cache line
uint32_t GetTotalBitsForLocality(uint32_t total_bits);
// Reserve space for new filter
char* ReserveSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
// Assuming single threaded access to this function.
void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
// No Copy allowed
FullFilterBitsBuilder(const FullFilterBitsBuilder&);
void operator=(const FullFilterBitsBuilder&);
};
} // namespace rocksdb

View file

@ -77,6 +77,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_));
partition_cut_requested_ = false;
}
void PartitionedIndexBuilder::RequestPartitionCut() {
partition_cut_requested_ = true;
}
void PartitionedIndexBuilder::AddIndexEntry(
@ -102,6 +107,7 @@ void PartitionedIndexBuilder::AddIndexEntry(
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
bool do_flush =
partition_cut_requested_ ||
flush_policy_->Update(*last_key_in_current_block, handle_encoding);
if (do_flush) {
entries_.push_back(

View file

@ -314,6 +314,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
std::string& GetPartitionKey() { return sub_index_last_key_; }
// Called when an external entity (such as filter partition builder) request
// cutting the next partition
void RequestPartitionCut();
private:
void MakeNewSubIndexBuilder();
@ -331,6 +335,9 @@ class PartitionedIndexBuilder : public IndexBuilder {
// true if Finish is called once but not complete yet.
bool finishing_indexes = false;
const BlockBasedTableOptions& table_opt_;
// true if an external entity (such as filter partition builder) request
// cutting the next partition
bool partition_cut_requested_ = true;
// true if it should cut the next filter partition block
bool cut_filter_block = false;
};

View file

@ -20,15 +20,25 @@ namespace rocksdb {
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
PartitionedIndexBuilder* const p_index_builder)
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size)
: FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
filter_bits_builder),
index_on_filter_block_builder_(index_block_restart_interval),
p_index_builder_(p_index_builder) {}
p_index_builder_(p_index_builder) {
filters_per_partition_ =
filter_bits_builder_->CalculateNumEntry(partition_size);
}
PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
// Use == to send the request only once
if (filters_in_partition_ == filters_per_partition_) {
// Currently only index builder is in charge of cutting a partition. We keep
// requesting until it is granted.
p_index_builder_->RequestPartitionCut();
}
if (!p_index_builder_->ShouldCutFilterBlock()) {
return;
}
@ -36,11 +46,13 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
std::string& index_key = p_index_builder_->GetPartitionKey();
filters.push_back({index_key, filter});
filters_in_partition_ = 0;
}
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
MaybeCutAFilterBlock();
filter_bits_builder_->AddKey(key);
filters_in_partition_++;
}
Slice PartitionedFilterBlockBuilder::Finish(

View file

@ -28,7 +28,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
explicit PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
PartitionedIndexBuilder* const p_index_builder);
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size);
virtual ~PartitionedFilterBlockBuilder();
@ -51,7 +52,15 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
false; // true if Finish is called once but not complete yet.
// The policy of when cut a filter block and Finish it
void MaybeCutAFilterBlock();
// Currently we keep the same number of partitions for filters and indexes.
// This would allow for some potentioal optimizations in future. If such
// optimizations did not realize we can use different number of partitions and
// eliminate p_index_builder_
PartitionedIndexBuilder* const p_index_builder_;
// The desired number of filters per partition
uint32_t filters_per_partition_;
// The current number of filters in the last partition
uint32_t filters_in_partition_;
};
class PartitionedFilterBlockReader : public FilterBlockReader {

View file

@ -9,6 +9,7 @@
#include "rocksdb/filter_policy.h"
#include "table/full_filter_bits_builder.h"
#include "table/index_builder.h"
#include "table/partitioned_filter_block.h"
#include "util/coding.h"
@ -64,6 +65,16 @@ class PartitionedFilterBlockTest : public testing::Test {
return max_index_size;
}
uint64_t MaxFilterSize() {
int num_keys = sizeof(keys) / sizeof(*keys);
auto filter_bits_reader = dynamic_cast<rocksdb::FullFilterBitsBuilder*>(
table_options_.filter_policy->GetFilterBitsBuilder());
uint32_t dont_care1, dont_care2;
auto partition_size =
filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2);
return partition_size + table_options_.block_size_deviation;
}
int last_offset = 10;
BlockHandle Write(const Slice& slice) {
BlockHandle bh(last_offset + 1, slice.size());
@ -78,10 +89,17 @@ class PartitionedFilterBlockTest : public testing::Test {
PartitionedFilterBlockBuilder* NewBuilder(
PartitionedIndexBuilder* const p_index_builder) {
uint32_t partition_size =
table_options_.metadata_block_size >
(uint64_t)table_options_.block_size_deviation
? table_options_.metadata_block_size -
table_options_.block_size_deviation
: 1;
return new PartitionedFilterBlockBuilder(
nullptr, table_options_.whole_key_filtering,
table_options_.filter_policy->GetFilterBitsBuilder(),
table_options_.index_block_restart_interval, p_index_builder);
table_options_.index_block_restart_interval, p_index_builder,
partition_size);
}
std::unique_ptr<MockedBlockBasedTable> table;
@ -261,7 +279,8 @@ TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) {
TEST_F(PartitionedFilterBlockTest, PartitionCount) {
int num_keys = sizeof(keys) / sizeof(*keys);
table_options_.metadata_block_size = MaxIndexSize();
table_options_.metadata_block_size =
std::max(MaxIndexSize(), MaxFilterSize());
int partitions = TestBlockPerKey();
ASSERT_EQ(partitions, 1);
// A low number ensures cutting a block after each key

View file

@ -13,49 +13,32 @@
#include "rocksdb/slice.h"
#include "table/block_based_filter_block.h"
#include "table/full_filter_bits_builder.h"
#include "table/full_filter_block.h"
#include "util/hash.h"
#include "util/coding.h"
#include "util/hash.h"
namespace rocksdb {
class BlockBasedFilterBlockBuilder;
class FullFilterBlockBuilder;
namespace {
class FullFilterBitsBuilder : public FilterBitsBuilder {
public:
explicit FullFilterBitsBuilder(const size_t bits_per_key,
const size_t num_probes)
: bits_per_key_(bits_per_key),
num_probes_(num_probes) {
assert(bits_per_key_);
FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key,
const size_t num_probes)
: bits_per_key_(bits_per_key), num_probes_(num_probes) {
assert(bits_per_key_);
}
~FullFilterBitsBuilder() {}
FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
virtual void AddKey(const Slice& key) override {
void FullFilterBitsBuilder::AddKey(const Slice& key) {
uint32_t hash = BloomHash(key);
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
hash_entries_.push_back(hash);
}
}
// Create a filter that for hashes [0, n-1], the filter is allocated here
// When creating filter, it is ensured that
// total_bits = num_lines * CACHE_LINE_SIZE * 8
// dst len is >= 5, 1 for num_probes, 4 for num_lines
// Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
// +----------------------------------------------------------------+
// | filter data with length total_bits/8 |
// +----------------------------------------------------------------+
// | |
// | ... |
// | |
// +----------------------------------------------------------------+
// | ... | num_probes : 1 byte | num_lines : 4 bytes |
// +----------------------------------------------------------------+
virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
uint32_t total_bits, num_lines;
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
&total_bits, &num_lines);
@ -76,27 +59,6 @@ class FullFilterBitsBuilder : public FilterBitsBuilder {
return Slice(data, total_bits / 8 + 5);
}
private:
size_t bits_per_key_;
size_t num_probes_;
std::vector<uint32_t> hash_entries_;
// Get totalbits that optimized for cpu cache line
uint32_t GetTotalBitsForLocality(uint32_t total_bits);
// Reserve space for new filter
char* ReserveSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
// Assuming single threaded access to this function.
void AddHash(uint32_t h, char* data, uint32_t num_lines,
uint32_t total_bits);
// No Copy allowed
FullFilterBitsBuilder(const FullFilterBitsBuilder&);
void operator=(const FullFilterBitsBuilder&);
};
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
uint32_t num_lines =
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
@ -109,10 +71,10 @@ uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
return num_lines * (CACHE_LINE_SIZE * 8);
}
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
uint32_t* total_bits, uint32_t* num_lines) {
uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
uint32_t* total_bits,
uint32_t* num_lines) {
assert(bits_per_key_);
char* data = nullptr;
if (num_entry != 0) {
uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_);
@ -128,12 +90,35 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
// Reserve space for Filter
uint32_t sz = *total_bits / 8;
sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
return sz;
}
data = new char[sz];
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
uint32_t* total_bits,
uint32_t* num_lines) {
uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
char* data = new char[sz];
memset(data, 0, sz);
return data;
}
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
assert(bits_per_key_);
assert(space > 0);
uint32_t dont_care1, dont_care2;
int high = (int) (space * 8 / bits_per_key_ + 1);
int low = 1;
int n = high;
for (; n >= low; n--) {
uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
if (sz <= space) {
break;
}
}
assert(n < high); // High should be an overestimation
return n;
}
inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
uint32_t num_lines, uint32_t total_bits) {
assert(num_lines > 0 && total_bits > 0);
@ -151,6 +136,7 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
}
}
namespace {
class FullFilterBitsReader : public FilterBitsReader {
public:
explicit FullFilterBitsReader(const Slice& contents)

View file

@ -21,10 +21,11 @@ int main() {
#include <vector>
#include "rocksdb/filter_policy.h"
#include "table/full_filter_bits_builder.h"
#include "util/arena.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "util/arena.h"
using GFLAGS::ParseCommandLineFlags;
@ -197,6 +198,10 @@ class FullBloomTest : public testing::Test {
delete policy_;
}
FullFilterBitsBuilder* GetFullFilterBitsBuilder() {
return dynamic_cast<FullFilterBitsBuilder*>(bits_builder_.get());
}
void Reset() {
bits_builder_.reset(policy_->GetFilterBitsBuilder());
bits_reader_.reset(nullptr);
@ -237,6 +242,19 @@ class FullBloomTest : public testing::Test {
}
};
TEST_F(FullBloomTest, FilterSize) {
uint32_t dont_care1, dont_care2;
auto full_bits_builder = GetFullFilterBitsBuilder();
for (int n = 1; n < 100; n++) {
auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2);
auto n2 = full_bits_builder->CalculateNumEntry(space);
ASSERT_GE(n2, n);
auto space2 =
full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2);
ASSERT_EQ(space, space2);
}
}
TEST_F(FullBloomTest, FullEmptyFilter) {
// Empty filter is not match, at this level
ASSERT_TRUE(!Matches("hello"));