mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-02 10:15:54 +00:00
fefd4b98c5
Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queried, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch(). 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency. The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). 
Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
420 lines
14 KiB
C++
420 lines
14 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
#include "table/block_based_filter_block.h"
|
|
#include "table/full_filter_bits_builder.h"
|
|
#include "table/full_filter_block.h"
|
|
#include "util/coding.h"
|
|
#include "util/hash.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
class BlockBasedFilterBlockBuilder;
|
|
class FullFilterBlockBuilder;
|
|
|
|
FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key,
|
|
const size_t num_probes)
|
|
: bits_per_key_(bits_per_key), num_probes_(num_probes) {
|
|
assert(bits_per_key_);
|
|
}
|
|
|
|
FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
|
|
|
|
void FullFilterBitsBuilder::AddKey(const Slice& key) {
|
|
uint32_t hash = BloomHash(key);
|
|
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
|
|
hash_entries_.push_back(hash);
|
|
}
|
|
}
|
|
|
|
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
|
|
uint32_t total_bits, num_lines;
|
|
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
|
|
&total_bits, &num_lines);
|
|
assert(data);
|
|
|
|
if (total_bits != 0 && num_lines != 0) {
|
|
for (auto h : hash_entries_) {
|
|
AddHash(h, data, num_lines, total_bits);
|
|
}
|
|
}
|
|
data[total_bits/8] = static_cast<char>(num_probes_);
|
|
EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines));
|
|
|
|
const char* const_data = data;
|
|
buf->reset(const_data);
|
|
hash_entries_.clear();
|
|
|
|
return Slice(data, total_bits / 8 + 5);
|
|
}
|
|
|
|
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
|
|
uint32_t num_lines =
|
|
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
|
|
|
|
// Make num_lines an odd number to make sure more bits are involved
|
|
// when determining which block.
|
|
if (num_lines % 2 == 0) {
|
|
num_lines++;
|
|
}
|
|
return num_lines * (CACHE_LINE_SIZE * 8);
|
|
}
|
|
|
|
uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
|
|
uint32_t* total_bits,
|
|
uint32_t* num_lines) {
|
|
assert(bits_per_key_);
|
|
if (num_entry != 0) {
|
|
uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_);
|
|
|
|
*total_bits = GetTotalBitsForLocality(total_bits_tmp);
|
|
*num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
|
|
assert(*total_bits > 0 && *total_bits % 8 == 0);
|
|
} else {
|
|
// filter is empty, just leave space for metadata
|
|
*total_bits = 0;
|
|
*num_lines = 0;
|
|
}
|
|
|
|
// Reserve space for Filter
|
|
uint32_t sz = *total_bits / 8;
|
|
sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
|
|
return sz;
|
|
}
|
|
|
|
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
|
|
uint32_t* total_bits,
|
|
uint32_t* num_lines) {
|
|
uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
|
|
char* data = new char[sz];
|
|
memset(data, 0, sz);
|
|
return data;
|
|
}
|
|
|
|
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
|
|
assert(bits_per_key_);
|
|
assert(space > 0);
|
|
uint32_t dont_care1, dont_care2;
|
|
int high = (int) (space * 8 / bits_per_key_ + 1);
|
|
int low = 1;
|
|
int n = high;
|
|
for (; n >= low; n--) {
|
|
uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
|
|
if (sz <= space) {
|
|
break;
|
|
}
|
|
}
|
|
assert(n < high); // High should be an overestimation
|
|
return n;
|
|
}
|
|
|
|
inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
|
|
uint32_t num_lines, uint32_t total_bits) {
|
|
#ifdef NDEBUG
|
|
(void)total_bits;
|
|
#endif
|
|
assert(num_lines > 0 && total_bits > 0);
|
|
|
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
|
uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8);
|
|
|
|
for (uint32_t i = 0; i < num_probes_; ++i) {
|
|
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
|
|
// to a simple operation by compiler.
|
|
const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
|
|
data[bitpos / 8] |= (1 << (bitpos % 8));
|
|
|
|
h += delta;
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
class FullFilterBitsReader : public FilterBitsReader {
|
|
public:
|
|
explicit FullFilterBitsReader(const Slice& contents)
|
|
: data_(const_cast<char*>(contents.data())),
|
|
data_len_(static_cast<uint32_t>(contents.size())),
|
|
num_probes_(0),
|
|
num_lines_(0),
|
|
log2_cache_line_size_(0) {
|
|
assert(data_);
|
|
GetFilterMeta(contents, &num_probes_, &num_lines_);
|
|
// Sanitize broken parameter
|
|
if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) {
|
|
num_lines_ = 0;
|
|
num_probes_ = 0;
|
|
} else if (num_lines_ != 0) {
|
|
while (true) {
|
|
uint32_t num_lines_at_curr_cache_size =
|
|
(data_len_ - 5) >> log2_cache_line_size_;
|
|
if (num_lines_at_curr_cache_size == 0) {
|
|
// The cache line size seems not a power of two. It's not supported
|
|
// and indicates a corruption so disable using this filter.
|
|
assert(false);
|
|
num_lines_ = 0;
|
|
num_probes_ = 0;
|
|
break;
|
|
}
|
|
if (num_lines_at_curr_cache_size == num_lines_) {
|
|
break;
|
|
}
|
|
++log2_cache_line_size_;
|
|
}
|
|
}
|
|
}
|
|
|
|
~FullFilterBitsReader() override {}
|
|
|
|
bool MayMatch(const Slice& entry) override {
|
|
if (data_len_ <= 5) { // remain same with original filter
|
|
return false;
|
|
}
|
|
// Other Error params, including a broken filter, regarded as match
|
|
if (num_probes_ == 0 || num_lines_ == 0) return true;
|
|
uint32_t hash = BloomHash(entry);
|
|
uint32_t bit_offset;
|
|
FilterPrepare(hash, Slice(data_, data_len_), num_lines_, &bit_offset);
|
|
return HashMayMatch(hash, Slice(data_, data_len_), num_probes_, bit_offset);
|
|
}
|
|
|
|
virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
|
|
if (data_len_ <= 5) { // remain same with original filter
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
may_match[i] = false;
|
|
}
|
|
return;
|
|
}
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
may_match[i] = true;
|
|
}
|
|
// Other Error params, including a broken filter, regarded as match
|
|
if (num_probes_ == 0 || num_lines_ == 0) return;
|
|
uint32_t hashes[MultiGetContext::MAX_BATCH_SIZE];
|
|
uint32_t bit_offsets[MultiGetContext::MAX_BATCH_SIZE];
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
hashes[i] = BloomHash(*keys[i]);
|
|
FilterPrepare(hashes[i], Slice(data_, data_len_), num_lines_,
|
|
&bit_offsets[i]);
|
|
}
|
|
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
if (!HashMayMatch(hashes[i], Slice(data_, data_len_), num_probes_,
|
|
bit_offsets[i])) {
|
|
may_match[i] = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
// Filter meta data
|
|
char* data_;
|
|
uint32_t data_len_;
|
|
size_t num_probes_;
|
|
uint32_t num_lines_;
|
|
uint32_t log2_cache_line_size_;
|
|
|
|
// Get num_probes, and num_lines from filter
|
|
// If filter format broken, set both to 0.
|
|
void GetFilterMeta(const Slice& filter, size_t* num_probes,
|
|
uint32_t* num_lines);
|
|
|
|
// "filter" contains the data appended by a preceding call to
|
|
// FilterBitsBuilder::Finish. This method must return true if the key was
|
|
// passed to FilterBitsBuilder::AddKey. This method may return true or false
|
|
// if the key was not on the list, but it should aim to return false with a
|
|
// high probability.
|
|
//
|
|
// hash: target to be checked
|
|
// filter: the whole filter, including meta data bytes
|
|
// num_probes: number of probes, read before hand
|
|
// num_lines: filter metadata, read before hand
|
|
// Before calling this function, need to ensure the input meta data
|
|
// is valid.
|
|
bool HashMayMatch(const uint32_t& hash, const Slice& filter,
|
|
const size_t& num_probes, const uint32_t& bit_offset);
|
|
|
|
void FilterPrepare(const uint32_t& hash, const Slice& filter,
|
|
const uint32_t& num_lines, uint32_t* bit_offset);
|
|
|
|
// No Copy allowed
|
|
FullFilterBitsReader(const FullFilterBitsReader&);
|
|
void operator=(const FullFilterBitsReader&);
|
|
};
|
|
|
|
void FullFilterBitsReader::GetFilterMeta(const Slice& filter,
|
|
size_t* num_probes, uint32_t* num_lines) {
|
|
uint32_t len = static_cast<uint32_t>(filter.size());
|
|
if (len <= 5) {
|
|
// filter is empty or broken
|
|
*num_probes = 0;
|
|
*num_lines = 0;
|
|
return;
|
|
}
|
|
|
|
*num_probes = filter.data()[len - 5];
|
|
*num_lines = DecodeFixed32(filter.data() + len - 4);
|
|
}
|
|
|
|
void FullFilterBitsReader::FilterPrepare(const uint32_t& hash,
|
|
const Slice& filter,
|
|
const uint32_t& num_lines,
|
|
uint32_t* bit_offset) {
|
|
uint32_t len = static_cast<uint32_t>(filter.size());
|
|
if (len <= 5) return; // remain the same with original filter
|
|
|
|
// It is ensured the params are valid before calling it
|
|
assert(num_lines != 0 && (len - 5) % num_lines == 0);
|
|
|
|
uint32_t h = hash;
|
|
// Left shift by an extra 3 to convert bytes to bits
|
|
uint32_t b = (h % num_lines) << (log2_cache_line_size_ + 3);
|
|
PREFETCH(&filter.data()[b / 8], 0 /* rw */, 1 /* locality */);
|
|
PREFETCH(&filter.data()[b / 8 + (1 << log2_cache_line_size_) - 1],
|
|
0 /* rw */, 1 /* locality */);
|
|
*bit_offset = b;
|
|
}
|
|
|
|
bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash,
|
|
const Slice& filter,
|
|
const size_t& num_probes,
|
|
const uint32_t& bit_offset) {
|
|
uint32_t len = static_cast<uint32_t>(filter.size());
|
|
if (len <= 5) return false; // remain the same with original filter
|
|
|
|
// It is ensured the params are valid before calling it
|
|
assert(num_probes != 0);
|
|
const char* data = filter.data();
|
|
|
|
uint32_t h = hash;
|
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
|
|
|
for (uint32_t i = 0; i < num_probes; ++i) {
|
|
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
|
|
// to a simple and operation by compiler.
|
|
const uint32_t bitpos =
|
|
bit_offset + (h & ((1 << (log2_cache_line_size_ + 3)) - 1));
|
|
if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
|
|
return false;
|
|
}
|
|
|
|
h += delta;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// An implementation of filter policy
|
|
class BloomFilterPolicy : public FilterPolicy {
|
|
public:
|
|
explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
|
|
: bits_per_key_(bits_per_key), hash_func_(BloomHash),
|
|
use_block_based_builder_(use_block_based_builder) {
|
|
initialize();
|
|
}
|
|
|
|
~BloomFilterPolicy() override {}
|
|
|
|
const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; }
|
|
|
|
void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
|
|
// Compute bloom filter size (in both bits and bytes)
|
|
size_t bits = n * bits_per_key_;
|
|
|
|
// For small n, we can see a very high false positive rate. Fix it
|
|
// by enforcing a minimum bloom filter length.
|
|
if (bits < 64) bits = 64;
|
|
|
|
size_t bytes = (bits + 7) / 8;
|
|
bits = bytes * 8;
|
|
|
|
const size_t init_size = dst->size();
|
|
dst->resize(init_size + bytes, 0);
|
|
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes
|
|
char* array = &(*dst)[init_size];
|
|
for (size_t i = 0; i < (size_t)n; i++) {
|
|
// Use double-hashing to generate a sequence of hash values.
|
|
// See analysis in [Kirsch,Mitzenmacher 2006].
|
|
uint32_t h = hash_func_(keys[i]);
|
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
|
for (size_t j = 0; j < num_probes_; j++) {
|
|
const uint32_t bitpos = h % bits;
|
|
array[bitpos/8] |= (1 << (bitpos % 8));
|
|
h += delta;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override {
|
|
const size_t len = bloom_filter.size();
|
|
if (len < 2) return false;
|
|
|
|
const char* array = bloom_filter.data();
|
|
const size_t bits = (len - 1) * 8;
|
|
|
|
// Use the encoded k so that we can read filters generated by
|
|
// bloom filters created using different parameters.
|
|
const size_t k = array[len-1];
|
|
if (k > 30) {
|
|
// Reserved for potentially new encodings for short bloom filters.
|
|
// Consider it a match.
|
|
return true;
|
|
}
|
|
|
|
uint32_t h = hash_func_(key);
|
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
|
for (size_t j = 0; j < k; j++) {
|
|
const uint32_t bitpos = h % bits;
|
|
if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
|
|
h += delta;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
FilterBitsBuilder* GetFilterBitsBuilder() const override {
|
|
if (use_block_based_builder_) {
|
|
return nullptr;
|
|
}
|
|
|
|
return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
|
|
}
|
|
|
|
FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
|
|
return new FullFilterBitsReader(contents);
|
|
}
|
|
|
|
// If choose to use block based builder
|
|
bool UseBlockBasedBuilder() { return use_block_based_builder_; }
|
|
|
|
private:
|
|
size_t bits_per_key_;
|
|
size_t num_probes_;
|
|
uint32_t (*hash_func_)(const Slice& key);
|
|
|
|
const bool use_block_based_builder_;
|
|
|
|
void initialize() {
|
|
// We intentionally round down to reduce probing cost a little bit
|
|
num_probes_ = static_cast<size_t>(bits_per_key_ * 0.69); // 0.69 =~ ln(2)
|
|
if (num_probes_ < 1) num_probes_ = 1;
|
|
if (num_probes_ > 30) num_probes_ = 30;
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
|
|
bool use_block_based_builder) {
|
|
return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
|
|
}
|
|
|
|
} // namespace rocksdb
|