mirror of https://github.com/facebook/rocksdb.git
Prefix filters for scans (v4)
Summary: Similar to v2 (db and table code understands prefixes), but use ReadOptions as in v3. Also, make the CreateFilter code faster and cleaner. Test Plan: make db_test; export LEVELDB_TESTS=PrefixScan; ./db_test Reviewers: dhruba Reviewed By: dhruba CC: haobo, emayanke Differential Revision: https://reviews.facebook.net/D12027
This commit is contained in:
parent
3b81df34bd
commit
f5f1842282
|
@ -23,6 +23,7 @@
|
||||||
#include "db/memtable.h"
|
#include "db/memtable.h"
|
||||||
#include "db/memtablelist.h"
|
#include "db/memtablelist.h"
|
||||||
#include "db/merge_helper.h"
|
#include "db/merge_helper.h"
|
||||||
|
#include "db/prefix_filter_iterator.h"
|
||||||
#include "db/table_cache.h"
|
#include "db/table_cache.h"
|
||||||
#include "db/version_set.h"
|
#include "db/version_set.h"
|
||||||
#include "db/write_batch_internal.h"
|
#include "db/write_batch_internal.h"
|
||||||
|
@ -2339,12 +2340,19 @@ bool DBImpl::KeyMayExist(const ReadOptions& options,
|
||||||
|
|
||||||
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
|
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
|
||||||
SequenceNumber latest_snapshot;
|
SequenceNumber latest_snapshot;
|
||||||
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
Iterator* iter = NewInternalIterator(options, &latest_snapshot);
|
||||||
return NewDBIterator(
|
iter = NewDBIterator(
|
||||||
&dbname_, env_, options_, user_comparator(), internal_iter,
|
&dbname_, env_, options_, user_comparator(), iter,
|
||||||
(options.snapshot != nullptr
|
(options.snapshot != nullptr
|
||||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||||
: latest_snapshot));
|
: latest_snapshot));
|
||||||
|
if (options.prefix) {
|
||||||
|
// use extra wrapper to exclude any keys from the results which
|
||||||
|
// don't begin with the prefix
|
||||||
|
iter = new PrefixFilterIterator(iter, *options.prefix,
|
||||||
|
options_.prefix_extractor);
|
||||||
|
}
|
||||||
|
return iter;
|
||||||
}
|
}
|
||||||
|
|
||||||
const Snapshot* DBImpl::GetSnapshot() {
|
const Snapshot* DBImpl::GetSnapshot() {
|
||||||
|
|
129
db/db_test.cc
129
db/db_test.cc
|
@ -3640,6 +3640,135 @@ TEST(DBTest, MultiGetEmpty) {
|
||||||
} while (ChangeCompactOptions());
|
} while (ChangeCompactOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PrefixScanInit(DBTest *dbtest) {
|
||||||
|
char buf[100];
|
||||||
|
std::string keystr;
|
||||||
|
const int small_range_sstfiles = 5;
|
||||||
|
const int big_range_sstfiles = 5;
|
||||||
|
|
||||||
|
// Generate 11 sst files with the following prefix ranges.
|
||||||
|
// GROUP 0: [0,10] (level 1)
|
||||||
|
// GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0)
|
||||||
|
// GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0)
|
||||||
|
//
|
||||||
|
// A seek with the previous API would do 11 random I/Os (to all the
|
||||||
|
// files). With the new API and a prefix filter enabled, we should
|
||||||
|
// only do 2 random I/O, to the 2 files containing the key.
|
||||||
|
|
||||||
|
// GROUP 0
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:start", 0);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:end", 10);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
dbtest->dbfull()->TEST_CompactMemTable();
|
||||||
|
dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1
|
||||||
|
|
||||||
|
// GROUP 1
|
||||||
|
for (int i = 1; i <= small_range_sstfiles; i++) {
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:start", i);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:end", i+1);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
dbtest->dbfull()->TEST_CompactMemTable();
|
||||||
|
}
|
||||||
|
|
||||||
|
// GROUP 2
|
||||||
|
for (int i = 1; i <= big_range_sstfiles; i++) {
|
||||||
|
std::string keystr;
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:start", 0);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
snprintf(buf, sizeof(buf), "%02d______:end",
|
||||||
|
small_range_sstfiles+i+1);
|
||||||
|
keystr = std::string(buf);
|
||||||
|
ASSERT_OK(dbtest->Put(keystr, keystr));
|
||||||
|
dbtest->dbfull()->TEST_CompactMemTable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(DBTest, PrefixScan) {
|
||||||
|
ReadOptions ro = ReadOptions();
|
||||||
|
int count;
|
||||||
|
Slice prefix;
|
||||||
|
Slice key;
|
||||||
|
char buf[100];
|
||||||
|
Iterator* iter;
|
||||||
|
snprintf(buf, sizeof(buf), "03______:");
|
||||||
|
prefix = Slice(buf, 8);
|
||||||
|
key = Slice(buf, 9);
|
||||||
|
|
||||||
|
// db configs
|
||||||
|
env_->count_random_reads_ = true;
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
options.env = env_;
|
||||||
|
options.block_cache = NewLRUCache(0); // Prevent cache hits
|
||||||
|
options.filter_policy = NewBloomFilterPolicy(10);
|
||||||
|
options.prefix_extractor = NewFixedPrefixTransform(8);
|
||||||
|
options.whole_key_filtering = false;
|
||||||
|
options.disable_auto_compactions = true;
|
||||||
|
options.max_background_compactions = 2;
|
||||||
|
options.create_if_missing = true;
|
||||||
|
options.disable_seek_compaction = true;
|
||||||
|
|
||||||
|
// prefix specified, with blooms: 2 RAND I/Os
|
||||||
|
// SeekToFirst
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
PrefixScanInit(this);
|
||||||
|
count = 0;
|
||||||
|
env_->random_read_counter_.Reset();
|
||||||
|
ro.prefix = &prefix;
|
||||||
|
iter = db_->NewIterator(ro);
|
||||||
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
|
assert(iter->key().starts_with(prefix));
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(iter->status().ok());
|
||||||
|
delete iter;
|
||||||
|
ASSERT_EQ(count, 2);
|
||||||
|
ASSERT_EQ(env_->random_read_counter_.Read(), 2);
|
||||||
|
|
||||||
|
// prefix specified, with blooms: 2 RAND I/Os
|
||||||
|
// Seek
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
PrefixScanInit(this);
|
||||||
|
count = 0;
|
||||||
|
env_->random_read_counter_.Reset();
|
||||||
|
ro.prefix = &prefix;
|
||||||
|
iter = db_->NewIterator(ro);
|
||||||
|
for (iter->Seek(key); iter->Valid(); iter->Next()) {
|
||||||
|
assert(iter->key().starts_with(prefix));
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(iter->status().ok());
|
||||||
|
delete iter;
|
||||||
|
ASSERT_EQ(count, 2);
|
||||||
|
ASSERT_EQ(env_->random_read_counter_.Read(), 2);
|
||||||
|
|
||||||
|
// no prefix specified: 11 RAND I/Os
|
||||||
|
DestroyAndReopen(&options);
|
||||||
|
PrefixScanInit(this);
|
||||||
|
count = 0;
|
||||||
|
env_->random_read_counter_.Reset();
|
||||||
|
iter = db_->NewIterator(ReadOptions());
|
||||||
|
for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
|
||||||
|
if (! iter->key().starts_with(prefix)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(iter->status().ok());
|
||||||
|
delete iter;
|
||||||
|
ASSERT_EQ(count, 2);
|
||||||
|
ASSERT_EQ(env_->random_read_counter_.Read(), 11);
|
||||||
|
Close();
|
||||||
|
delete options.filter_policy;
|
||||||
|
delete options.prefix_extractor;
|
||||||
|
}
|
||||||
|
|
||||||
std::string MakeKey(unsigned int num) {
|
std::string MakeKey(unsigned int num) {
|
||||||
char buf[30];
|
char buf[30];
|
||||||
snprintf(buf, sizeof(buf), "%016u", num);
|
snprintf(buf, sizeof(buf), "%016u", num);
|
||||||
|
|
|
@ -0,0 +1,76 @@
|
||||||
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
//
|
||||||
|
// Wrap an underlying iterator, but exclude any results not starting
|
||||||
|
// with a given prefix. Seeking to keys not beginning with the prefix
|
||||||
|
// is invalid, and SeekToLast is not implemented (that would be
|
||||||
|
// non-trivial), but otherwise this iterator will behave just like the
|
||||||
|
// underlying iterator would if there happened to be no non-matching
|
||||||
|
// keys in the dataset.
|
||||||
|
|
||||||
|
#ifndef STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_
|
||||||
|
#define STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_
|
||||||
|
|
||||||
|
#include "leveldb/iterator.h"
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
class PrefixFilterIterator : public Iterator {
|
||||||
|
private:
|
||||||
|
Iterator* iter_;
|
||||||
|
const Slice &prefix_;
|
||||||
|
const SliceTransform *prefix_extractor_;
|
||||||
|
Status status_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
PrefixFilterIterator(Iterator* iter,
|
||||||
|
const Slice &prefix,
|
||||||
|
const SliceTransform* prefix_extractor)
|
||||||
|
: iter_(iter), prefix_(prefix),
|
||||||
|
prefix_extractor_(prefix_extractor),
|
||||||
|
status_(Status::OK()) {
|
||||||
|
if (prefix_extractor == nullptr) {
|
||||||
|
status_ = Status::InvalidArgument("A prefix filter may not be used "
|
||||||
|
"unless a function is also defined "
|
||||||
|
"for extracting prefixes");
|
||||||
|
} else if (!prefix_extractor_->InRange(prefix)) {
|
||||||
|
status_ = Status::InvalidArgument("Must provide a slice for prefix which"
|
||||||
|
"is a prefix for some key");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
~PrefixFilterIterator() {
|
||||||
|
delete iter_;
|
||||||
|
}
|
||||||
|
Slice key() const { return iter_->key(); }
|
||||||
|
Slice value() const { return iter_->value(); }
|
||||||
|
Status status() const {
|
||||||
|
if (!status_.ok()) {
|
||||||
|
return status_;
|
||||||
|
}
|
||||||
|
return iter_->status();
|
||||||
|
}
|
||||||
|
void Next() { iter_->Next(); }
|
||||||
|
void Prev() { iter_->Prev(); }
|
||||||
|
void Seek(const Slice& k) {
|
||||||
|
if (prefix_extractor_->Transform(k) == prefix_) {
|
||||||
|
iter_->Seek(k);
|
||||||
|
} else {
|
||||||
|
status_ = Status::InvalidArgument("Seek must begin with target prefix");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void SeekToFirst() {
|
||||||
|
Seek(prefix_);
|
||||||
|
}
|
||||||
|
void SeekToLast() {
|
||||||
|
status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
|
||||||
|
}
|
||||||
|
bool Valid() const {
|
||||||
|
return (status_.ok() && iter_->Valid() &&
|
||||||
|
prefix_extractor_->Transform(iter_->key()) == prefix_);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
||||||
|
#endif
|
|
@ -15,6 +15,8 @@
|
||||||
#include "leveldb/universal_compaction.h"
|
#include "leveldb/universal_compaction.h"
|
||||||
#include "leveldb/memtablerep.h"
|
#include "leveldb/memtablerep.h"
|
||||||
|
|
||||||
|
#include "leveldb/slice_transform.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
class Cache;
|
class Cache;
|
||||||
|
@ -224,6 +226,28 @@ struct Options {
|
||||||
// Default: nullptr
|
// Default: nullptr
|
||||||
const FilterPolicy* filter_policy;
|
const FilterPolicy* filter_policy;
|
||||||
|
|
||||||
|
// If non-nullptr, use the specified function to determine the
|
||||||
|
// prefixes for keys. These prefixes will be placed in the filter.
|
||||||
|
// Depending on the workload, this can reduce the number of read-IOP
|
||||||
|
// cost for scans when a prefix is passed via ReadOptions to
|
||||||
|
// db.NewIterator(). For prefix filtering to work properly,
|
||||||
|
// "prefix_extractor" and "comparator" must be such that the following
|
||||||
|
// properties hold:
|
||||||
|
//
|
||||||
|
// 1) key.starts_with(prefix(key))
|
||||||
|
// 2) Compare(prefix(key), key) <= 0.
|
||||||
|
// 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
|
||||||
|
// 4) prefix(prefix(key)) == prefix(key)
|
||||||
|
//
|
||||||
|
// Default: nullptr
|
||||||
|
const SliceTransform* prefix_extractor;
|
||||||
|
|
||||||
|
// If true, place whole keys in the filter (not just prefixes).
|
||||||
|
// This must generally be true for gets to be efficient.
|
||||||
|
//
|
||||||
|
// Default: true
|
||||||
|
bool whole_key_filtering;
|
||||||
|
|
||||||
// Number of levels for this database
|
// Number of levels for this database
|
||||||
int num_levels;
|
int num_levels;
|
||||||
|
|
||||||
|
@ -538,14 +562,28 @@ struct ReadOptions {
|
||||||
// Default: nullptr
|
// Default: nullptr
|
||||||
const Snapshot* snapshot;
|
const Snapshot* snapshot;
|
||||||
|
|
||||||
|
// If "prefix" is non-nullptr, and ReadOptions is being passed to
|
||||||
|
// db.NewIterator, only return results when the key begins with this
|
||||||
|
// prefix. This field is ignored by other calls (e.g., Get).
|
||||||
|
// Options.prefix_extractor must also be set, and
|
||||||
|
// prefix_extractor.InRange(prefix) must be true. The iterator
|
||||||
|
// returned by NewIterator when this option is set will behave just
|
||||||
|
// as if the underlying store did not contain any non-matching keys,
|
||||||
|
// with two exceptions. Seek() only accepts keys starting with the
|
||||||
|
// prefix, and SeekToLast() is not supported. prefix filter with this
|
||||||
|
// option will sometimes reduce the number of read IOPs.
|
||||||
|
// Default: nullptr
|
||||||
|
const Slice* prefix;
|
||||||
|
|
||||||
ReadOptions()
|
ReadOptions()
|
||||||
: verify_checksums(false),
|
: verify_checksums(false),
|
||||||
fill_cache(true),
|
fill_cache(true),
|
||||||
snapshot(nullptr) {
|
snapshot(nullptr),
|
||||||
|
prefix(nullptr) {
|
||||||
}
|
}
|
||||||
ReadOptions(bool cksum, bool cache) :
|
ReadOptions(bool cksum, bool cache) :
|
||||||
verify_checksums(cksum), fill_cache(cache),
|
verify_checksums(cksum), fill_cache(cache),
|
||||||
snapshot(nullptr) {
|
snapshot(nullptr), prefix(nullptr) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
//
|
||||||
|
// Class for specifying user-defined functions which perform a
|
||||||
|
// transformation on a slice. It is not required that every slice
|
||||||
|
// belong to the domain and/or range of a function. Subclasses should
|
||||||
|
// define InDomain and InRange to determine which slices are in either
|
||||||
|
// of these sets respectively.
|
||||||
|
|
||||||
|
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_
|
||||||
|
#define STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
class Slice;
|
||||||
|
|
||||||
|
class SliceTransform {
|
||||||
|
public:
|
||||||
|
virtual ~SliceTransform() {};
|
||||||
|
|
||||||
|
// Return the name of this transformation.
|
||||||
|
virtual const char* Name() const = 0;
|
||||||
|
|
||||||
|
// transform a src in domain to a dst in the range
|
||||||
|
virtual Slice Transform(const Slice& src) const = 0;
|
||||||
|
|
||||||
|
// determine whether this is a valid src upon the function applies
|
||||||
|
virtual bool InDomain(const Slice& src) const = 0;
|
||||||
|
|
||||||
|
// determine whether dst=Transform(src) for some src
|
||||||
|
virtual bool InRange(const Slice& dst) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_
|
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
#include "table/filter_block.h"
|
#include "table/filter_block.h"
|
||||||
|
|
||||||
|
#include "db/dbformat.h"
|
||||||
#include "leveldb/filter_policy.h"
|
#include "leveldb/filter_policy.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
|
||||||
|
@ -15,9 +16,11 @@ namespace leveldb {
|
||||||
static const size_t kFilterBaseLg = 11;
|
static const size_t kFilterBaseLg = 11;
|
||||||
static const size_t kFilterBase = 1 << kFilterBaseLg;
|
static const size_t kFilterBase = 1 << kFilterBaseLg;
|
||||||
|
|
||||||
FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy)
|
FilterBlockBuilder::FilterBlockBuilder(const Options& opt)
|
||||||
: policy_(policy) {
|
: policy_(opt.filter_policy),
|
||||||
}
|
prefix_extractor_(opt.prefix_extractor),
|
||||||
|
whole_key_filtering_(opt.whole_key_filtering),
|
||||||
|
comparator_(opt.comparator){}
|
||||||
|
|
||||||
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
|
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
|
||||||
uint64_t filter_index = (block_offset / kFilterBase);
|
uint64_t filter_index = (block_offset / kFilterBase);
|
||||||
|
@ -27,10 +30,47 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool FilterBlockBuilder::SamePrefix(const Slice &key1,
|
||||||
|
const Slice &key2) const {
|
||||||
|
if (!prefix_extractor_->InDomain(key1) &&
|
||||||
|
!prefix_extractor_->InDomain(key2)) {
|
||||||
|
return true;
|
||||||
|
} else if (!prefix_extractor_->InDomain(key1) ||
|
||||||
|
!prefix_extractor_->InDomain(key2)) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return (prefix_extractor_->Transform(key1) ==
|
||||||
|
prefix_extractor_->Transform(key2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void FilterBlockBuilder::AddKey(const Slice& key) {
|
void FilterBlockBuilder::AddKey(const Slice& key) {
|
||||||
Slice k = key;
|
Slice prev;
|
||||||
start_.push_back(keys_.size());
|
if (start_.size() > 0) {
|
||||||
keys_.append(k.data(), k.size());
|
size_t prev_start = start_[start_.size() - 1];
|
||||||
|
const char* base = entries_.data() + prev_start;
|
||||||
|
size_t length = entries_.size() - prev_start;
|
||||||
|
prev = Slice(base, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (whole_key_filtering_) {
|
||||||
|
start_.push_back(entries_.size());
|
||||||
|
entries_.append(key.data(), key.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
|
||||||
|
// this assumes prefix(prefix(key)) == prefix(key), as the last
|
||||||
|
// entry in entries_ may be either a key or prefix, and we use
|
||||||
|
// prefix(last entry) to get the prefix of the last key.
|
||||||
|
if (prev.size() == 0 || ! SamePrefix(key, prev)) {
|
||||||
|
Slice prefix = prefix_extractor_->Transform(key);
|
||||||
|
assert(comparator_->Compare(prefix, key) <= 0);
|
||||||
|
InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
|
||||||
|
Slice internal_prefix = internal_prefix_tmp.Encode();
|
||||||
|
start_.push_back(entries_.size());
|
||||||
|
entries_.append(internal_prefix.data(), internal_prefix.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Slice FilterBlockBuilder::Finish() {
|
Slice FilterBlockBuilder::Finish() {
|
||||||
|
@ -50,34 +90,35 @@ Slice FilterBlockBuilder::Finish() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void FilterBlockBuilder::GenerateFilter() {
|
void FilterBlockBuilder::GenerateFilter() {
|
||||||
const size_t num_keys = start_.size();
|
const size_t num_entries = start_.size();
|
||||||
if (num_keys == 0) {
|
if (num_entries == 0) {
|
||||||
// Fast path if there are no keys for this filter
|
// Fast path if there are no keys for this filter
|
||||||
filter_offsets_.push_back(result_.size());
|
filter_offsets_.push_back(result_.size());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make list of keys from flattened key structure
|
// Make list of keys from flattened key structure
|
||||||
start_.push_back(keys_.size()); // Simplify length computation
|
start_.push_back(entries_.size()); // Simplify length computation
|
||||||
tmp_keys_.resize(num_keys);
|
tmp_entries_.resize(num_entries);
|
||||||
for (size_t i = 0; i < num_keys; i++) {
|
for (size_t i = 0; i < num_entries; i++) {
|
||||||
const char* base = keys_.data() + start_[i];
|
const char* base = entries_.data() + start_[i];
|
||||||
size_t length = start_[i+1] - start_[i];
|
size_t length = start_[i+1] - start_[i];
|
||||||
tmp_keys_[i] = Slice(base, length);
|
tmp_entries_[i] = Slice(base, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate filter for current set of keys and append to result_.
|
// Generate filter for current set of keys and append to result_.
|
||||||
filter_offsets_.push_back(result_.size());
|
filter_offsets_.push_back(result_.size());
|
||||||
policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
|
policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_);
|
||||||
|
|
||||||
tmp_keys_.clear();
|
tmp_entries_.clear();
|
||||||
keys_.clear();
|
entries_.clear();
|
||||||
start_.clear();
|
start_.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
FilterBlockReader::FilterBlockReader(const FilterPolicy* policy,
|
FilterBlockReader::FilterBlockReader(const Options& opt, const Slice& contents)
|
||||||
const Slice& contents)
|
: policy_(opt.filter_policy),
|
||||||
: policy_(policy),
|
prefix_extractor_(opt.prefix_extractor),
|
||||||
|
whole_key_filtering_(opt.whole_key_filtering),
|
||||||
data_(nullptr),
|
data_(nullptr),
|
||||||
offset_(nullptr),
|
offset_(nullptr),
|
||||||
num_(0),
|
num_(0),
|
||||||
|
@ -92,16 +133,32 @@ FilterBlockReader::FilterBlockReader(const FilterPolicy* policy,
|
||||||
num_ = (n - 5 - last_word) / 4;
|
num_ = (n - 5 - last_word) / 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
|
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
|
||||||
|
const Slice& key) {
|
||||||
|
if (!whole_key_filtering_) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return MayMatch(block_offset, key);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
|
||||||
|
const Slice& prefix) {
|
||||||
|
if (!prefix_extractor_) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return MayMatch(block_offset, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
|
||||||
uint64_t index = block_offset >> base_lg_;
|
uint64_t index = block_offset >> base_lg_;
|
||||||
if (index < num_) {
|
if (index < num_) {
|
||||||
uint32_t start = DecodeFixed32(offset_ + index*4);
|
uint32_t start = DecodeFixed32(offset_ + index*4);
|
||||||
uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
|
uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
|
||||||
if (start <= limit && limit <= (offset_ - data_)) {
|
if (start <= limit && limit <= (offset_ - data_)) {
|
||||||
Slice filter = Slice(data_ + start, limit - start);
|
Slice filter = Slice(data_ + start, limit - start);
|
||||||
return policy_->KeyMayMatch(key, filter);
|
return policy_->KeyMayMatch(entry, filter);
|
||||||
} else if (start == limit) {
|
} else if (start == limit) {
|
||||||
// Empty filters do not match any keys
|
// Empty filters do not match any entries
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,9 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include "leveldb/options.h"
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
|
#include "leveldb/slice_transform.h"
|
||||||
#include "util/hash.h"
|
#include "util/hash.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
@ -28,20 +30,25 @@ class FilterPolicy;
|
||||||
// (StartBlock AddKey*)* Finish
|
// (StartBlock AddKey*)* Finish
|
||||||
class FilterBlockBuilder {
|
class FilterBlockBuilder {
|
||||||
public:
|
public:
|
||||||
explicit FilterBlockBuilder(const FilterPolicy*);
|
explicit FilterBlockBuilder(const Options& opt);
|
||||||
|
|
||||||
void StartBlock(uint64_t block_offset);
|
void StartBlock(uint64_t block_offset);
|
||||||
void AddKey(const Slice& key);
|
void AddKey(const Slice& key);
|
||||||
Slice Finish();
|
Slice Finish();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
bool SamePrefix(const Slice &key1, const Slice &key2) const;
|
||||||
void GenerateFilter();
|
void GenerateFilter();
|
||||||
|
|
||||||
const FilterPolicy* policy_;
|
const FilterPolicy* policy_;
|
||||||
std::string keys_; // Flattened key contents
|
const SliceTransform* prefix_extractor_;
|
||||||
std::vector<size_t> start_; // Starting index in keys_ of each key
|
bool whole_key_filtering_;
|
||||||
std::string result_; // Filter data computed so far
|
const Comparator* comparator_;
|
||||||
std::vector<Slice> tmp_keys_; // policy_->CreateFilter() argument
|
|
||||||
|
std::string entries_; // Flattened entry contents
|
||||||
|
std::vector<size_t> start_; // Starting index in entries_ of each entry
|
||||||
|
std::string result_; // Filter data computed so far
|
||||||
|
std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
|
||||||
std::vector<uint32_t> filter_offsets_;
|
std::vector<uint32_t> filter_offsets_;
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
|
@ -52,15 +59,20 @@ class FilterBlockBuilder {
|
||||||
class FilterBlockReader {
|
class FilterBlockReader {
|
||||||
public:
|
public:
|
||||||
// REQUIRES: "contents" and *policy must stay live while *this is live.
|
// REQUIRES: "contents" and *policy must stay live while *this is live.
|
||||||
FilterBlockReader(const FilterPolicy* policy, const Slice& contents);
|
FilterBlockReader(const Options& opt, const Slice& contents);
|
||||||
bool KeyMayMatch(uint64_t block_offset, const Slice& key);
|
bool KeyMayMatch(uint64_t block_offset, const Slice& key);
|
||||||
|
bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const FilterPolicy* policy_;
|
const FilterPolicy* policy_;
|
||||||
|
const SliceTransform* prefix_extractor_;
|
||||||
|
bool whole_key_filtering_;
|
||||||
const char* data_; // Pointer to filter data (at block-start)
|
const char* data_; // Pointer to filter data (at block-start)
|
||||||
const char* offset_; // Pointer to beginning of offset array (at block-end)
|
const char* offset_; // Pointer to beginning of offset array (at block-end)
|
||||||
size_t num_; // Number of entries in offset array
|
size_t num_; // Number of entries in offset array
|
||||||
size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
|
size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
|
||||||
|
|
||||||
|
bool MayMatch(uint64_t block_offset, const Slice& entry);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,19 +41,25 @@ class TestHashFilter : public FilterPolicy {
|
||||||
class FilterBlockTest {
|
class FilterBlockTest {
|
||||||
public:
|
public:
|
||||||
TestHashFilter policy_;
|
TestHashFilter policy_;
|
||||||
|
Options options_;
|
||||||
|
|
||||||
|
FilterBlockTest() {
|
||||||
|
options_ = Options();
|
||||||
|
options_.filter_policy = &policy_;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST(FilterBlockTest, EmptyBuilder) {
|
TEST(FilterBlockTest, EmptyBuilder) {
|
||||||
FilterBlockBuilder builder(&policy_);
|
FilterBlockBuilder builder(options_);
|
||||||
Slice block = builder.Finish();
|
Slice block = builder.Finish();
|
||||||
ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
|
ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
|
||||||
FilterBlockReader reader(&policy_, block);
|
FilterBlockReader reader(options_, block);
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
|
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
|
ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(FilterBlockTest, SingleChunk) {
|
TEST(FilterBlockTest, SingleChunk) {
|
||||||
FilterBlockBuilder builder(&policy_);
|
FilterBlockBuilder builder(options_);
|
||||||
builder.StartBlock(100);
|
builder.StartBlock(100);
|
||||||
builder.AddKey("foo");
|
builder.AddKey("foo");
|
||||||
builder.AddKey("bar");
|
builder.AddKey("bar");
|
||||||
|
@ -63,7 +69,7 @@ TEST(FilterBlockTest, SingleChunk) {
|
||||||
builder.StartBlock(300);
|
builder.StartBlock(300);
|
||||||
builder.AddKey("hello");
|
builder.AddKey("hello");
|
||||||
Slice block = builder.Finish();
|
Slice block = builder.Finish();
|
||||||
FilterBlockReader reader(&policy_, block);
|
FilterBlockReader reader(options_, block);
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
|
ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
|
ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
|
ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
|
||||||
|
@ -74,7 +80,7 @@ TEST(FilterBlockTest, SingleChunk) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(FilterBlockTest, MultiChunk) {
|
TEST(FilterBlockTest, MultiChunk) {
|
||||||
FilterBlockBuilder builder(&policy_);
|
FilterBlockBuilder builder(options_);
|
||||||
|
|
||||||
// First filter
|
// First filter
|
||||||
builder.StartBlock(0);
|
builder.StartBlock(0);
|
||||||
|
@ -94,7 +100,7 @@ TEST(FilterBlockTest, MultiChunk) {
|
||||||
builder.AddKey("hello");
|
builder.AddKey("hello");
|
||||||
|
|
||||||
Slice block = builder.Finish();
|
Slice block = builder.Finish();
|
||||||
FilterBlockReader reader(&policy_, block);
|
FilterBlockReader reader(options_, block);
|
||||||
|
|
||||||
// Check first filter
|
// Check first filter
|
||||||
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
|
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
#include "table/table.h"
|
#include "table/table.h"
|
||||||
|
|
||||||
|
#include "db/dbformat.h"
|
||||||
|
|
||||||
#include "leveldb/cache.h"
|
#include "leveldb/cache.h"
|
||||||
#include "leveldb/comparator.h"
|
#include "leveldb/comparator.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
|
@ -207,7 +209,7 @@ void Table::ReadFilter(const Slice& filter_handle_value) {
|
||||||
if (block.heap_allocated) {
|
if (block.heap_allocated) {
|
||||||
rep_->filter_data = block.data.data(); // Will need to delete later
|
rep_->filter_data = block.data.data(); // Will need to delete later
|
||||||
}
|
}
|
||||||
rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
|
rep_->filter = new FilterBlockReader(rep_->options, block.data);
|
||||||
}
|
}
|
||||||
|
|
||||||
Table::~Table() {
|
Table::~Table() {
|
||||||
|
@ -318,7 +320,66 @@ Iterator* Table::BlockReader(void* arg,
|
||||||
return BlockReader(arg, options, index_value, nullptr, for_compaction);
|
return BlockReader(arg, options, index_value, nullptr, for_compaction);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This will be broken if the user specifies an unusual implementation
|
||||||
|
// of Options.comparator, or if the user specifies an unusual
|
||||||
|
// definition of prefixes in Options.filter_policy. In particular, we
|
||||||
|
// require the following three properties:
|
||||||
|
//
|
||||||
|
// 1) key.starts_with(prefix(key))
|
||||||
|
// 2) Compare(prefix(key), key) <= 0.
|
||||||
|
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
|
||||||
|
bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
|
||||||
|
FilterBlockReader* filter = rep_->filter;
|
||||||
|
bool may_match = true;
|
||||||
|
Status s;
|
||||||
|
|
||||||
|
if (filter == nullptr) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
|
||||||
|
iiter->Seek(internal_prefix);
|
||||||
|
if (! iiter->Valid()) {
|
||||||
|
// we're past end of file
|
||||||
|
may_match = false;
|
||||||
|
} else if (iiter->key().starts_with(internal_prefix)) {
|
||||||
|
// we need to check for this subtle case because our only
|
||||||
|
// guarantee is that "the key is a string >= last key in that data
|
||||||
|
// block" according to the doc/table_format.txt spec.
|
||||||
|
//
|
||||||
|
// Suppose iiter->key() starts with the desired prefix; it is not
|
||||||
|
// necessarily the case that the corresponding data block will
|
||||||
|
// contain the prefix, since iiter->key() need not be in the
|
||||||
|
// block. However, the next data block may contain the prefix, so
|
||||||
|
// we return true to play it safe.
|
||||||
|
may_match = true;
|
||||||
|
} else {
|
||||||
|
// iiter->key() does NOT start with the desired prefix. Because
|
||||||
|
// Seek() finds the first key that is >= the seek target, this
|
||||||
|
// means that iiter->key() > prefix. Thus, any data blocks coming
|
||||||
|
// after the data block corresponding to iiter->key() cannot
|
||||||
|
// possibly contain the key. Thus, the corresponding data block
|
||||||
|
// is the only one which could potentially contain the prefix.
|
||||||
|
Slice handle_value = iiter->value();
|
||||||
|
BlockHandle handle;
|
||||||
|
s = handle.DecodeFrom(&handle_value);
|
||||||
|
assert(s.ok());
|
||||||
|
may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix);
|
||||||
|
}
|
||||||
|
delete iiter;
|
||||||
|
return may_match;
|
||||||
|
}
|
||||||
|
|
||||||
Iterator* Table::NewIterator(const ReadOptions& options) const {
|
Iterator* Table::NewIterator(const ReadOptions& options) const {
|
||||||
|
if (options.prefix) {
|
||||||
|
InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
|
||||||
|
if (!PrefixMayMatch(internal_prefix.Encode())) {
|
||||||
|
// nothing in this file can match the prefix, so we should not
|
||||||
|
// bother doing I/O to this file when iterating.
|
||||||
|
return NewEmptyIterator();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return NewTwoLevelIterator(
|
return NewTwoLevelIterator(
|
||||||
rep_->index_block->NewIterator(rep_->options.comparator),
|
rep_->index_block->NewIterator(rep_->options.comparator),
|
||||||
&Table::BlockReader, const_cast<Table*>(this), options, rep_->soptions);
|
&Table::BlockReader, const_cast<Table*>(this), options, rep_->soptions);
|
||||||
|
|
|
@ -47,6 +47,8 @@ class Table {
|
||||||
|
|
||||||
~Table();
|
~Table();
|
||||||
|
|
||||||
|
bool PrefixMayMatch(const Slice& prefix) const;
|
||||||
|
|
||||||
// Returns a new iterator over the table contents.
|
// Returns a new iterator over the table contents.
|
||||||
// The result of NewIterator() is initially invalid (caller must
|
// The result of NewIterator() is initially invalid (caller must
|
||||||
// call one of the Seek methods on the iterator before using it).
|
// call one of the Seek methods on the iterator before using it).
|
||||||
|
|
|
@ -55,7 +55,7 @@ struct TableBuilder::Rep {
|
||||||
num_entries(0),
|
num_entries(0),
|
||||||
closed(false),
|
closed(false),
|
||||||
filter_block(opt.filter_policy == nullptr ? nullptr
|
filter_block(opt.filter_policy == nullptr ? nullptr
|
||||||
: new FilterBlockBuilder(opt.filter_policy)),
|
: new FilterBlockBuilder(opt)),
|
||||||
pending_index_entry(false) {
|
pending_index_entry(false) {
|
||||||
index_block_options.block_restart_interval = 1;
|
index_block_options.block_restart_interval = 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#include "leveldb/slice_transform.h"
|
||||||
|
|
||||||
|
#include "leveldb/slice.h"
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
class FixedPrefixTransform : public SliceTransform {
|
||||||
|
private:
|
||||||
|
size_t prefix_len_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { }
|
||||||
|
|
||||||
|
virtual const char* Name() const {
|
||||||
|
return "rocksdb.FixedPrefix";
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual Slice Transform(const Slice& src) const {
|
||||||
|
assert(InDomain(src));
|
||||||
|
return Slice(src.data(), prefix_len_);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool InDomain(const Slice& src) const {
|
||||||
|
return (src.size() >= prefix_len_);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool InRange(const Slice& dst) const {
|
||||||
|
return (dst.size() == prefix_len_);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
|
||||||
|
return new FixedPrefixTransform(prefix_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace leveldb
|
|
@ -33,6 +33,8 @@ Options::Options()
|
||||||
block_restart_interval(16),
|
block_restart_interval(16),
|
||||||
compression(kSnappyCompression),
|
compression(kSnappyCompression),
|
||||||
filter_policy(nullptr),
|
filter_policy(nullptr),
|
||||||
|
prefix_extractor(nullptr),
|
||||||
|
whole_key_filtering(true),
|
||||||
num_levels(7),
|
num_levels(7),
|
||||||
level0_file_num_compaction_trigger(4),
|
level0_file_num_compaction_trigger(4),
|
||||||
level0_slowdown_writes_trigger(8),
|
level0_slowdown_writes_trigger(8),
|
||||||
|
|
Loading…
Reference in New Issue