Extend format 3 to partitioned index/filters (#3958)

Summary:
format_version 3 changes the format of index blocks by storing user keys instead of internal keys, which saves 8 bytes per key. This patch extends the format to the top-level indexes of partitioned indexes/filters.
Closes https://github.com/facebook/rocksdb/pull/3958

Differential Revision: D8294615

Pulled By: maysamyabandeh

fbshipit-source-id: 17666cc16b8076c363972e2308e31547e835f0fe
This commit is contained in:
Maysam Yabandeh 2018-06-06 16:44:52 -07:00 committed by Facebook Github Bot
parent 5504a056f8
commit b73652169e
7 changed files with 69 additions and 34 deletions

View File

@ -449,12 +449,14 @@ Options DBTestBase::GetOptions(
break;
}
case kBlockBasedTableWithPartitionedIndexFormat3: {
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
options.prefix_extractor.reset(NewNoopTransform());
table_options.format_version = 3;
// Format 3 changes the binary index format. Since a partitioned index is a
// superset of simple indexes, we are also using kTwoLevelIndexSearch to
// test this format.
table_options.format_version = 3;
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
// The top-level index in partitioned filters is also affected by format 3.
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
table_options.partition_filters = true;
break;
}
case kBlockBasedTableWithIndexRestartInterval: {

View File

@ -237,16 +237,18 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
return NewTwoLevelIterator(
new BlockBasedTable::PartitionedIndexIteratorState(
table_, &partition_map_, index_key_includes_seq_),
index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), nullptr, true));
index_block_->NewIterator(icomparator_,
icomparator_->user_comparator(), nullptr,
true, nullptr, index_key_includes_seq_));
} else {
auto ro = ReadOptions();
ro.fill_cache = fill_cache;
bool kIsIndex = true;
return new BlockBasedTableIterator(
table_, ro, *icomparator_,
index_block_->NewIterator(
icomparator_, icomparator_->user_comparator(), nullptr, true),
index_block_->NewIterator(icomparator_,
icomparator_->user_comparator(), nullptr,
true, nullptr, index_key_includes_seq_),
false,
/* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_);
}
@ -262,7 +264,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
BlockIter biter;
BlockHandle handle;
index_block_->NewIterator(icomparator_, icomparator_->user_comparator(),
&biter, true);
&biter, true, nullptr, index_key_includes_seq_);
// Index partitions are assumed to be consecutive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();
@ -1308,7 +1310,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter(
return new PartitionedFilterBlockReader(
rep->prefix_filtering ? prefix_extractor : nullptr,
rep->whole_key_filtering, std::move(block), nullptr,
rep->ioptions.statistics, rep->internal_comparator, this);
rep->ioptions.statistics, rep->internal_comparator, this,
rep_->table_properties == nullptr ||
!rep_->table_properties->index_key_is_user_key);
}
case Rep::FilterType::kBlockFilter:

View File

@ -66,6 +66,8 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
: IndexBuilder(comparator),
index_block_builder_(table_opt.index_block_restart_interval,
table_opt.format_version),
index_block_builder_without_seq_(table_opt.index_block_restart_interval,
table_opt.format_version),
sub_index_builder_(nullptr),
table_opt_(table_opt),
seperator_is_key_plus_seq_(false) {}
@ -149,11 +151,20 @@ Status PartitionedIndexBuilder::Finish(
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(last_entry.key, handle_encoding);
if (!seperator_is_key_plus_seq_) {
index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
handle_encoding);
}
entries_.pop_front();
}
// If there is no sub_index left, then return the 2nd level index.
if (UNLIKELY(entries_.empty())) {
index_blocks->index_block_contents = index_block_builder_.Finish();
if (seperator_is_key_plus_seq_) {
index_blocks->index_block_contents = index_block_builder_.Finish();
} else {
index_blocks->index_block_contents =
index_block_builder_without_seq_.Finish();
}
return Status::OK();
} else {
// Finish the next partition index in line and Incomplete() to indicate we
@ -192,7 +203,9 @@ size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize(
uint64_t size = it->value->EstimatedSize();
BlockHandle tmp_block_handle(offset, size);
tmp_block_handle.EncodeTo(&tmp_handle_encoding);
tmp_builder.Add(it->key, tmp_handle_encoding);
tmp_builder.Add(
seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key),
tmp_handle_encoding);
offset += size;
}
return tmp_builder.CurrentSizeEstimate();

View File

@ -368,6 +368,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
};
std::list<Entry> entries_; // list of partitioned indexes and their keys
BlockBuilder index_block_builder_; // top-level index builder
BlockBuilder index_block_builder_without_seq_; // same for user keys
// the active partition index builder
ShortenedIndexBuilder* sub_index_builder_;
// the last key in the active partition index builder

View File

@ -24,6 +24,7 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
: FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
filter_bits_builder),
index_on_filter_block_builder_(index_block_restart_interval),
index_on_filter_block_builder_without_seq_(index_block_restart_interval),
p_index_builder_(p_index_builder),
filters_in_partition_(0),
num_added_(0) {
@ -65,6 +66,10 @@ Slice PartitionedFilterBlockBuilder::Finish(
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
index_on_filter_block_builder_.Add(last_entry.key, handle_encoding);
if (!p_index_builder_->seperator_is_key_plus_seq()) {
index_on_filter_block_builder_without_seq_.Add(
ExtractUserKey(last_entry.key), handle_encoding);
}
filters.pop_front();
} else {
MaybeCutAFilterBlock();
@ -74,7 +79,11 @@ Slice PartitionedFilterBlockBuilder::Finish(
if (UNLIKELY(filters.empty())) {
*status = Status::OK();
if (finishing_filters) {
return index_on_filter_block_builder_.Finish();
if (p_index_builder_->seperator_is_key_plus_seq()) {
return index_on_filter_block_builder_.Finish();
} else {
return index_on_filter_block_builder_without_seq_.Finish();
}
} else {
// This is the rare case where no key was added to the filter
return Slice();
@ -91,12 +100,13 @@ Slice PartitionedFilterBlockBuilder::Finish(
PartitionedFilterBlockReader::PartitionedFilterBlockReader(
const SliceTransform* prefix_extractor, bool _whole_key_filtering,
BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/,
Statistics* stats, const Comparator& comparator,
const BlockBasedTable* table)
Statistics* stats, const InternalKeyComparator comparator,
const BlockBasedTable* table, const bool index_key_includes_seq)
: FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
prefix_extractor_(prefix_extractor),
comparator_(comparator),
table_(table) {
table_(table),
index_key_includes_seq_(index_key_includes_seq) {
idx_on_fltr_blk_.reset(new Block(std::move(contents),
kDisableGlobalSequenceNumber,
0 /* read_amp_bytes_per_bit */, stats));
@ -113,7 +123,8 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
BlockIter biter;
BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, comparator_.user_comparator(),
&biter, true, nullptr, index_key_includes_seq_);
biter.SeekToFirst();
for (; biter.Valid(); biter.Next()) {
auto input = biter.value();
@ -207,7 +218,8 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
Slice PartitionedFilterBlockReader::GetFilterPartitionHandle(
const Slice& entry) {
BlockIter iter;
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &iter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, comparator_.user_comparator(),
&iter, true, nullptr, index_key_includes_seq_);
iter.Seek(entry);
if (UNLIKELY(!iter.Valid())) {
return Slice();
@ -269,7 +281,8 @@ void PartitionedFilterBlockReader::CacheDependencies(
auto rep = table_->rep_;
BlockIter biter;
BlockHandle handle;
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true);
idx_on_fltr_blk_->NewIterator(&comparator_, comparator_.user_comparator(),
&biter, true, nullptr, index_key_includes_seq_);
// Index partitions are assumed to be consecutive. Prefetch them all.
// Read the first block offset
biter.SeekToFirst();

View File

@ -41,6 +41,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
private:
// Filter data
BlockBuilder index_on_filter_block_builder_; // top-level index builder
BlockBuilder
index_on_filter_block_builder_without_seq_; // same for user keys
struct FilterEntry {
std::string key;
Slice filter;
@ -68,13 +70,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
class PartitionedFilterBlockReader : public FilterBlockReader,
public Cleanable {
public:
explicit PartitionedFilterBlockReader(const SliceTransform* prefix_extractor,
bool whole_key_filtering,
BlockContents&& contents,
FilterBitsReader* filter_bits_reader,
Statistics* stats,
const Comparator& comparator,
const BlockBasedTable* table);
explicit PartitionedFilterBlockReader(
const SliceTransform* prefix_extractor, bool whole_key_filtering,
BlockContents&& contents, FilterBitsReader* filter_bits_reader,
Statistics* stats, const InternalKeyComparator comparator,
const BlockBasedTable* table, const bool index_key_includes_seq);
virtual ~PartitionedFilterBlockReader();
virtual bool IsBlockBased() override { return false; }
@ -98,8 +98,9 @@ class PartitionedFilterBlockReader : public FilterBlockReader,
const SliceTransform* prefix_extractor_;
std::unique_ptr<Block> idx_on_fltr_blk_;
const Comparator& comparator_;
const InternalKeyComparator comparator_;
const BlockBasedTable* table_;
const bool index_key_includes_seq_;
std::unordered_map<uint64_t,
BlockBasedTable::CachableEntry<FilterBlockReader>>
filter_map_;

View File

@ -111,7 +111,7 @@ class PartitionedFilterBlockTest : public testing::Test {
std::unique_ptr<MockedBlockBasedTable> table;
PartitionedFilterBlockReader* NewReader(
PartitionedFilterBlockBuilder* builder) {
PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) {
BlockHandle bh;
Status status;
Slice slice;
@ -127,13 +127,14 @@ class PartitionedFilterBlockTest : public testing::Test {
ioptions, env_options, table_options_, icomp, false)));
auto reader = new PartitionedFilterBlockReader(
nullptr, true, BlockContents(slice, false, kNoCompression), nullptr,
nullptr, *icomp.user_comparator(), table.get());
nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq());
return reader;
}
void VerifyReader(PartitionedFilterBlockBuilder* builder,
bool empty = false) {
std::unique_ptr<PartitionedFilterBlockReader> reader(NewReader(builder));
PartitionedIndexBuilder* pib, bool empty = false) {
std::unique_ptr<PartitionedFilterBlockReader> reader(
NewReader(builder, pib));
// Querying added keys
const bool no_io = true;
for (auto key : keys) {
@ -182,7 +183,7 @@ class PartitionedFilterBlockTest : public testing::Test {
builder->Add(keys[i]);
CutABlock(pib.get(), keys[i]);
VerifyReader(builder.get());
VerifyReader(builder.get(), pib.get());
return CountNumOfIndexPartitions(pib.get());
}
@ -202,7 +203,7 @@ class PartitionedFilterBlockTest : public testing::Test {
builder->Add(keys[i]);
CutABlock(pib.get(), keys[i]);
VerifyReader(builder.get());
VerifyReader(builder.get(), pib.get());
}
void TestBlockPerAllKeys() {
@ -220,7 +221,7 @@ class PartitionedFilterBlockTest : public testing::Test {
builder->Add(keys[i]);
CutABlock(pib.get(), keys[i]);
VerifyReader(builder.get());
VerifyReader(builder.get(), pib.get());
}
void CutABlock(PartitionedIndexBuilder* builder,
@ -261,7 +262,7 @@ TEST_F(PartitionedFilterBlockTest, EmptyBuilder) {
std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get()));
const bool empty = true;
VerifyReader(builder.get(), empty);
VerifyReader(builder.get(), pib.get(), empty);
}
TEST_F(PartitionedFilterBlockTest, OneBlock) {