diff --git a/HISTORY.md b/HISTORY.md index 6b89342f82..d12e28cb30 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ ### New Features * Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. ### Bug Fixes +* Fix a bug in misreporting the estimated partition index size in properties block. ## 5.15.0 (7/17/2018) ### Public API Change diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index b0234e7dc5..b2722dd726 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -765,7 +765,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( ? rep_->table_options.filter_policy->Name() : ""; rep_->props.index_size = - rep_->index_builder->EstimatedSize() + kBlockTrailerSize; + rep_->index_builder->IndexSize() + kBlockTrailerSize; rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr ? rep_->ioptions.user_comparator->Name() : "nullptr"; @@ -796,7 +796,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( assert(rep_->p_index_builder_ != nullptr); rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); rep_->props.top_level_index_size = - rep_->p_index_builder_->EstimateTopLevelIndexSize(rep_->offset); + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); diff --git a/table/index_builder.cc b/table/index_builder.cc index 6b8114f3e9..5eaecbad10 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -78,6 +78,12 @@ PartitionedIndexBuilder::PartitionedIndexBuilder( use_value_delta_encoding), sub_index_builder_(nullptr), table_opt_(table_opt), + // We start by false. After each partition we revise the value based on + // what the sub_index_builder has decided. If the feature is disabled + // entirely, this will be set to true after switching the first + // sub_index_builder. Otherwise, it could be set to true even one of the + // sub_index_builders could not safely exclude seq from the keys, then it + // wil be enforced on all sub_index_builders on ::Finish. seperator_is_key_plus_seq_(false), use_value_delta_encoding_(use_value_delta_encoding) {} @@ -92,7 +98,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, - sub_index_builder_->index_block_builder_)); + // Note: this is sub-optimal since sub_index_builder_ could later reset + // seperator_is_key_plus_seq_ but the probability of that is low. + sub_index_builder_->seperator_is_key_plus_seq_ + ? sub_index_builder_->index_block_builder_ + : sub_index_builder_->index_block_builder_without_seq_)); partition_cut_requested_ = false; } @@ -152,6 +162,9 @@ void PartitionedIndexBuilder::AddIndexEntry( Status PartitionedIndexBuilder::Finish( IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } // It must be set to null after last key is added assert(sub_index_builder_ == nullptr); if (finishing_indexes == true) { @@ -181,6 +194,8 @@ Status PartitionedIndexBuilder::Finish( index_blocks->index_block_contents = index_block_builder_without_seq_.Finish(); } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; return Status::OK(); } else { // Finish the next partition index in line and Incomplete() to indicate we @@ -189,48 +204,13 @@ Status PartitionedIndexBuilder::Finish( // Apply the policy to all sub-indexes entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); finishing_indexes = true; return s.ok() ? Status::Incomplete() : s; } } -// Estimate size excluding the top-level index -// It is assumed that this method is called before writing index partition -// starts -size_t PartitionedIndexBuilder::EstimatedSize() const { - size_t total = 0; - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - total += it->value->EstimatedSize(); - } - total += - sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize(); - return total; -} - -// Since when this method is called we do not know the index block offsets yet, -// the top-level index does not exist. Hence we estimate the block offsets and -// create a temporary top-level index. -size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize( - uint64_t offset) const { - BlockBuilder tmp_builder( - table_opt_.index_block_restart_interval); // tmp top-level index builder - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - std::string tmp_handle_encoding; - uint64_t size = it->value->EstimatedSize(); - BlockHandle tmp_block_handle(offset, size); - tmp_block_handle.EncodeTo(&tmp_handle_encoding); - std::string handle_delta_encoding; - tmp_block_handle.EncodeSizeTo(&handle_delta_encoding); - const Slice handle_delta_encoding_slice(handle_delta_encoding); - tmp_builder.Add( - seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key), - tmp_handle_encoding, &handle_delta_encoding_slice); - offset += size; - } - return tmp_builder.CurrentSizeEstimate(); -} - size_t PartitionedIndexBuilder::NumPartitions() const { - return entries_.size(); + return partition_cnt_; } } // namespace rocksdb diff --git a/table/index_builder.h b/table/index_builder.h index bc3a2bd542..147108d3b1 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -97,13 +97,15 @@ class IndexBuilder { virtual Status Finish(IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) = 0; - // Get the estimated size for index block. - virtual size_t EstimatedSize() const = 0; + // Get the size for index block. Must be called after ::Finish. + virtual size_t IndexSize() const = 0; virtual bool seperator_is_key_plus_seq() { return true; } protected: const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; }; // This index builder builds space-efficient index block. @@ -175,15 +177,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_blocks->index_block_contents = index_block_builder_without_seq_.Finish(); } + index_size_ = index_blocks->index_block_contents.size(); return Status::OK(); } - virtual size_t EstimatedSize() const override { - if (seperator_is_key_plus_seq_) { - return index_block_builder_.CurrentSizeEstimate(); - } else { - return index_block_builder_without_seq_.CurrentSizeEstimate(); - } + virtual size_t IndexSize() const override { + return index_size_; } virtual bool seperator_is_key_plus_seq() override { @@ -286,8 +285,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const override { - return primary_index_builder_.EstimatedSize() + prefix_block_.size() + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + prefix_meta_block_.size(); } @@ -354,8 +353,12 @@ class PartitionedIndexBuilder : public IndexBuilder { IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; - virtual size_t EstimatedSize() const override; - size_t EstimateTopLevelIndexSize(uint64_t) const; + virtual size_t IndexSize() const override { + return index_size_; + } + size_t TopLevelIndexSize(uint64_t) const { + return top_level_index_size_; + } size_t NumPartitions() const; inline bool ShouldCutFilterBlock() { @@ -380,6 +383,11 @@ class PartitionedIndexBuilder : public IndexBuilder { bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + void MakeNewSubIndexBuilder(); struct Entry {