From df38c1ce660628f05b4686eeaf0b548295ce7967 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 22 Apr 2019 08:17:45 -0700 Subject: [PATCH] Add BlockBasedTableOptions::index_shortening (#5174) Summary: Introduce BlockBasedTableOptions::index_shortening to give users control over which key shortening techniques are used in building index blocks. Before this patch, both separators and successor keys were shortened in indexes. With this patch, the default is set to kShortenSeparators to only shorten the separators. Since each index block has many separators and only one successor (last key), the change should not have a negative impact on index block size. However, it should prevent many unnecessary block loads where, due to the approximation introduced by a shortened successor, a seek would land on the previous block and then have to fix it by moving to the next one. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5174 Differential Revision: D14884185 Pulled By: al13n321 fbshipit-source-id: 1b08bc8c03edcf09b6b8c16e9a7eea08ad4dd534 --- HISTORY.md | 2 +- db/db_block_cache_test.cc | 10 +++++--- db/db_bloom_filter_test.cc | 9 +++++-- include/rocksdb/table.h | 35 ++++++++++++++++++++++++++ options/options_helper.cc | 21 ++++++++++++++++ options/options_helper.h | 6 +++++ options/options_parser.cc | 6 +++++ options/options_settable_test.cc | 1 + table/block_based_table_factory.cc | 3 +++ table/block_based_table_factory.h | 4 +++ table/index_builder.cc | 26 +++++++++---------- table/index_builder.h | 40 +++++++++++++++++++----------- 12 files changed, 128 insertions(+), 35 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 196924feac..1b47052577 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,7 +4,7 @@ ### New Features * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. 
- +* Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. * Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() start to take an optional cache object. diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index ad906dbcb5..f6e1aad323 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -346,14 +346,14 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - // 200 bytes are enough to hold the first two blocks - std::shared_ptr cache = NewLRUCache(200, 0, false); + // 500 bytes are enough to hold the first two blocks + std::shared_ptr cache = NewLRUCache(500, 0, false); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(Put(1, "key", "val")); + ASSERT_OK(Put(1, "longer_key", "val")); // Create a new table ASSERT_OK(Flush(1)); size_t index_bytes_insert = @@ -367,7 +367,9 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { cache->SetCapacity(index_bytes_insert + filter_bytes_insert); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); 
ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); - ASSERT_OK(Put(1, "key2", "val")); + // Note that the second key needs to be no longer than the first one. + // Otherwise the second index block may not fit in cache. + ASSERT_OK(Put(1, "key", "val")); // Create a new table ASSERT_OK(Flush(1)); // cache evicted old index and block entries diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index de612d04b6..a2a01d6b4c 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1004,13 +1004,16 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(value3, iter->value().ToString()); - ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count); + // The seek doesn't check block-based bloom filter because last index key + // starts with the same prefix we're seeking to. + uint64_t expected_hits = use_block_based_builder_ ? 1 : 2; + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); iter->Seek(key2); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); - ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, @@ -1335,6 +1338,8 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset( NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.index_shortening = BlockBasedTableOptions:: + IndexShorteningMode::kShortenSeparatorsAndSuccessor; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index b154a60a00..88fcc78ed8 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -260,6 +260,41 @@ struct 
BlockBasedTableOptions { // Align data blocks on lesser of page size and block size bool block_align = false; + + // This enum allows trading off increased index size for improved iterator + // seek performance in some situations, particularly when block cache is + // disabled (ReadOptions::fill_cache = false) and direct IO is + // enabled (DBOptions::use_direct_reads = true). + // The default mode is the best tradeoff for most use cases. + // This option only affects newly written tables. + // + // The index contains a key separating each pair of consecutive blocks. + // Let A be the highest key in one block, B the lowest key in the next block, + // and I the index entry separating these two blocks: + // [ ... A] I [B ...] + // I is allowed to be anywhere in [A, B). + // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + // first block, then immediately fall through to the second block. + // However, if I=A, this can't happen, and we'll read only the second block. + // In kNoShortening mode, we use I=A. In other modes, we use the shortest + // key in [A, B), which usually significantly reduces index size. + // + // There's a similar story for the last index entry, which is an upper bound + // of the highest key in the file. If it's shortened and therefore + // overestimated, iterator is likely to unnecessarily read the last data block + // from each file on each seek. + enum class IndexShorteningMode : char { + // Use full keys. + kNoShortening, + // Shorten index keys between blocks, but use full key for the last index + // key, which is the upper bound of the whole file. + kShortenSeparators, + // Shorten both keys between blocks and key after last block. + kShortenSeparatorsAndSuccessor, + }; + + IndexShorteningMode index_shortening = + IndexShorteningMode::kShortenSeparators; }; // Table Properties that are specific to block-based table properties. 
diff --git a/options/options_helper.cc b/options/options_helper.cc index d7170fed7d..c6ed5c26b0 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -522,6 +522,11 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, block_base_table_data_block_index_type_string_map, value, reinterpret_cast( opt_address)); + case OptionType::kBlockBasedTableIndexShorteningMode: + return ParseEnum( + block_base_table_index_shortening_mode_string_map, value, + reinterpret_cast( + opt_address)); case OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -717,6 +722,12 @@ bool SerializeSingleOptionHelper(const char* opt_address, *reinterpret_cast( opt_address), value); + case OptionType::kBlockBasedTableIndexShorteningMode: + return SerializeEnum( + block_base_table_index_shortening_mode_string_map, + *reinterpret_cast( + opt_address), + value); case OptionType::kFlushBlockPolicyFactory: { const auto* ptr = reinterpret_cast*>( @@ -1640,6 +1651,16 @@ std::unordered_map {"kDataBlockBinaryAndHash", BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; +std::unordered_map + OptionsHelper::block_base_table_index_shortening_mode_string_map = { + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; + std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; diff --git a/options/options_helper.h b/options/options_helper.h index 1d3d880a62..0045220189 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -72,6 +72,7 @@ enum class OptionType { kMemTableRepFactory, kBlockBasedTableIndexType, kBlockBasedTableDataBlockIndexType, + kBlockBasedTableIndexShorteningMode, kFilterPolicy, kFlushBlockPolicyFactory, 
kChecksumType, @@ -169,6 +170,9 @@ struct OptionsHelper { static std::unordered_map block_base_table_data_block_index_type_string_map; + static std::unordered_map + block_base_table_index_shortening_mode_string_map; static std::unordered_map encoding_type_string_map; static std::unordered_map compaction_style_string_map; @@ -211,6 +215,8 @@ static auto& block_base_table_index_type_string_map = OptionsHelper::block_base_table_index_type_string_map; static auto& block_base_table_data_block_index_type_string_map = OptionsHelper::block_base_table_data_block_index_type_string_map; +static auto& block_base_table_index_shortening_mode_string_map = + OptionsHelper::block_base_table_index_shortening_mode_string_map; static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; static auto& compaction_style_string_map = OptionsHelper::compaction_style_string_map; diff --git a/options/options_parser.cc b/options/options_parser.cc index 2a85fa5343..f09e53e4a4 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -569,6 +569,12 @@ bool AreEqualOptions( offset1) == *reinterpret_cast( offset2)); + case OptionType::kBlockBasedTableIndexShorteningMode: + return ( + *reinterpret_cast( + offset1) == + *reinterpret_cast( + offset2)); case OptionType::kWALRecoveryMode: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 58e0527756..09e6b642a5 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -143,6 +143,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" "data_block_index_type=kDataBlockBinaryAndHash;" + "index_shortening=kNoShortening;" "data_block_hash_table_util_ratio=0.75;" "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" diff --git 
a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index e8fb75414c..47fe8e1b0e 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -301,6 +301,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", table_options_.data_block_index_type); ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast(table_options_.index_shortening)); + ret.append(buffer); snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", table_options_.data_block_hash_table_util_ratio); ret.append(buffer); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 100bb0bc41..83676e9b9c 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -126,6 +126,10 @@ static std::unordered_map {offsetof(struct BlockBasedTableOptions, data_block_index_type), OptionType::kBlockBasedTableDataBlockIndexType, OptionVerificationType::kNormal, false, 0}}, + {"index_shortening", + {offsetof(struct BlockBasedTableOptions, index_shortening), + OptionType::kBlockBasedTableIndexShorteningMode, + OptionVerificationType::kNormal, false, 0}}, {"data_block_hash_table_util_ratio", {offsetof(struct BlockBasedTableOptions, data_block_hash_table_util_ratio), diff --git a/table/index_builder.cc b/table/index_builder.cc index cd28c42a8b..63cb80598f 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -34,25 +34,22 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( case BlockBasedTableOptions::kBinarySearch: { result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, - table_opt.format_version, use_value_delta_encoding); - } - break; + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening); + } break; case BlockBasedTableOptions::kHashSearch: { - result = new HashIndexBuilder(comparator, 
int_key_slice_transform, - table_opt.index_block_restart_interval, - table_opt.format_version, - use_value_delta_encoding); - } - break; + result = new HashIndexBuilder( + comparator, int_key_slice_transform, + table_opt.index_block_restart_interval, table_opt.format_version, + use_value_delta_encoding, table_opt.index_shortening); + } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); - } - break; + } break; default: { assert(!"Do not recognize the index type "); - } - break; + } break; } return result; } @@ -95,7 +92,8 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, - table_opt_.format_version, use_value_delta_encoding_); + table_opt_.format_version, use_value_delta_encoding_, + table_opt_.index_shortening); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/index_builder.h b/table/index_builder.h index 87d7b7a71b..2f349fc547 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -119,17 +119,19 @@ class IndexBuilder { // substitute key that serves the same function. 
class ShortenedIndexBuilder : public IndexBuilder { public: - explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - const int index_block_restart_interval, - const uint32_t format_version, - const bool use_value_delta_encoding) + explicit ShortenedIndexBuilder( + const InternalKeyComparator* comparator, + const int index_block_restart_interval, const uint32_t format_version, + const bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, - use_value_delta_encoding) { + use_value_delta_encoding), + shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } @@ -138,8 +140,11 @@ class ShortenedIndexBuilder : public IndexBuilder { const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { if (first_key_in_next_block != nullptr) { - comparator_->FindShortestSeparator(last_key_in_current_block, - *first_key_in_next_block); + if (shortening_mode_ != + BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { + comparator_->FindShortestSeparator(last_key_in_current_block, + *first_key_in_next_block); + } if (!seperator_is_key_plus_seq_ && comparator_->user_comparator()->Compare( ExtractUserKey(*last_key_in_current_block), @@ -147,7 +152,10 @@ class ShortenedIndexBuilder : public IndexBuilder { seperator_is_key_plus_seq_ = true; } } else { - comparator_->FindShortSuccessor(last_key_in_current_block); + if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor) { + comparator_->FindShortSuccessor(last_key_in_current_block); + } } auto sep = Slice(*last_key_in_current_block); @@ -193,6 +201,7 @@ class ShortenedIndexBuilder : 
public IndexBuilder { BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; bool seperator_is_key_plus_seq_; + BlockBasedTableOptions::IndexShorteningMode shortening_mode_; BlockHandle last_encoded_handle_; }; @@ -225,13 +234,16 @@ class ShortenedIndexBuilder : public IndexBuilder { // data copy or small heap allocations for prefixes. class HashIndexBuilder : public IndexBuilder { public: - explicit HashIndexBuilder(const InternalKeyComparator* comparator, - const SliceTransform* hash_key_extractor, - int index_block_restart_interval, - int format_version, bool use_value_delta_encoding) + explicit HashIndexBuilder( + const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, int format_version, + bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode) : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, - format_version, use_value_delta_encoding), + format_version, use_value_delta_encoding, + shortening_mode), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -389,7 +401,7 @@ class PartitionedIndexBuilder : public IndexBuilder { std::unique_ptr value; }; std::list entries_; // list of partitioned indexes and their keys - BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_; // top-level index builder BlockBuilder index_block_builder_without_seq_; // same for user keys // the active partition index builder ShortenedIndexBuilder* sub_index_builder_;