Add BlockBasedTableOptions::index_shortening (#5174)

Summary:
Introduce BlockBasedTableOptions::index_shortening to give users control over which key shortening techniques are used when building index blocks. Before this patch, both separators and successor keys were shortened in indexes. With this patch, the default is kShortenSeparators, which shortens only the separators. Since each index block has many separators and only one successor (last key), the change should have no negative impact on index block size. It should, however, prevent many unnecessary block loads where, due to the approximation introduced by the shortened successor, a seek would land in the previous block and then correct itself by moving to the next one.
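
A minimal usage sketch, assuming an application configuring its own table factory (the helper name is hypothetical; the option and enum names are the ones added by this patch):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Hypothetical helper showing how an application would pick a shortening mode.
rocksdb::Options MakeOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // New default after this patch; set explicitly here for clarity.
  // kShortenSeparatorsAndSuccessor restores the pre-patch behavior,
  // kNoShortening keeps full keys in the index.
  table_options.index_shortening =
      rocksdb::BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}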
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5174

Differential Revision: D14884185

Pulled By: al13n321

fbshipit-source-id: 1b08bc8c03edcf09b6b8c16e9a7eea08ad4dd534
Authored by Mike Kolupaev on 2019-04-22 08:17:45 -07:00; committed by Facebook Github Bot
parent de76909464
commit df38c1ce66
12 changed files with 128 additions and 35 deletions

View File

@@ -4,7 +4,7 @@
### New Features
* Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator.
* Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level.
* Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior.
### Public API Change
* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering.
* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() start to take an optional cache object.

View File

@@ -346,14 +346,14 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
options.statistics = rocksdb::CreateDBStatistics();
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
// 200 bytes are enough to hold the first two blocks
std::shared_ptr<Cache> cache = NewLRUCache(200, 0, false);
// 500 bytes are enough to hold the first two blocks
std::shared_ptr<Cache> cache = NewLRUCache(500, 0, false);
table_options.block_cache = cache;
table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
options.table_factory.reset(new BlockBasedTableFactory(table_options));
CreateAndReopenWithCF({"pikachu"}, options);
ASSERT_OK(Put(1, "key", "val"));
ASSERT_OK(Put(1, "longer_key", "val"));
// Create a new table
ASSERT_OK(Flush(1));
size_t index_bytes_insert =
@@ -367,7 +367,9 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
ASSERT_OK(Put(1, "key2", "val"));
// Note that the second key needs to be no longer than the first one.
// Otherwise the second index block may not fit in cache.
ASSERT_OK(Put(1, "key", "val"));
// Create a new table
ASSERT_OK(Flush(1));
// cache evicted old index and block entries

View File

@@ -1004,13 +1004,16 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(value3, iter->value().ToString());
ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
// The seek doesn't check block-based bloom filter because last index key
// starts with the same prefix we're seeking to.
uint64_t expected_hits = use_block_based_builder_ ? 1 : 2;
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
iter->Seek(key2);
ASSERT_OK(iter->status());
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
}
INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
@@ -1335,6 +1338,8 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.index_shortening = BlockBasedTableOptions::
IndexShorteningMode::kShortenSeparatorsAndSuccessor;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);

View File

@@ -260,6 +260,41 @@ struct BlockBasedTableOptions {
// Align data blocks on lesser of page size and block size
bool block_align = false;
// This enum allows trading off increased index size for improved iterator
// seek performance in some situations, particularly when block cache is
// disabled (ReadOptions::fill_cache = false) and direct IO is
// enabled (DBOptions::use_direct_reads = true).
// The default mode is the best tradeoff for most use cases.
// This option only affects newly written tables.
//
// The index contains a key separating each pair of consecutive blocks.
// Let A be the highest key in one block, B the lowest key in the next block,
// and I the index entry separating these two blocks:
// [ ... A] I [B ...]
// I is allowed to be anywhere in [A, B).
// If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
// first block, then immediately fall through to the second block.
// However, if I=A, this can't happen, and we'll read only the second block.
// In kNoShortening mode, we use I=A. In other modes, we use the shortest
// key in [A, B), which usually significantly reduces index size.
//
// There's a similar story for the last index entry, which is an upper bound
// of the highest key in the file. If it's shortened and therefore
// overestimated, iterator is likely to unnecessarily read the last data block
// from each file on each seek.
enum class IndexShorteningMode : char {
// Use full keys.
kNoShortening,
// Shorten index keys between blocks, but use full key for the last index
// key, which is the upper bound of the whole file.
kShortenSeparators,
// Shorten both keys between blocks and key after last block.
kShortenSeparatorsAndSuccessor,
};
IndexShorteningMode index_shortening =
IndexShorteningMode::kShortenSeparators;
};
// Table Properties that are specific to block-based table properties.
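
To make the comment block above concrete, here is a small standalone sketch of what shortening does to keys, using the public BytewiseComparator for illustration (the index builder itself goes through the internal key comparator; the key strings are made up):

#include <iostream>
#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

int main() {
  const rocksdb::Comparator* cmp = rocksdb::BytewiseComparator();

  // Separator between two blocks: A = "the quick brown fox" is the highest
  // key of one block, B = "the who" is the lowest key of the next block.
  // The index entry I may be any key in [A, B).
  std::string sep = "the quick brown fox";
  cmp->FindShortestSeparator(&sep, rocksdb::Slice("the who"));
  std::cout << sep << "\n";  // "the r" -- much shorter than the full key A

  // Successor used as the last index entry (an upper bound of the file's
  // highest key). Only kShortenSeparatorsAndSuccessor shortens it; the new
  // default keeps the exact highest key instead.
  std::string last = "the quick brown fox";
  cmp->FindShortSuccessor(&last);
  std::cout << last << "\n";  // "u" -- a loose upper bound of the real key
  return 0;
}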

View File

@@ -522,6 +522,11 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
block_base_table_data_block_index_type_string_map, value,
reinterpret_cast<BlockBasedTableOptions::DataBlockIndexType*>(
opt_address));
case OptionType::kBlockBasedTableIndexShorteningMode:
return ParseEnum<BlockBasedTableOptions::IndexShorteningMode>(
block_base_table_index_shortening_mode_string_map, value,
reinterpret_cast<BlockBasedTableOptions::IndexShorteningMode*>(
opt_address));
case OptionType::kEncodingType:
return ParseEnum<EncodingType>(
encoding_type_string_map, value,
@@ -717,6 +722,12 @@ bool SerializeSingleOptionHelper(const char* opt_address,
*reinterpret_cast<const BlockBasedTableOptions::DataBlockIndexType*>(
opt_address),
value);
case OptionType::kBlockBasedTableIndexShorteningMode:
return SerializeEnum<BlockBasedTableOptions::IndexShorteningMode>(
block_base_table_index_shortening_mode_string_map,
*reinterpret_cast<const BlockBasedTableOptions::IndexShorteningMode*>(
opt_address),
value);
case OptionType::kFlushBlockPolicyFactory: {
const auto* ptr =
reinterpret_cast<const std::shared_ptr<FlushBlockPolicyFactory>*>(
@@ -1640,6 +1651,16 @@ std::unordered_map<std::string, BlockBasedTableOptions::DataBlockIndexType>
{"kDataBlockBinaryAndHash",
BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
std::unordered_map<std::string, BlockBasedTableOptions::IndexShorteningMode>
OptionsHelper::block_base_table_index_shortening_mode_string_map = {
{"kNoShortening",
BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
{"kShortenSeparators",
BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
{"kShortenSeparatorsAndSuccessor",
BlockBasedTableOptions::IndexShorteningMode::
kShortenSeparatorsAndSuccessor}};
std::unordered_map<std::string, EncodingType>
OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain},
{"kPrefix", kPrefix}};

View File

@@ -72,6 +72,7 @@ enum class OptionType {
kMemTableRepFactory,
kBlockBasedTableIndexType,
kBlockBasedTableDataBlockIndexType,
kBlockBasedTableIndexShorteningMode,
kFilterPolicy,
kFlushBlockPolicyFactory,
kChecksumType,
@@ -169,6 +170,9 @@ struct OptionsHelper {
static std::unordered_map<std::string,
BlockBasedTableOptions::DataBlockIndexType>
block_base_table_data_block_index_type_string_map;
static std::unordered_map<std::string,
BlockBasedTableOptions::IndexShorteningMode>
block_base_table_index_shortening_mode_string_map;
static std::unordered_map<std::string, EncodingType> encoding_type_string_map;
static std::unordered_map<std::string, CompactionStyle>
compaction_style_string_map;
@@ -211,6 +215,8 @@ static auto& block_base_table_index_type_string_map =
OptionsHelper::block_base_table_index_type_string_map;
static auto& block_base_table_data_block_index_type_string_map =
OptionsHelper::block_base_table_data_block_index_type_string_map;
static auto& block_base_table_index_shortening_mode_string_map =
OptionsHelper::block_base_table_index_shortening_mode_string_map;
static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map;
static auto& compaction_style_string_map =
OptionsHelper::compaction_style_string_map;

View File

@@ -569,6 +569,12 @@ bool AreEqualOptions(
offset1) ==
*reinterpret_cast<const BlockBasedTableOptions::DataBlockIndexType*>(
offset2));
case OptionType::kBlockBasedTableIndexShorteningMode:
return (
*reinterpret_cast<const BlockBasedTableOptions::IndexShorteningMode*>(
offset1) ==
*reinterpret_cast<const BlockBasedTableOptions::IndexShorteningMode*>(
offset2));
case OptionType::kWALRecoveryMode:
return (*reinterpret_cast<const WALRecoveryMode*>(offset1) ==
*reinterpret_cast<const WALRecoveryMode*>(offset2));

View File

@@ -143,6 +143,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
"pin_top_level_index_and_filter=1;"
"index_type=kHashSearch;"
"data_block_index_type=kDataBlockBinaryAndHash;"
"index_shortening=kNoShortening;"
"data_block_hash_table_util_ratio=0.75;"
"checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
"block_cache=1M;block_cache_compressed=1k;block_size=1024;"

View File

@@ -301,6 +301,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
table_options_.data_block_index_type);
ret.append(buffer);
snprintf(buffer, kBufferSize, " index_shortening: %d\n",
static_cast<int>(table_options_.index_shortening));
ret.append(buffer);
snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
table_options_.data_block_hash_table_util_ratio);
ret.append(buffer);

View File

@@ -126,6 +126,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct BlockBasedTableOptions, data_block_index_type),
OptionType::kBlockBasedTableDataBlockIndexType,
OptionVerificationType::kNormal, false, 0}},
{"index_shortening",
{offsetof(struct BlockBasedTableOptions, index_shortening),
OptionType::kBlockBasedTableIndexShorteningMode,
OptionVerificationType::kNormal, false, 0}},
{"data_block_hash_table_util_ratio",
{offsetof(struct BlockBasedTableOptions,
data_block_hash_table_util_ratio),

View File

@@ -34,25 +34,22 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
case BlockBasedTableOptions::kBinarySearch: {
result = new ShortenedIndexBuilder(
comparator, table_opt.index_block_restart_interval,
table_opt.format_version, use_value_delta_encoding);
}
break;
table_opt.format_version, use_value_delta_encoding,
table_opt.index_shortening);
} break;
case BlockBasedTableOptions::kHashSearch: {
result = new HashIndexBuilder(comparator, int_key_slice_transform,
table_opt.index_block_restart_interval,
table_opt.format_version,
use_value_delta_encoding);
}
break;
result = new HashIndexBuilder(
comparator, int_key_slice_transform,
table_opt.index_block_restart_interval, table_opt.format_version,
use_value_delta_encoding, table_opt.index_shortening);
} break;
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
result = PartitionedIndexBuilder::CreateIndexBuilder(
comparator, use_value_delta_encoding, table_opt);
}
break;
} break;
default: {
assert(!"Do not recognize the index type ");
}
break;
} break;
}
return result;
}
@@ -95,7 +92,8 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
assert(sub_index_builder_ == nullptr);
sub_index_builder_ = new ShortenedIndexBuilder(
comparator_, table_opt_.index_block_restart_interval,
table_opt_.format_version, use_value_delta_encoding_);
table_opt_.format_version, use_value_delta_encoding_,
table_opt_.index_shortening);
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation,
// Note: this is sub-optimal since sub_index_builder_ could later reset

View File

@@ -119,17 +119,19 @@ class IndexBuilder {
// substitute key that serves the same function.
class ShortenedIndexBuilder : public IndexBuilder {
public:
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
const int index_block_restart_interval,
const uint32_t format_version,
const bool use_value_delta_encoding)
explicit ShortenedIndexBuilder(
const InternalKeyComparator* comparator,
const int index_block_restart_interval, const uint32_t format_version,
const bool use_value_delta_encoding,
BlockBasedTableOptions::IndexShorteningMode shortening_mode)
: IndexBuilder(comparator),
index_block_builder_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding),
index_block_builder_without_seq_(index_block_restart_interval,
true /*use_delta_encoding*/,
use_value_delta_encoding) {
use_value_delta_encoding),
shortening_mode_(shortening_mode) {
// Making the default true will disable the feature for old versions
seperator_is_key_plus_seq_ = (format_version <= 2);
}
@@ -138,8 +140,11 @@ class ShortenedIndexBuilder : public IndexBuilder {
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
if (shortening_mode_ !=
BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
}
if (!seperator_is_key_plus_seq_ &&
comparator_->user_comparator()->Compare(
ExtractUserKey(*last_key_in_current_block),
@@ -147,7 +152,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
seperator_is_key_plus_seq_ = true;
}
} else {
comparator_->FindShortSuccessor(last_key_in_current_block);
if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
kShortenSeparatorsAndSuccessor) {
comparator_->FindShortSuccessor(last_key_in_current_block);
}
}
auto sep = Slice(*last_key_in_current_block);
@@ -193,6 +201,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
BlockBuilder index_block_builder_;
BlockBuilder index_block_builder_without_seq_;
bool seperator_is_key_plus_seq_;
BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
BlockHandle last_encoded_handle_;
};
@@ -225,13 +234,16 @@ class ShortenedIndexBuilder : public IndexBuilder {
// data copy or small heap allocations for prefixes.
class HashIndexBuilder : public IndexBuilder {
public:
explicit HashIndexBuilder(const InternalKeyComparator* comparator,
const SliceTransform* hash_key_extractor,
int index_block_restart_interval,
int format_version, bool use_value_delta_encoding)
explicit HashIndexBuilder(
const InternalKeyComparator* comparator,
const SliceTransform* hash_key_extractor,
int index_block_restart_interval, int format_version,
bool use_value_delta_encoding,
BlockBasedTableOptions::IndexShorteningMode shortening_mode)
: IndexBuilder(comparator),
primary_index_builder_(comparator, index_block_restart_interval,
format_version, use_value_delta_encoding),
format_version, use_value_delta_encoding,
shortening_mode),
hash_key_extractor_(hash_key_extractor) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
@@ -389,7 +401,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
std::unique_ptr<ShortenedIndexBuilder> value;
};
std::list<Entry> entries_; // list of partitioned indexes and their keys
BlockBuilder index_block_builder_; // top-level index builder
BlockBuilder index_block_builder_; // top-level index builder
BlockBuilder index_block_builder_without_seq_; // same for user keys
// the active partition index builder
ShortenedIndexBuilder* sub_index_builder_;