Mirror of https://github.com/facebook/rocksdb.git (synced 2024-11-25 22:44:05 +00:00)
Enable hash index for block-based table

Summary: Based on previous patches, this diff eventually provides the end-to-end mechanism for users to specify the hash index.

Test Plan: Wrote several new unit tests.

Reviewers: sdong, haobo, dhruba

Reviewed By: sdong

CC: leveldb

Differential Revision: https://reviews.facebook.net/D16539

This commit is contained in:
parent 7a92537fc4
commit 75b59d5146
@@ -266,6 +266,8 @@ class DBTest {
   // Sequence of option configurations to try
   enum OptionConfig {
     kDefault,
+    kBlockBasedTableWithPrefixHashIndex,
+    kBlockBasedTableWithWholeKeyHashIndex,
     kPlainTableFirstBytePrefix,
     kPlainTableAllBytesPrefix,
     kVectorRep,

@@ -303,7 +305,8 @@ class DBTest {
     kSkipDeletesFilterFirst = 1,
     kSkipUniversalCompaction = 2,
     kSkipMergePut = 4,
-    kSkipPlainTable = 8
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16
   };

   DBTest() : option_config_(kDefault),

@@ -343,6 +346,12 @@ class DBTest {
               || option_config_ == kPlainTableFirstBytePrefix)) {
         continue;
       }
+      if ((skip_mask & kSkipPlainTable) &&
+          (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
+           option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
+        continue;
+      }
+
       break;
     }

@@ -439,6 +448,20 @@ class DBTest {
       case kInfiniteMaxOpenFiles:
        options.max_open_files = -1;
        break;
+      case kBlockBasedTableWithPrefixHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        break;
+      }
+      case kBlockBasedTableWithWholeKeyHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewNoopTransform());
+        break;
+      }
       default:
        break;
     }

@@ -1363,7 +1386,7 @@ TEST(DBTest, KeyMayExist) {

     // KeyMayExist function only checks data in block caches, which is not used
     // by plain table format.
-  } while (ChangeOptions(kSkipPlainTable));
+  } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
 }

 TEST(DBTest, NonBlockingIteration) {

@@ -6184,7 +6207,9 @@ TEST(DBTest, Randomized) {
     int minimum = 0;
     if (option_config_ == kHashSkipList ||
         option_config_ == kHashLinkList ||
-        option_config_ == kPlainTableFirstBytePrefix) {
+        option_config_ == kPlainTableFirstBytePrefix ||
+        option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+        option_config_ == kBlockBasedTableWithPrefixHashIndex) {
       minimum = 1;
     }
     if (p < 45) {  // Put

@@ -6224,8 +6249,15 @@ TEST(DBTest, Randomized) {
     }

     if ((step % 100) == 0) {
-      ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-      ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+      // For DB instances that use the hash index + block-based table, the
+      // iterator will be invalid right when seeking a non-existent key,
+      // rather than return a key that is close to it.
+      if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+          option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+      }

       // Save a snapshot from each DB this time that we'll use next
       // time we compare things, to make sure the current state is
       // preserved with the snapshot
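A note on the guard added to the Randomized test: with the hash index, Seek() on a key whose prefix has no entries leaves the iterator invalid instead of positioning it on the next larger key, so callers have to check Valid() rather than assume a nearby key was returned. Below is a minimal sketch of that calling pattern; it assumes a DB already opened with kHashSearch and a prefix extractor (as in the new option configurations above), and the function name SeekWithHashIndex is illustrative only.

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

// Sketch only: "db" is assumed to use kHashSearch plus a prefix extractor;
// depending on the version, ReadOptions::prefix_seek may also need to be set
// for the hash lookup to be taken.
void SeekWithHashIndex(rocksdb::DB* db, const rocksdb::Slice& target) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  it->Seek(target);
  if (!it->Valid()) {
    // No key with target's prefix exists. A binary-search index would have
    // positioned the iterator on the smallest key >= target instead.
    return;
  }
  // it->key() / it->value() are safe to use here.
}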
@@ -13,6 +13,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "rocksdb/types.h"
 #include "util/coding.h"

@@ -304,4 +305,34 @@ class IterKey {
   void operator=(const IterKey&) = delete;
 };

+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
 }  // namespace rocksdb
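The InternalKeySliceTransform added above exists so that table-level code can hand internal keys (user key plus 8-byte sequence/type trailer) to a user-supplied prefix extractor; ExtractUserKey() strips the trailer before delegating. A small sketch of that delegation follows, assuming the internal db/dbformat.h header is available; the helper PrefixOfInternalKey is hypothetical and not part of the diff.

#include "db/dbformat.h"
#include "rocksdb/slice_transform.h"

// Hypothetical helper: compute the prefix of an internal key with a
// user-level prefix extractor, by way of InternalKeySliceTransform.
rocksdb::Slice PrefixOfInternalKey(const rocksdb::Slice& internal_key,
                                   const rocksdb::SliceTransform* user_xform) {
  rocksdb::InternalKeySliceTransform wrapped(user_xform);
  // Equivalent to user_xform->Transform(ExtractUserKey(internal_key)).
  return wrapped.Transform(internal_key);
}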
@@ -60,6 +60,12 @@ struct BlockBasedTableOptions {
     // A space efficient index block that is optimized for
     // binary-search-based index.
     kBinarySearch,
+
+    // The hash index, if enabled, will do the hash lookup when
+    // `ReadOptions.prefix_seek == true`. Users should also specify
+    // `Options.prefix_extractor` to allow the index block to correctly
+    // extract the prefix of the given key and perform hash table lookup.
+    kHashSearch,
   };

   IndexType index_type = kBinarySearch;
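For reference, a minimal end-to-end sketch of turning the new index type on from user code, mirroring the option wiring used in the new DBTest configurations; the database path and the 1-byte prefix length are illustrative, the rest is the public API exercised by this diff.

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Enable the hash index on the block-based table format.
  rocksdb::BlockBasedTableOptions table_options;
  table_options.index_type = rocksdb::BlockBasedTableOptions::kHashSearch;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  // The hash index needs a prefix extractor so the index block can group
  // entries by prefix (1-byte fixed prefix here, as in the test config).
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(1));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/hash_index_demo", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
    delete db;
  }
  return s.ok() ? 0 : 1;
}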
table/block.cc
@@ -11,16 +11,20 @@

 #include "table/block.h"

-#include <vector>
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>

 #include "rocksdb/comparator.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"

 namespace rocksdb {

-inline uint32_t Block::NumRestarts() const {
+uint32_t Block::NumRestarts() const {
   assert(size_ >= 2*sizeof(uint32_t));
   return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }

@@ -92,6 +96,7 @@ class Block::Iter : public Iterator {
   std::string key_;
   Slice value_;
   Status status_;
+  BlockHashIndex* hash_index_;

   inline int Compare(const Slice& a, const Slice& b) const {
     return comparator_->Compare(a, b);

@@ -118,16 +123,15 @@ class Block::Iter : public Iterator {
   }

  public:
-  Iter(const Comparator* comparator,
-       const char* data,
-       uint32_t restarts,
-       uint32_t num_restarts)
+  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index)
       : comparator_(comparator),
         data_(data),
         restarts_(restarts),
         num_restarts_(num_restarts),
         current_(restarts_),
-        restart_index_(num_restarts_) {
+        restart_index_(num_restarts_),
+        hash_index_(hash_index) {
     assert(num_restarts_ > 0);
   }

@@ -169,45 +173,22 @@ class Block::Iter : public Iterator {
   }

   virtual void Seek(const Slice& target) {
-    // Binary search in restart array to find the first restart point
-    // with a key >= target
-    uint32_t left = 0;
-    uint32_t right = num_restarts_ - 1;
-    while (left < right) {
-      uint32_t mid = (left + right + 1) / 2;
-      uint32_t region_offset = GetRestartPoint(mid);
-      uint32_t shared, non_shared, value_length;
-      const char* key_ptr = DecodeEntry(data_ + region_offset,
-                                        data_ + restarts_,
-                                        &shared, &non_shared, &value_length);
-      if (key_ptr == nullptr || (shared != 0)) {
-        CorruptionError();
-        return;
-      }
-      Slice mid_key(key_ptr, non_shared);
-      if (Compare(mid_key, target) < 0) {
-        // Key at "mid" is smaller than "target". Therefore all
-        // blocks before "mid" are uninteresting.
-        left = mid;
-      } else {
-        // Key at "mid" is >= "target". Therefore all blocks at or
-        // after "mid" are uninteresting.
-        right = mid - 1;
-      }
-    }
+    uint32_t index = 0;
+    bool ok = hash_index_ ? HashSeek(target, &index)
+                          : BinarySeek(target, 0, num_restarts_ - 1, &index);

+    if (!ok) {
+      return;
+    }
+    SeekToRestartPoint(index);
     // Linear search (within restart block) for first key >= target
-    SeekToRestartPoint(left);
+
     while (true) {
-      if (!ParseNextKey()) {
-        return;
-      }
-      if (Compare(key_, target) >= 0) {
+      if (!ParseNextKey() || Compare(key_, target) >= 0) {
         return;
       }
     }
   }

   virtual void SeekToFirst() {
     SeekToRestartPoint(0);
     ParseNextKey();

@@ -257,6 +238,53 @@ class Block::Iter : public Iterator {
       return true;
     }
   }
+
+  // Binary search in restart array to find the first restart point
+  // with a key >= target
+  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+                  uint32_t* index) {
+    assert(left <= right);
+
+    while (left < right) {
+      uint32_t mid = (left + right + 1) / 2;
+      uint32_t region_offset = GetRestartPoint(mid);
+      uint32_t shared, non_shared, value_length;
+      const char* key_ptr =
+          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
+                      &non_shared, &value_length);
+      if (key_ptr == nullptr || (shared != 0)) {
+        CorruptionError();
+        return false;
+      }
+      Slice mid_key(key_ptr, non_shared);
+      if (Compare(mid_key, target) < 0) {
+        // Key at "mid" is smaller than "target". Therefore all
+        // blocks before "mid" are uninteresting.
+        left = mid;
+      } else {
+        // Key at "mid" is >= "target". Therefore all blocks at or
+        // after "mid" are uninteresting.
+        right = mid - 1;
+      }
+    }
+
+    *index = left;
+    return true;
+  }
+
+  bool HashSeek(const Slice& target, uint32_t* index) {
+    assert(hash_index_);
+    auto restart_index = hash_index_->GetRestartIndex(target);
+    if (restart_index == nullptr) {
+      current_ = restarts_;
+      return 0;
+    }
+
+    // the elements in restart_array[index : index + num_blocks]
+    // are all with same prefix. We'll do binary search in that small range.
+    auto left = restart_index->first_index;
+    auto right = restart_index->first_index + restart_index->num_blocks - 1;
+    return BinarySeek(target, left, right, index);
+  }
 };

 Iterator* Block::NewIterator(const Comparator* cmp) {

@@ -267,8 +295,13 @@ Iterator* Block::NewIterator(const Comparator* cmp) {
   if (num_restarts == 0) {
     return NewEmptyIterator();
   } else {
-    return new Iter(cmp, data_, restart_offset_, num_restarts);
+    return new Iter(cmp, data_, restart_offset_, num_restarts,
+                    hash_index_.get());
   }
 }

+void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
+  hash_index_.reset(hash_index);
+}
+
 }  // namespace rocksdb
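To summarize the new seek path: HashSeek() maps the target's prefix to the contiguous run of restart points whose keys share that prefix, and BinarySeek() then searches only inside that run before the usual linear scan within the restart interval. The following toy model (not RocksDB code; every name in it is illustrative) shows the same narrowing over a plain sorted vector of restart-point keys.

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Toy model: a prefix maps to a contiguous range of restart points, and the
// binary search runs only inside that range instead of over all of them.
struct RestartRange {
  uint32_t first_index;
  uint32_t num_blocks;
};

uint32_t SeekRestartPoint(
    const std::vector<std::string>& restart_keys,  // sorted ascending
    const std::unordered_map<std::string, RestartRange>& hash_index,
    const std::string& prefix, const std::string& target) {
  assert(!restart_keys.empty());
  uint32_t left = 0;
  uint32_t right = static_cast<uint32_t>(restart_keys.size()) - 1;
  auto it = hash_index.find(prefix);
  if (it != hash_index.end()) {  // HashSeek(): narrow to the prefix's run
    left = it->second.first_index;
    right = it->second.first_index + it->second.num_blocks - 1;
  }
  while (left < right) {         // BinarySeek(): last restart key < target
    uint32_t mid = (left + right + 1) / 2;
    if (restart_keys[mid] < target) {
      left = mid;                // keys before mid are uninteresting
    } else {
      right = mid - 1;           // keys at or after mid are uninteresting
    }
  }
  return left;  // caller scans linearly from this restart point
}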
@@ -10,6 +10,7 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>

 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"

@@ -17,6 +18,7 @@ namespace rocksdb {

 struct BlockContents;
 class Comparator;
+class BlockHashIndex;

 class Block {
  public:

@@ -26,20 +28,28 @@ class Block {
   ~Block();

   size_t size() const { return size_; }
-  bool cachable() const { return cachable_; }
+  const char* data() const { return data_; }
+  bool cachable() const { return cachable_; }
+  uint32_t NumRestarts() const;
   CompressionType compression_type() const { return compression_type_; }

+  // If hash index lookup is enabled and `use_hash_index` is true, this block
+  // will do hash lookup for the key prefix.
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
   Iterator* NewIterator(const Comparator* comparator);
-  const char* data() { return data_; }
+  void SetBlockHashIndex(BlockHashIndex* hash_index);

  private:
-  uint32_t NumRestarts() const;
-
   const char* data_;
   size_t size_;
   uint32_t restart_offset_;     // Offset in data_ of restart array
   bool owned_;                  // Block owns data_[]
   bool cachable_;
   CompressionType compression_type_;
+  std::unique_ptr<BlockHashIndex> hash_index_;

   // No copying allowed
   Block(const Block&);
@@ -97,9 +97,9 @@ class IndexBuilder {
 // 2. Shorten the key length for index block. Other than honestly using the
 //    last key in the data block as the index key, we instead find a shortest
 //    substitute key that serves the same function.
-class BinarySearchIndexBuilder : public IndexBuilder {
+class ShortenedIndexBuilder : public IndexBuilder {
  public:
-  explicit BinarySearchIndexBuilder(const Comparator* comparator)
+  explicit ShortenedIndexBuilder(const Comparator* comparator)
       : IndexBuilder(comparator),
         index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}

@@ -128,11 +128,41 @@ class ShortenedIndexBuilder : public IndexBuilder {
   BlockBuilder index_block_builder_;
 };

+// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
+// ShortenedIndexBuilder, but preserves the full key instead of the substitute
+// key, the reason being that the hash index is based on the key "prefix".
+class FullKeyIndexBuilder : public IndexBuilder {
+ public:
+  explicit FullKeyIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle) override {
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+
+  virtual size_t EstimatedSize() const {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
 // Create an index builder based on its type.
 IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
   switch (type) {
     case BlockBasedTableOptions::kBinarySearch: {
-      return new BinarySearchIndexBuilder(comparator);
+      return new ShortenedIndexBuilder(comparator);
     }
+    case BlockBasedTableOptions::kHashSearch: {
+      return new FullKeyIndexBuilder(comparator);
+    }
     default: {
       assert(!"Do not recognize the index type ");
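Why the hash index cannot reuse shortened separators: ShortenedIndexBuilder derives its index keys by shortening (in RocksDB this comes from the comparator's FindShortestSeparator / FindShortSuccessor), and the shortened separator generally carries a prefix that belongs to neither adjacent block, so hashing it would mis-route prefix lookups. A small stand-alone illustration with the bytewise comparator follows; the key values are made up.

#include <iostream>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

int main() {
  // Made-up keys: the last key of one data block and the first key of the
  // next one, with 3-byte prefixes "005" and "007" respectively.
  std::string separator = "0057_payload";
  rocksdb::Slice next_block_first_key = "0075_payload";

  // The bytewise comparator shortens the separator to "006" here: it is a
  // valid separator for binary search, but its 3-byte prefix "006" matches
  // no stored key, so a prefix-hash index built from it would be useless.
  rocksdb::BytewiseComparator()->FindShortestSeparator(&separator,
                                                       next_block_first_key);
  std::cout << "shortened separator: " << separator << std::endl;
  return 0;
}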
@@ -25,6 +25,7 @@

 #include "table/block.h"
 #include "table/filter_block.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"

@@ -180,19 +181,51 @@ class BinarySearchIndexReader : public IndexReader {
   std::unique_ptr<Block> index_block_;
 };

-// TODO(kailiu) This class is only a stub for now. And the comment below is also
-// not completed.
 // Index that leverages an internal hash table to quicken the lookup for a given
 // key.
+// @param data_iter_gen, equivalent to BlockBasedTable::NewIterator(). But that
+// function requires the index to be initialized. To avoid this problem,
+// external callers will pass a function that can create the iterator over the
+// entries without the table being fully initialized.
 class HashIndexReader : public IndexReader {
  public:
   static Status Create(RandomAccessFile* file, const BlockHandle& index_handle,
                        Env* env, const Comparator* comparator,
+                       BlockBasedTable* table,
+                       std::function<Iterator*(Iterator*)> data_iter_gen,
+                       const SliceTransform* prefix_extractor,
                        IndexReader** index_reader) {
-    return Status::NotSupported("not implemented yet!");
+    assert(prefix_extractor);
+    Block* index_block = nullptr;
+    auto s =
+        ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    *index_reader = new HashIndexReader(comparator, index_block);
+    std::unique_ptr<Iterator> index_iter(index_block->NewIterator(nullptr));
+    std::unique_ptr<Iterator> data_iter(
+        data_iter_gen(index_block->NewIterator(nullptr)));
+    auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(),
+                                           index_block->NumRestarts(),
+                                           comparator, prefix_extractor);
+    index_block->SetBlockHashIndex(hash_index);
+    return s;
   }

   virtual Iterator* NewIterator() override {
     return index_block_->NewIterator(comparator_);
   }

   virtual size_t size() const override { return index_block_->size(); }

  private:
   HashIndexReader(const Comparator* comparator, Block* index_block)
       : IndexReader(comparator), index_block_(index_block) {
     assert(index_block_ != nullptr);
   }
   std::unique_ptr<Block> index_block_;
 };
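CreateBlockHashIndex() itself lives in table/block_hash_index.{h,cc}, which this diff does not show, so the sketch below is only a guess at the shape of the structure it builds: for each distinct key prefix, the first index entry carrying that prefix plus the count of consecutive entries sharing it, i.e. the (first_index, num_blocks) pair that Block::Iter::HashSeek() consumes. All names below are illustrative.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hedged sketch of the prefix-to-restart-range mapping; prefixes are assumed
// to arrive in sorted order, so equal prefixes form consecutive runs.
struct RestartIndex {
  uint32_t first_index;
  uint32_t num_blocks;
};

std::unordered_map<std::string, RestartIndex> BuildPrefixMap(
    const std::vector<std::string>& prefix_per_restart) {
  std::unordered_map<std::string, RestartIndex> map;
  for (size_t i = 0; i < prefix_per_restart.size(); ++i) {
    auto it = map.find(prefix_per_restart[i]);
    if (it == map.end()) {
      // First restart point seen with this prefix.
      map[prefix_per_restart[i]] = RestartIndex{static_cast<uint32_t>(i), 1};
    } else {
      // Consecutive restart points sharing the prefix extend the run.
      it->second.num_blocks += 1;
    }
  }
  return map;
}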
@@ -223,6 +256,11 @@ struct BlockBasedTable::Rep {

   std::shared_ptr<const TableProperties> table_properties;
   BlockBasedTableOptions::IndexType index_type;
+  // TODO(kailiu) It is very ugly to use internal key in table, since table
+  // module should not be relying on db module. However to make things easier
+  // and compatible with existing code, we introduce a wrapper that allows
+  // block to extract prefix without knowing if a key is internal or not.
+  unique_ptr<SliceTransform> internal_prefix_transform;
 };

 BlockBasedTable::~BlockBasedTable() {

@@ -747,8 +785,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
   return { filter, cache_handle };
 }

-Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options)
-    const {
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
   // index reader has already been pre-populated.
   if (rep_->index_reader) {
     return rep_->index_reader->NewIterator();

@@ -978,7 +1015,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
 //    3. options
 //    4. internal_comparator
 //    5. index_type
-Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) const {
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
   // Some old version of block-based tables don't have index type present in
   // table properties. If that's the case we can safely use the kBinarySearch.
   auto index_type = BlockBasedTableOptions::kBinarySearch;

@@ -989,11 +1026,30 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
         DecodeFixed32(pos->second.c_str()));
   }

+  auto file = rep_->file.get();
+  const auto& index_handle = rep_->index_handle;
+  auto env = rep_->options.env;
+  auto comparator = &rep_->internal_comparator;
+
   switch (index_type) {
     case BlockBasedTableOptions::kBinarySearch: {
-      return BinarySearchIndexReader::Create(
-          rep_->file.get(), rep_->index_handle, rep_->options.env,
-          &rep_->internal_comparator, index_reader);
+      return BinarySearchIndexReader::Create(file, index_handle, env,
+                                             comparator, index_reader);
     }
+    case BlockBasedTableOptions::kHashSearch: {
+      // We need to wrap data with internal_prefix_transform to make sure it can
+      // handle prefix correctly.
+      rep_->internal_prefix_transform.reset(
+          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+      return HashIndexReader::Create(
+          file, index_handle, env, comparator,
+          [&](Iterator* index_iter) {
+            return NewTwoLevelIterator(
+                index_iter, &BlockBasedTable::DataBlockReader,
+                const_cast<BlockBasedTable*>(this), ReadOptions(),
+                rep_->soptions, rep_->internal_comparator);
+          },
+          rep_->internal_prefix_transform.get(), index_reader);
+    }
     default: {
       std::string error_message =

@@ -131,7 +131,7 @@ class BlockBasedTable : public TableReader {
   //  2. index is not present in block cache.
   //  3. We disallowed any io to be performed, that is, read_options ==
   //     kBlockCacheTier
-  Iterator* NewIndexIterator(const ReadOptions& read_options) const;
+  Iterator* NewIndexIterator(const ReadOptions& read_options);

   // Read block cache from block caches (if set): block_cache and
   // block_cache_compressed.

@@ -164,7 +164,7 @@ class BlockBasedTable : public TableReader {

   void ReadMeta(const Footer& footer);
   void ReadFilter(const Slice& filter_handle_value);
-  Status CreateIndexReader(IndexReader** index_reader) const;
+  Status CreateIndexReader(IndexReader** index_reader);

   // Read the meta block from sst.
   static Status ReadMetaBlock(
@@ -3,7 +3,10 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 //
 #include <stdio.h>
 #include <string>
 #include <vector>

 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"

@@ -11,9 +14,11 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/format.h"
+#include "table/block_hash_index.h"
 #include "util/random.h"
 #include "util/testharness.h"
 #include "util/testutil.h"

@@ -25,6 +30,40 @@ static std::string RandomString(Random* rnd, int len) {
   test::RandomString(rnd, len, &r);
   return r;
 }
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}

 class BlockTest {};

@@ -39,24 +78,11 @@ TEST(BlockTest, SimpleTest) {
   std::vector<std::string> values;
   BlockBuilder builder(options, ic.get());
   int num_records = 100000;
-  char buf[10];
-  char* p = &buf[0];

+  GenerateRandomKVs(&keys, &values, 0, num_records);
   // add a bunch of records to a block
   for (int i = 0; i < num_records; i++) {
-    // generate random kvs
-    sprintf(p, "%6d", i);
-    std::string k(p);
-    std::string v = RandomString(&rnd, 100); // 100 byte values
-
-    // write kvs to the block
-    Slice key(k);
-    Slice value(v);
-    builder.Add(key, value);
-
-    // remember kvs in a lookaside array
-    keys.push_back(k);
-    values.push_back(v);
+    builder.Add(keys[i], values[i]);
   }

   // read serialized contents of the block

@@ -101,6 +127,114 @@ TEST(BlockTest, SimpleTest) {
   delete iter;
 }

+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int prefix_group_size = 1) {
+  builder->reset(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+
+  // Add only half of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  Block reader1(contents);
+  Block reader2(contents);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  {
+    auto iter1 = reader1.NewIterator(nullptr);
+    auto iter2 = reader1.NewIterator(nullptr);
+    reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(),
+                                                   BytewiseComparator(),
+                                                   prefix_extractor.get()));
+
+    delete iter1;
+    delete iter2;
+  }
+
+  std::unique_ptr<Iterator> hash_iter(
+      reader1.NewIterator(BytewiseComparator()));
+
+  std::unique_ptr<Iterator> regular_iter(
+      reader2.NewIterator(BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    hash_iter->Seek(keys[i]);
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    Slice v = hash_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For the hash index, if no key with a given prefix is found, the iterator
+  // will simply be set as invalid; whereas the binary search based iterator
+  // will return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    hash_iter->Seek(key);
+    ASSERT_TRUE(!hash_iter->Valid());
+
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+TEST(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys starting with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with the same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
@@ -1055,6 +1055,116 @@ static std::string RandomString(Random* rnd, int len) {
   return r;
 }

+void AddInternalKey(TableConstructor* c, const std::string prefix,
+                    int suffix_len = 800) {
+  static Random rnd(1023);
+  InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue);
+  c->Add(k.Encode().ToString(), "v");
+}
+
+TEST(TableTest, HashIndexTest) {
+  TableConstructor c(BytewiseComparator());
+
+  // keys with prefix length 3, make sure the key/value is big enough to fill
+  // one block
+  AddInternalKey(&c, "0015");
+  AddInternalKey(&c, "0035");
+
+  AddInternalKey(&c, "0054");
+  AddInternalKey(&c, "0055");
+
+  AddInternalKey(&c, "0056");
+  AddInternalKey(&c, "0057");
+
+  AddInternalKey(&c, "0058");
+  AddInternalKey(&c, "0075");
+
+  AddInternalKey(&c, "0076");
+  AddInternalKey(&c, "0095");
+
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_options;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.block_cache = NewLRUCache(1024);
+  options.block_size = 1700;
+
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  c.Finish(options, *comparator, &keys, &kvmap);
+  auto reader = c.table_reader();
+
+  auto props = c.table_reader()->GetTableProperties();
+  ASSERT_EQ(5u, props->num_data_blocks);
+
+  std::unique_ptr<Iterator> hash_iter(reader->NewIterator(ReadOptions()));
+
+  // -- Find keys that do not exist, but have a common prefix.
+  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
+  std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2],
+                                          keys[7], keys[9], };
+
+  // find the lower bound of the prefix
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    // seek the first element in the block
+    ASSERT_EQ(lower_bound[i], hash_iter->key().ToString());
+    ASSERT_EQ("v", hash_iter->value().ToString());
+  }
+
+  // find the upper bound of prefixes
+  std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7], keys[9], };
+
+  // find existing keys
+  for (const auto& item : kvmap) {
+    auto ukey = ExtractUserKey(item.first).ToString();
+    hash_iter->Seek(ukey);
+
+    // ASSERT_OK(regular_iter->status());
+    ASSERT_OK(hash_iter->status());
+
+    // ASSERT_TRUE(regular_iter->Valid());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    ASSERT_EQ(item.first, hash_iter->key().ToString());
+    ASSERT_EQ(item.second, hash_iter->value().ToString());
+  }
+
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    // the key is greater than any existing keys.
+    auto key = prefixes[i] + "9";
+    hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
+
+    ASSERT_OK(hash_iter->status());
+    if (i == prefixes.size() - 1) {
+      // last key
+      ASSERT_TRUE(!hash_iter->Valid());
+    } else {
+      ASSERT_TRUE(hash_iter->Valid());
+      // seek the first element in the block
+      ASSERT_EQ(upper_bound[i], hash_iter->key().ToString());
+      ASSERT_EQ("v", hash_iter->value().ToString());
+    }
+  }
+
+  // find keys with prefixes that don't match any of the existing prefixes.
+  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
+  for (const auto& prefix : non_exist_prefixes) {
+    hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
+    // regular_iter->Seek(prefix);
+
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(!hash_iter->Valid());
+  }
+}
+
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.