Prefix scan: db_bench and bug fixes

Summary: If use_prefix_filters is set and read_range>1, then the random seeks will set a the prefix filter to be the prefix of the key which was randomly selected as the target. Still need to add statistics (perhaps in a separate diff). Test Plan: ./db_bench --benchmarks=fillseq,prefixscanrandom --num=10000000 --statistics=1 --use_prefix_blooms=1 --use_prefix_api=1 --bloom_bits=10 Reviewers: dhruba Reviewed By: dhruba CC: leveldb, haobo Differential Revision: https://reviews.facebook.net/D12273
2013-08-22 16:06:50 -07:00 · 2013-08-22 16:06:50 -07:00 · c2bd8f4824
parent 60bf2b7d4a
commit c2bd8f4824
10 changed files with 164 additions and 16 deletions
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -41,8 +41,9 @@
 //      readrandom    -- read N times in random order
 //      readmissing   -- read N missing keys in random order
 //      readhot       -- read N times in random order from 1% section of DB
-//      readwhilewriting -- 1 writer, N threads doing random reads
-//      readrandomwriterandom - N threads doing random-read, random-write
+//      readwhilewriting      -- 1 writer, N threads doing random reads
+//      readrandomwriterandom -- N threads doing random-read, random-write
+//      prefixscanrandom      -- prefix scan N times in random order
 //      updaterandom  -- N threads doing read-modify-write for random keys
 //      appendrandom  -- N threads doing read-modify-write with growing values
 //      mergerandom   -- same as updaterandom/appendrandom using merge operator
@ -95,6 +96,13 @@ static long FLAGS_reads = -1;
 // When ==1 reads use ::Get, when >1 reads use an iterator
 static long FLAGS_read_range = 1;

+// Whether to place prefixes in blooms
+static bool FLAGS_use_prefix_blooms = false;
+
+// Whether to set ReadOptions.prefix for prefixscanrandom.  If this
+// true, use_prefix_blooms must also be true.
+static bool FLAGS_use_prefix_api = false;
+
 // Seed base for random number generators. When 0 it is deterministic.
 static long FLAGS_seed = 0;

@ -631,6 +639,7 @@ class Benchmark {
 private:
  shared_ptr<Cache> cache_;
  const FilterPolicy* filter_policy_;
+  const SliceTransform* prefix_extractor_;
  DB* db_;
  long num_;
  int value_size_;
@ -773,6 +782,7 @@ class Benchmark {
    filter_policy_(FLAGS_bloom_bits >= 0
                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                   : nullptr),
+    prefix_extractor_(NewFixedPrefixTransform(FLAGS_key_size-1)),
    db_(nullptr),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
@ -799,6 +809,7 @@ class Benchmark {
  ~Benchmark() {
    delete db_;
    delete filter_policy_;
+    delete prefix_extractor_;
  }

  //this function will construct string format for key. e.g "%016d"
@ -894,6 +905,8 @@ class Benchmark {
      } else if (name == Slice("readrandomsmall")) {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("prefixscanrandom")) {
+        method = &Benchmark::PrefixScanRandom;
      } else if (name == Slice("deleteseq")) {
        method = &Benchmark::DeleteSeq;
      } else if (name == Slice("deleterandom")) {
@ -1146,6 +1159,8 @@ class Benchmark {
      FLAGS_compaction_universal_min_merge_width;
    options.block_size = FLAGS_block_size;
    options.filter_policy = filter_policy_;
+    options.prefix_extractor = FLAGS_use_prefix_blooms ? prefix_extractor_
+                                                       : nullptr;
    options.max_open_files = FLAGS_open_files;
    options.statistics = dbstats;
    options.env = FLAGS_env;
@ -1467,6 +1482,41 @@ class Benchmark {
    thread->stats.AddMessage(msg);
  }

+  void PrefixScanRandom(ThreadState* thread) {
+    if (FLAGS_use_prefix_api) {
+      assert(FLAGS_use_prefix_blooms);
+      assert(FLAGS_bloom_bits >= 1);
+    }
+
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Duration duration(FLAGS_duration, reads_);
+
+    long found = 0;
+
+    while (!duration.Done(1)) {
+      std::string value;
+      const int k = thread->rand.Next() % FLAGS_num;
+      unique_ptr<char []> key = GenerateKeyFromInt(k);
+      Slice skey(key.get());
+      Slice prefix = prefix_extractor_->Transform(skey);
+      options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
+
+      Iterator* iter = db_->NewIterator(options);
+      for (iter->Seek(skey);
+           iter->Valid() && iter->key().starts_with(prefix);
+           iter->Next()) {
+        found++;
+      }
+      delete iter;
+
+      thread->stats.FinishedSingleOp(db_);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%ld of %ld found)", found, reads_);
+    thread->stats.AddMessage(msg);
+  }
+
  void ReadMissing(ThreadState* thread) {
    FLAGS_warn_missing_keys = false;    // Never warn about missing keys

@ -2170,6 +2220,13 @@ int main(int argc, char** argv) {
      FLAGS_reads = n;
    } else if (sscanf(argv[i], "--read_range=%d%c", &n, &junk) == 1) {
      FLAGS_read_range = n;
+
+    } else if (sscanf(argv[i], "--use_prefix_blooms=%d%c", &n, &junk) == 1 &&
+               (n == 0 || n == 1)) {
+      FLAGS_use_prefix_blooms = n;
+    } else if (sscanf(argv[i], "--use_prefix_api=%d%c", &n, &junk) == 1 &&
+               (n == 0 || n == 1)) {
+      FLAGS_use_prefix_api = n;
    } else if (sscanf(argv[i], "--duration=%d%c", &n, &junk) == 1) {
      FLAGS_duration = n;
    } else if (sscanf(argv[i], "--seed=%ld%c", &l, &junk) == 1) {
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -135,6 +135,24 @@ Status TableCache::Get(const ReadOptions& options,
  return s;
 }

+bool TableCache::PrefixMayMatch(const ReadOptions& options,
+                                uint64_t file_number,
+                                uint64_t file_size,
+                                const Slice& internal_prefix,
+                                bool* table_io) {
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(storage_options_, file_number,
+                       file_size, &handle, table_io);
+  bool may_match = true;
+  if (s.ok()) {
+    Table* t =
+      reinterpret_cast<Table*>(cache_->Value(handle));
+    may_match = t->PrefixMayMatch(internal_prefix);
+    cache_->Release(handle);
+  }
+  return may_match;
+}
+
 void TableCache::Evict(uint64_t file_number) {
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -52,6 +52,12 @@ class TableCache {
             void (*mark_key_may_exist)(void*) = nullptr,
             const bool no_io = false);

+  // Determine whether the table may contain the specified prefix.  If
+  // the table index of blooms are not in memory, this may cause an I/O
+  bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
+                      uint64_t file_size, const Slice& internal_prefix,
+                      bool* table_io);
+
  // Evict any entry for the specified file number
  void Evict(uint64_t file_number);

--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -189,7 +189,14 @@ static Iterator* GetFileIterator(void* arg,
    return NewErrorIterator(
        Status::Corruption("FileReader invoked with unexpected value"));
  } else {
-    return cache->NewIterator(options,
+    ReadOptions options_copy;
+    if (options.prefix) {
+      // suppress prefix filtering since we have already checked the
+      // filters once at this point
+      options_copy = options;
+      options_copy.prefix = nullptr;
+    }
+    return cache->NewIterator(options.prefix ? options_copy : options,
                              soptions,
                              DecodeFixed64(file_value.data()),
                              DecodeFixed64(file_value.data() + 8),
@ -198,12 +205,45 @@ static Iterator* GetFileIterator(void* arg,
  }
 }

+bool Version::PrefixMayMatch(const ReadOptions& options,
+                             const EnvOptions& soptions,
+                             const Slice& internal_prefix,
+                             Iterator* level_iter) const {
+  bool may_match = true;
+  level_iter->Seek(internal_prefix);
+  if (!level_iter->Valid()) {
+    // we're past end of level
+    may_match = false;
+  } else if (ExtractUserKey(level_iter->key()).starts_with(
+                                             ExtractUserKey(internal_prefix))) {
+    // TODO(tylerharter): do we need this case?  Or are we guaranteed
+    // key() will always be the biggest value for this SST?
+    may_match = true;
+  } else {
+    may_match = vset_->table_cache_->PrefixMayMatch(
+                           options,
+                           DecodeFixed64(level_iter->value().data()),
+                           DecodeFixed64(level_iter->value().data() + 8),
+                           internal_prefix, nullptr);
+  }
+  return may_match;
+}
+
 Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
                                            const EnvOptions& soptions,
                                            int level) const {
-  return NewTwoLevelIterator(
-      new LevelFileNumIterator(vset_->icmp_, &files_[level]),
-      &GetFileIterator, vset_->table_cache_, options, soptions);
+  Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]);
+  if (options.prefix) {
+    InternalKey internal_prefix(*options.prefix, 0, kTypeValue);
+    if (!PrefixMayMatch(options, soptions,
+                        internal_prefix.Encode(), level_iter)) {
+      delete level_iter;
+      // nothing in this level can match the prefix
+      return NewEmptyIterator();
+    }
+  }
+  return NewTwoLevelIterator(level_iter, &GetFileIterator,
+                             vset_->table_cache_, options, soptions);
 }

 void Version::AddIterators(const ReadOptions& options,
--- a/db/version_set.h
+++ b/db/version_set.h
@ -152,6 +152,8 @@ class Version {
  Iterator* NewConcatenatingIterator(const ReadOptions&,
                                     const EnvOptions& soptions,
                                     int level) const;
+  bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
+                      const Slice& internal_prefix, Iterator* level_iter) const;

  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
--- a/include/leveldb/statistics.h
+++ b/include/leveldb/statistics.h
@ -62,13 +62,21 @@ enum Tickers {
  NUMBER_MERGE_FAILURES = 22,
  SEQUENCE_NUMBER = 23,

-  TICKER_ENUM_MAX = 24
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED = 24,
+  BLOOM_FILTER_PREFIX_USEFUL = 25,
+
+  TICKER_ENUM_MAX = 26
 };

 const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
  { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
  { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
  { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
+  { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
  { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
  { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
  { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
--- a/include/leveldb/status.h
+++ b/include/leveldb/status.h
@ -91,7 +91,7 @@ class Status {
    kNotSupported = 3,
    kInvalidArgument = 4,
    kIOError = 5,
-    kMergeInProgress = 6
+    kMergeInProgress = 6,
  };

  Code code() const {
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@ -45,6 +45,7 @@ bool FilterBlockBuilder::SamePrefix(const Slice &key1,
 }

 void FilterBlockBuilder::AddKey(const Slice& key) {
+  // get slice for most recently added entry
  Slice prev;
  if (start_.size() > 0) {
    size_t prev_start = start_[start_.size() - 1];
@ -53,17 +54,21 @@ void FilterBlockBuilder::AddKey(const Slice& key) {
    prev = Slice(base, length);
  }

+  // add key to filter if needed
  if (whole_key_filtering_) {
    start_.push_back(entries_.size());
    entries_.append(key.data(), key.size());
  }

-  if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
+  // add prefix to filter if needed
+  Slice user_key = ExtractUserKey(key);
+  if (prefix_extractor_ && prefix_extractor_->InDomain(user_key)) {
    // this assumes prefix(prefix(key)) == prefix(key), as the last
    // entry in entries_ may be either a key or prefix, and we use
    // prefix(last entry) to get the prefix of the last key.
-    if (prev.size() == 0 || ! SamePrefix(key, prev)) {
-      Slice prefix = prefix_extractor_->Transform(key);
+    if (prev.size() == 0 ||
+        !SamePrefix(user_key, ExtractUserKey(prev))) {
+      Slice prefix = prefix_extractor_->Transform(user_key);
      InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
      Slice internal_prefix = internal_prefix_tmp.Encode();
      assert(comparator_->Compare(internal_prefix, key) <= 0);
--- a/table/table.cc
+++ b/table/table.cc
@ -328,6 +328,11 @@ Iterator* Table::BlockReader(void* arg,
 // 1) key.starts_with(prefix(key))
 // 2) Compare(prefix(key), key) <= 0.
 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// TODO(tylerharter): right now, this won't cause I/O since blooms are
+// in memory.  When blooms may need to be paged in, we should refactor so that
+// this is only ever called lazily.  In particular, this shouldn't be called
+// while the DB lock is held like it is now.
 bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
  FilterBlockReader* filter = rep_->filter;
  bool may_match = true;
@ -337,12 +342,14 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
    return true;
  }

-  Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
+  std::unique_ptr<Iterator> iiter(rep_->index_block->NewIterator(
+                                           rep_->options.comparator));
  iiter->Seek(internal_prefix);
-  if (! iiter->Valid()) {
+  if (!iiter->Valid()) {
    // we're past end of file
    may_match = false;
-  } else if (iiter->key().starts_with(internal_prefix)) {
+  } else if (ExtractUserKey(iiter->key()).starts_with(
+                                             ExtractUserKey(internal_prefix))) {
    // we need to check for this subtle case because our only
    // guarantee is that "the key is a string >= last key in that data
    // block" according to the doc/table_format.txt spec.
@ -366,7 +373,12 @@ bool Table::PrefixMayMatch(const Slice& internal_prefix) const {
    assert(s.ok());
    may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix);
  }
-  delete iiter;
+
+  RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_CHECKED);
+  if (!may_match) {
+    RecordTick(rep_->options.statistics, BLOOM_FILTER_PREFIX_USEFUL);
+  }
+
  return may_match;
 }

--- a/table/table.h
+++ b/table/table.h
@ -47,7 +47,7 @@ class Table {

  ~Table();

-  bool PrefixMayMatch(const Slice& prefix) const;
+  bool PrefixMayMatch(const Slice& internal_prefix) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must