rocksdb/db/forward_iterator_bench.cc

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
#include <cstdio>
int main() {
  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
  return 1;
}
#elif defined(OS_MACOSX) || defined(OS_WIN)
// Block forward_iterator_bench under MAC and Windows
int main() { return 0; }
#else
#include <gflags/gflags.h>
#include <semaphore.h>
#include <atomic>
#include <bitset>
#include <chrono>
#include <climits>
#include <condition_variable>
#include <limits>
#include <mutex>
#include <queue>
#include <random>
#include <thread>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "port/port.h"
#include "util/testharness.h"

const int MAX_SHARDS = 100000;

DEFINE_int32(writers, 8, "");
DEFINE_int32(readers, 8, "");
DEFINE_int64(rate, 100000, "");
DEFINE_int64(value_size, 300, "");
DEFINE_int64(shards, 1000, "");
DEFINE_int64(memtable_size, 500000000, "");
DEFINE_int64(block_cache_size, 300000000, "");
DEFINE_int64(block_size, 65536, "");
DEFINE_double(runtime, 300.0, "");
DEFINE_bool(cache_only_first, true, "");
DEFINE_bool(iterate_upper_bound, true, "");

struct Stats {
  char pad1[128] __attribute__((__unused__));
  std::atomic<uint64_t> written{0};
  char pad2[128] __attribute__((__unused__));
  std::atomic<uint64_t> read{0};
  std::atomic<uint64_t> cache_misses{0};
  char pad3[128] __attribute__((__unused__));
} stats;

struct Key {
  Key() {}
  Key(uint64_t shard_in, uint64_t seqno_in)
      : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}

  uint64_t shard() const { return be64toh(shard_be); }
  uint64_t seqno() const { return be64toh(seqno_be); }

 private:
  uint64_t shard_be;
  uint64_t seqno_be;
} __attribute__((__packed__));

struct Reader;
struct Writer;

struct ShardState {
  char pad1[128] __attribute__((__unused__));
  std::atomic<uint64_t> last_written{0};
  Writer* writer;
  Reader* reader;
  char pad2[128] __attribute__((__unused__));
  std::atomic<uint64_t> last_read{0};
  std::unique_ptr<rocksdb::Iterator> it;
  std::unique_ptr<rocksdb::Iterator> it_cacheonly;
  Key upper_bound;
  rocksdb::Slice upper_bound_slice;
  char pad3[128] __attribute__((__unused__));
};

struct Reader {
 public:
  explicit Reader(std::vector<ShardState>* shard_states, rocksdb::DB* db)
      : shard_states_(shard_states), db_(db) {
    sem_init(&sem_, 0, 0);
    thread_ = port::Thread(&Reader::run, this);
  }

  void run() {
    while (1) {
      sem_wait(&sem_);
      if (done_.load()) {
        break;
      }

      uint64_t shard;
      {
        std::lock_guard<std::mutex> guard(queue_mutex_);
        assert(!shards_pending_queue_.empty());
        shard = shards_pending_queue_.front();
        shards_pending_queue_.pop();
        shards_pending_set_.reset(shard);
      }
      readOnceFromShard(shard);
    }
  }

  void readOnceFromShard(uint64_t shard) {
    ShardState& state = (*shard_states_)[shard];
    if (!state.it) {
      // Initialize iterators
      rocksdb::ReadOptions options;
      options.tailing = true;
      if (FLAGS_iterate_upper_bound) {
        state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
        state.upper_bound_slice = rocksdb::Slice(
            (const char*)&state.upper_bound, sizeof(state.upper_bound));
        options.iterate_upper_bound = &state.upper_bound_slice;
      }

      state.it.reset(db_->NewIterator(options));

      if (FLAGS_cache_only_first) {
        options.read_tier = rocksdb::ReadTier::kBlockCacheTier;
        state.it_cacheonly.reset(db_->NewIterator(options));
      }
    }

    const uint64_t upto = state.last_written.load();
    for (rocksdb::Iterator* it : {state.it_cacheonly.get(), state.it.get()}) {
      if (it == nullptr) {
        continue;
      }
      if (state.last_read.load() >= upto) {
        break;
      }
      bool need_seek = true;
      for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
        if (need_seek) {
          Key from(shard, state.last_read.load() + 1);
          it->Seek(rocksdb::Slice((const char*)&from, sizeof(from)));
          need_seek = false;
        } else {
          it->Next();
        }
        if (it->status().IsIncomplete()) {
          ++::stats.cache_misses;
          break;
        }
        assert(it->Valid());
        assert(it->key().size() == sizeof(Key));
        Key key;
        memcpy(&key, it->key().data(), it->key().size());
        // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
        //         shard, seq, key.shard(), key.seqno());
        assert(key.shard() == shard);
        assert(key.seqno() == seq);
        state.last_read.store(seq);
        ++::stats.read;
      }
    }
  }

  void onWrite(uint64_t shard) {
    {
      std::lock_guard<std::mutex> guard(queue_mutex_);
      if (!shards_pending_set_.test(shard)) {
        shards_pending_queue_.push(shard);
        shards_pending_set_.set(shard);
        sem_post(&sem_);
      }
    }
  }

  ~Reader() {
    done_.store(true);
    sem_post(&sem_);
    thread_.join();
  }

 private:
  char pad1[128] __attribute__((__unused__));
  std::vector<ShardState>* shard_states_;
  rocksdb::DB* db_;
  rocksdb::port::Thread thread_;
  sem_t sem_;
  std::mutex queue_mutex_;
  std::bitset<MAX_SHARDS + 1> shards_pending_set_;
  std::queue<uint64_t> shards_pending_queue_;
  std::atomic<bool> done_{false};
  char pad2[128] __attribute__((__unused__));
};

struct Writer {
  explicit Writer(std::vector<ShardState>* shard_states, rocksdb::DB* db)
      : shard_states_(shard_states), db_(db) {}

  void start() { thread_ = port::Thread(&Writer::run, this); }

  void run() {
    std::queue<std::chrono::steady_clock::time_point> workq;
    std::chrono::steady_clock::time_point deadline(
        std::chrono::steady_clock::now() +
        std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
    std::vector<uint64_t> my_shards;
    for (int i = 1; i <= FLAGS_shards; ++i) {
      if ((*shard_states_)[i].writer == this) {
        my_shards.push_back(i);
      }
    }

    std::mt19937 rng{std::random_device()()};
    std::uniform_int_distribution<int> shard_dist(
        0, static_cast<int>(my_shards.size()) - 1);
    std::string value(FLAGS_value_size, '*');

    while (1) {
      auto now = std::chrono::steady_clock::now();
      if (FLAGS_runtime >= 0 && now >= deadline) {
        break;
      }
      if (workq.empty()) {
        for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
          std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
          workq.push(now + offset);
        }
      }
      while (!workq.empty() && workq.front() < now) {
        workq.pop();
        uint64_t shard = my_shards[shard_dist(rng)];
        ShardState& state = (*shard_states_)[shard];
        uint64_t seqno = state.last_written.load() + 1;
        Key key(shard, seqno);
        // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
        rocksdb::Status status =
            db_->Put(rocksdb::WriteOptions(),
                     rocksdb::Slice((const char*)&key, sizeof(key)),
                     rocksdb::Slice(value));
        assert(status.ok());
        state.last_written.store(seqno);
        state.reader->onWrite(shard);
        ++::stats.written;
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    // fprintf(stderr, "Writer done\n");
  }

  ~Writer() { thread_.join(); }

 private:
  char pad1[128] __attribute__((__unused__));
  std::vector<ShardState>* shard_states_;
  rocksdb::DB* db_;
  rocksdb::port::Thread thread_;
  char pad2[128] __attribute__((__unused__));
};

struct StatsThread {
  explicit StatsThread(rocksdb::DB* db)
      : db_(db), thread_(&StatsThread::run, this) {}

  void run() {
    //    using namespace std::chrono;
    auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
    uint64_t wlast = 0, rlast = 0;
    while (!done_.load()) {
      {
        std::unique_lock<std::mutex> lock(cvm_);
        cv_.wait_for(lock, std::chrono::seconds(1));
      }
      auto now = std::chrono::steady_clock::now();
      double elapsed =
          std::chrono::duration_cast<std::chrono::duration<double> >(
              now - tlast).count();
      uint64_t w = ::stats.written.load();
      uint64_t r = ::stats.read.load();
      fprintf(stderr,
              "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
              "r/s %10.0f | cache misses %10ld\n",
              db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
              std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
                  .count(),
              w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
              ::stats.cache_misses.load());
      wlast = w;
      rlast = r;
      tlast = now;
    }
  }

  ~StatsThread() {
    {
      std::lock_guard<std::mutex> guard(cvm_);
      done_.store(true);
    }
    cv_.notify_all();
    thread_.join();
  }

 private:
  rocksdb::DB* db_;
  std::mutex cvm_;
  std::condition_variable cv_;
  rocksdb::port::Thread thread_;
  std::atomic<bool> done_{false};
};

int main(int argc, char** argv) {
  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  std::mt19937 rng{std::random_device()()};
  rocksdb::Status status;
  std::string path = rocksdb::test::TmpDir() + "/forward_iterator_test";
  fprintf(stderr, "db path is %s\n", path.c_str());
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compression = rocksdb::CompressionType::kNoCompression;
  options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
  options.level0_slowdown_writes_trigger = 99999;
  options.level0_stop_writes_trigger = 99999;
  options.use_direct_io_for_flush_and_compaction = true;
  options.write_buffer_size = FLAGS_memtable_size;
  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);
  table_options.block_size = FLAGS_block_size;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  status = rocksdb::DestroyDB(path, options);
  assert(status.ok());
  rocksdb::DB* db_raw;
  status = rocksdb::DB::Open(options, path, &db_raw);
  assert(status.ok());
  std::unique_ptr<rocksdb::DB> db(db_raw);

  std::vector<ShardState> shard_states(FLAGS_shards + 1);
  std::deque<Reader> readers;
  while (static_cast<int>(readers.size()) < FLAGS_readers) {
    readers.emplace_back(&shard_states, db_raw);
  }
  std::deque<Writer> writers;
  while (static_cast<int>(writers.size()) < FLAGS_writers) {
    writers.emplace_back(&shard_states, db_raw);
  }

  // Each shard gets a random reader and random writer assigned to it
  for (int i = 1; i <= FLAGS_shards; ++i) {
    std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
    std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
    shard_states[i].reader = &readers[reader_dist(rng)];
    shard_states[i].writer = &writers[writer_dist(rng)];
  }

  StatsThread stats_thread(db_raw);
  for (Writer& w : writers) {
    w.start();
  }

  writers.clear();
  readers.clear();
}
#endif  // !defined(GFLAGS) || defined(ROCKSDB_LITE)