From 4b1d0492367de4b1a5ca7899e77f94d5c9a656e3 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Tue, 31 Dec 2013 15:14:18 -0800 Subject: [PATCH 01/30] C API: add rocksdb_env_set_high_priority_background_threads --- db/c.cc | 4 ++++ include/rocksdb/c.h | 1 + 2 files changed, 5 insertions(+) diff --git a/db/c.cc b/db/c.cc index 36ee2d486b..68f3613367 100644 --- a/db/c.cc +++ b/db/c.cc @@ -788,6 +788,10 @@ void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a3b18084a8..bd22e191b5 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -311,6 +311,7 @@ extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); extern rocksdb_env_t* rocksdb_create_default_env(); extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern void rocksdb_env_destroy(rocksdb_env_t*); /* Universal Compaction options */ From 50994bf6990eae4155f7541161f5eaf1edb3ef08 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Thu, 19 Dec 2013 10:02:53 -0800 Subject: [PATCH 02/30] Don't always compress L0 files written by memtable flush Summary: Code was always compressing L0 files written by a memtable flush when compression was enabled. Now this is done when min_level_to_compress=0 for leveled compaction and when universal_compaction_size_percent=-1 for universal compaction. Task ID: #3416472 Blame Rev: Test Plan: ran db_bench with compression options Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba, igor, sdong Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14757 --- db/builder.cc | 4 ++-- db/builder.h | 2 +- db/db_impl.cc | 33 ++++++++++++++++++++++++++------- db/db_impl.h | 3 +++ db/repair.cc | 3 ++- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/db/builder.cc b/db/builder.cc index ad1334a158..61671db0d0 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -42,7 +42,7 @@ Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression) { + const CompressionType compression) { Status s; meta->file_size = 0; meta->smallest_seqno = meta->largest_seqno = 0; @@ -65,7 +65,7 @@ Status BuildTable(const std::string& dbname, } TableBuilder* builder = GetTableBuilder(options, file.get(), - options.compression); + compression); // the first key is the smallest key Slice key = iter->key(); diff --git a/db/builder.h b/db/builder.h index 8c525bd050..2600dc24b8 100644 --- a/db/builder.h +++ b/db/builder.h @@ -43,6 +43,6 @@ extern Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression); + const CompressionType compression); } // namespace rocksdb diff --git a/db/db_impl.cc b/db/db_impl.cc index b8ae72fd94..169556e23d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -225,6 +225,28 @@ CompressionType GetCompressionType(const Options& options, 
int level, } } +CompressionType GetCompressionFlush(const Options& options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. + + bool can_compress; + + if (options.compaction_style == kCompactionStyleUniversal) { + can_compress = + (options.compaction_options_universal.compression_size_percent < 0); + } else { + // For leveled compress when min_level_to_compress == 0. + can_compress = (GetCompressionType(options, 0, true) != kNoCompression); + } + + if (can_compress) { + return options.compression; + } else { + return kNoCompression; + } +} + DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), @@ -1068,7 +1090,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, true); + earliest_seqno_in_memtable, + GetCompressionFlush(options_)); LogFlush(options_.info_log); mutex_.Lock(); } @@ -1129,15 +1152,11 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); - // We skip compression if universal compression is used and the size - // threshold is set for compression. - bool enable_compression = (options_.compaction_style - != kCompactionStyleUniversal || - options_.compaction_options_universal.compression_size_percent < 0); + s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, enable_compression); + earliest_seqno_in_memtable, GetCompressionFlush(options_)); LogFlush(options_.info_log); delete iter; Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", diff --git a/db/db_impl.h b/db/db_impl.h index adeb163f06..d33efd19ea 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -589,4 +589,7 @@ extern Options SanitizeOptions(const std::string& db, CompressionType GetCompressionType(const Options& options, int level, const bool enable_compression); +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + } // namespace rocksdb diff --git a/db/repair.cc b/db/repair.cc index fc9ba282d6..6db90c8653 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -225,7 +225,8 @@ class Repairer { Iterator* iter = mem->NewIterator(); status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, iter, &meta, - icmp_.user_comparator(), 0, 0, true); + icmp_.user_comparator(), 0, 0, + kNoCompression); delete iter; delete mem->Unref(); mem = nullptr; From 12b6d2b839bf494de01e2e49089fafac4e0d6f7e Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 8 Jan 2014 15:06:07 -0800 Subject: [PATCH 03/30] Separate the aligned and unaligned memory allocation Summary: Use two vectors for different types of memory allocation. Test Plan: run all unit tests. 
Reviewers: haobo, sdong Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15027 --- include/rocksdb/options.h | 67 +++++++++++++++++--------------- util/arena_impl.cc | 81 +++++++++++++++++++++++---------------- util/arena_impl.h | 69 ++++++++++++++++++--------------- util/arena_test.cc | 34 ++++++++++++++-- 4 files changed, 153 insertions(+), 98 deletions(-) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 85c1db059d..b7eaff37dc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -44,15 +44,15 @@ using std::shared_ptr; enum CompressionType : char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. - kNoCompression = 0x0, + kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2, kBZip2Compression = 0x3 }; enum CompactionStyle : char { - kCompactionStyleLevel = 0x0, // level based compaction style - kCompactionStyleUniversal = 0x1 // Universal compaction style + kCompactionStyleLevel = 0x0, // level based compaction style + kCompactionStyleUniversal = 0x1 // Universal compaction style }; // Compression options for different compression algorithms like Zlib @@ -60,12 +60,9 @@ struct CompressionOptions { int window_bits; int level; int strategy; - CompressionOptions():window_bits(-14), - level(-1), - strategy(0){} - CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits), - level(lev), - strategy(strategy){} + CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} + CompressionOptions(int wbits, int lev, int strategy) + : window_bits(wbits), level(lev), strategy(strategy) {} }; // Options to control the behavior of a database (passed to DB::Open) @@ -216,7 +213,6 @@ struct Options { // Default: 16 int block_restart_interval; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -247,7 +243,7 @@ struct Options { // java/C api hard to construct. std::vector compression_per_level; - //different options for compression algorithms + // different options for compression algorithms CompressionOptions compression_opts; // If non-nullptr, use the specified filter policy to reduce disk reads. @@ -326,7 +322,6 @@ struct Options { // will be 20MB, total file size for level-2 will be 200MB, // and total file size for level-3 will be 2GB. - // by default 'max_bytes_for_level_base' is 10MB. uint64_t max_bytes_for_level_base; // by default 'max_bytes_for_level_base' is 10. @@ -484,10 +479,19 @@ struct Options { // order. int table_cache_remove_scan_count_limit; - // size of one block in arena memory allocation. - // If <= 0, a proper value is automatically calculated (usually 1/10 of + // Size of one block in arena memory allocation. + // + // If <= 0, a proper value is automatically calculated (usually about 1/10 of // writer_buffer_size). // + // There are two additonal restriction of the The specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // // Default: 0 size_t arena_block_size; @@ -572,7 +576,12 @@ struct Options { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. 
// Default: NORMAL - enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start; + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -622,7 +631,7 @@ struct Options { // Default: emtpy vector -- no user-defined statistics collection will be // performed. std::vector> - table_properties_collectors; + table_properties_collectors; // Allows thread-safe inplace updates. Requires Updates iff // * key exists in current memtable @@ -644,7 +653,7 @@ struct Options { // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { - kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage kBlockCacheTier = 0x1 // data in memtable or block cache }; @@ -697,13 +706,14 @@ struct ReadOptions { prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } - ReadOptions(bool cksum, bool cache) : - verify_checksums(cksum), fill_cache(cache), - prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } + read_tier(kReadAllTier) {} + ReadOptions(bool cksum, bool cache) + : verify_checksums(cksum), + fill_cache(cache), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier) {} }; // Options that control write operations @@ -730,10 +740,7 @@ struct WriteOptions { // and the write may got lost after a crash. bool disableWAL; - WriteOptions() - : sync(false), - disableWAL(false) { - } + WriteOptions() : sync(false), disableWAL(false) {} }; // Options that control flush operations @@ -742,9 +749,7 @@ struct FlushOptions { // Default: true bool wait; - FlushOptions() - : wait(true) { - } + FlushOptions() : wait(true) {} }; } // namespace rocksdb diff --git a/util/arena_impl.cc b/util/arena_impl.cc index d5c2a537e2..5125e23641 100644 --- a/util/arena_impl.cc +++ b/util/arena_impl.cc @@ -8,71 +8,86 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "util/arena_impl.h" +#include namespace rocksdb { -ArenaImpl::ArenaImpl(size_t block_size) { - if (block_size < kMinBlockSize) { - block_size_ = kMinBlockSize; - } else if (block_size > kMaxBlockSize) { - block_size_ = kMaxBlockSize; - } else { - block_size_ = block_size; +const size_t ArenaImpl::kMinBlockSize = 4096; +const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(ArenaImpl::kMinBlockSize, block_size); + block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; } - blocks_memory_ = 0; - alloc_ptr_ = nullptr; // First allocation will allocate a block - alloc_bytes_remaining_ = 0; + return block_size; +} + +ArenaImpl::ArenaImpl(size_t block_size) + : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); } ArenaImpl::~ArenaImpl() { - for (size_t i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; + for (const auto& block : blocks_) { + delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes) { - if (bytes > block_size_ / 4) { +char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. - char* result = AllocateNewBlock(bytes); - return result; + return AllocateNewBlock(bytes); } // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(block_size_); - alloc_bytes_remaining_ = block_size_; + auto block_head = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize - bytes; - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + kBlockSize; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + return unaligned_alloc_ptr_; + } } char* ArenaImpl::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size - assert((align & (align-1)) == 0); // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 
0 : kAlignUnit - current_mod); size_t needed = bytes + slop; char* result; if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; alloc_bytes_remaining_ -= needed; } else { // AllocateFallback always returned aligned memory - result = AllocateFallback(bytes); + result = AllocateFallback(bytes, true /* aligned */); } - assert((reinterpret_cast(result) & (align-1)) == 0); + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); return result; } char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; + char* block = new char[block_bytes]; blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; + blocks_.push_back(block); + return block; } } // namespace rocksdb diff --git a/util/arena_impl.h b/util/arena_impl.h index b5a6842472..538385ccc0 100644 --- a/util/arena_impl.h +++ b/util/arena_impl.h @@ -22,49 +22,54 @@ namespace rocksdb { class ArenaImpl : public Arena { public: + // No copying allowed + ArenaImpl(const ArenaImpl&) = delete; + void operator=(const ArenaImpl&) = delete; + + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; + explicit ArenaImpl(size_t block_size = kMinBlockSize); virtual ~ArenaImpl(); - virtual char* Allocate(size_t bytes); + virtual char* Allocate(size_t bytes) override; - virtual char* AllocateAligned(size_t bytes); + virtual char* AllocateAligned(size_t bytes) override; // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user + // by the arena (exclude the space allocated but not yet used for future // allocations). - // - // TODO: Do we need to exclude space allocated but not used? virtual const size_t ApproximateMemoryUsage() { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - + alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() { + virtual const size_t MemoryAllocatedBytes() override { return blocks_memory_; } private: - char* AllocateFallback(size_t bytes); + // Number of bytes allocated in one block + const size_t kBlockSize; + // Array of new[] allocated memory blocks + typedef std::vector Blocks; + Blocks blocks_; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? 
+ size_t alloc_bytes_remaining_ = 0; + + char* AllocateFallback(size_t bytes, bool aligned); char* AllocateNewBlock(size_t block_bytes); - static const size_t kMinBlockSize = 4096; - static const size_t kMaxBlockSize = 2 << 30; - - // Number of bytes allocated in one block - size_t block_size_; - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - - // Array of new[] allocated memory blocks - std::vector blocks_; - // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - ArenaImpl(const ArenaImpl&); - void operator=(const ArenaImpl&); + size_t blocks_memory_ = 0; }; inline char* ArenaImpl::Allocate(size_t bytes) { @@ -73,12 +78,16 @@ inline char* ArenaImpl::Allocate(size_t bytes) { // them for our internal use). assert(bytes > 0); if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; + unaligned_alloc_ptr_ -= bytes; alloc_bytes_remaining_ -= bytes; - return result; + return unaligned_alloc_ptr_; } - return AllocateFallback(bytes); + return AllocateFallback(bytes, false /* unaligned */); } +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + } // namespace rocksdb diff --git a/util/arena_test.cc b/util/arena_test.cc index 12aa7f7fe5..4a3d1bd433 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -57,8 +57,33 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); } +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaImplTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + ArenaImpl arena(kBlockSize); + ASSERT_EQ(0, arena.ApproximateMemoryUsage()); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + TEST(ArenaImplTest, Simple) { - std::vector > allocated; + std::vector> allocated; ArenaImpl arena_impl; const int N = 100000; size_t bytes = 0; @@ -68,8 +93,9 @@ TEST(ArenaImplTest, Simple) { if (i % (N / 10) == 0) { s = i; } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); } if (s == 0) { // Our arena disallows size 0 allocations. 
@@ -89,7 +115,7 @@ TEST(ArenaImplTest, Simple) { bytes += s; allocated.push_back(std::make_pair(s, r)); ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); - if (i > N/10) { + if (i > N / 10) { ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); } } From 5575316350f7627a95e94f97bc40eb3842122c7b Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 7 Jan 2014 14:33:15 -0800 Subject: [PATCH 04/30] StopWatch not to get time if it is created for statistics and it is disabled Summary: Currently, even if statistics is not enabled, StopWatch only for the stats still gets the time of the day, which is wasteful. This patch adds a new option to StopWatch to disable this get in this case. Test Plan: make all check Reviewers: dhruba, haobo, igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14703 --- db/db_impl.cc | 10 +++++----- util/stop_watch.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 169556e23d..e7f2abf99f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2091,11 +2091,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && !options_.disableDataSync) { if (options_.use_fsync) { StopWatch sw(env_, options_.statistics.get(), - COMPACTION_OUTFILE_SYNC_MICROS); + COMPACTION_OUTFILE_SYNC_MICROS, false); s = compact->outfile->Fsync(); } else { StopWatch sw(env_, options_.statistics.get(), - COMPACTION_OUTFILE_SYNC_MICROS); + COMPACTION_OUTFILE_SYNC_MICROS, false); s = compact->outfile->Sync(); } } @@ -2724,7 +2724,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, bool* value_found) { Status s; - StopWatch sw(env_, options_.statistics.get(), DB_GET); + StopWatch sw(env_, options_.statistics.get(), DB_GET, false); SequenceNumber snapshot; if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; @@ -2793,7 +2793,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values) { - StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET); + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); SequenceNumber snapshot; std::vector to_delete; @@ -2944,7 +2944,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.disableWAL = options.disableWAL; w.done = false; - StopWatch sw(env_, options_.statistics.get(), DB_WRITE); + StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false); mutex_.Lock(); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { diff --git a/util/stop_watch.h b/util/stop_watch.h index e36bcb7ec6..6325a74408 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -15,9 +15,10 @@ class StopWatch { explicit StopWatch( Env * const env, Statistics* statistics = nullptr, - const Histograms histogram_name = DB_GET) : + const Histograms histogram_name = DB_GET, + bool auto_start = true) : env_(env), - start_time_(env->NowMicros()), + start_time_((!auto_start && !statistics) ? 0 : env->NowMicros()), statistics_(statistics), histogram_name_(histogram_name) {} From d0406675c2755feacf1475d27eed678925ca215d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 8 Jan 2014 17:44:58 -0800 Subject: [PATCH 05/30] readwhilewriting benchmark Summary: Added readwhilewriting benchmark to our regression tests. Changed block cache shards from 16 to 64, as Mark found that cache mutex contention is a big bottleneck. Test Plan: Ran it. 
Reviewers: dhruba, haobo, MarkCallaghan, xjin Reviewed By: MarkCallaghan CC: leveldb Differential Revision: https://reviews.facebook.net/D15075 --- build_tools/regression_build_test.sh | 60 +++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index b0c130e3cf..d38b67c3ce 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -50,7 +50,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -68,7 +68,7 @@ make release --num=$NUM \ --writes=$((NUM / 10)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -87,7 +87,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -106,7 +106,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -126,7 +126,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=104857600 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -147,7 +147,7 @@ make release --reads=$((NUM / 5)) \ --writes=512 \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -169,7 +169,7 @@ make release --num=$((NUM / 4)) \ --writes=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -179,6 +179,25 @@ make release --sync=0 \ --threads=1 > /dev/null +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > /dev/null + # measure readrandom after load with filluniquerandom with 6GB block cache ./db_bench \ --benchmarks=readrandom \ @@ -188,7 +207,7 @@ make release --num=$((NUM / 4)) \ --reads=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -200,6 +219,28 @@ make release --sync=0 \ --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom +# measure readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 
> ${STAT_FILE}.readwhilewriting + # measure memtable performance -- none of the data gets flushed to disk ./db_bench \ --benchmarks=fillrandom,readrandom, \ @@ -208,7 +249,7 @@ make release --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -264,3 +305,4 @@ send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_m send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting From cb37ddf229d4df2d2bf484c88a8563927d5849f5 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 9 Jan 2014 12:24:28 -0800 Subject: [PATCH 06/30] Feature requests for BackupableDB Summary: This diff introduces some features that were requested by two internal customers: * Ability for backups not to share table files, because we can't guarantee that equal filename means equal content accross replicas * Ability for two threads to call EnableFileDeletions() and DisableFileDeletions() * Ability to stop backup from another thread and not slow down the DB close * Copy the files to the temporary folder first and then atomically rename Test Plan: Added some tests to backupable_db_test Reviewers: dhruba, sanketh, muthu, sdong, haobo Reviewed By: haobo CC: leveldb, sanketh, muthu Differential Revision: https://reviews.facebook.net/D14769 --- include/utilities/backupable_db.h | 24 +++++++- utilities/backupable/backupable_db.cc | 72 +++++++++++++++++----- utilities/backupable/backupable_db_test.cc | 38 +++++++++++- 3 files changed, 112 insertions(+), 22 deletions(-) diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h index 335e028576..fbe2ae8a37 100644 --- a/include/utilities/backupable_db.h +++ b/include/utilities/backupable_db.h @@ -31,6 +31,14 @@ struct BackupableDBOptions { // Default: nullptr Env* backup_env; + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + // Backup info and error messages will be written to info_log // if non-nullptr. // Default: nullptr @@ -49,6 +57,7 @@ struct BackupableDBOptions { explicit BackupableDBOptions(const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, bool _sync = true, bool _destroy_old_data = false) : @@ -93,6 +102,14 @@ class BackupableDB : public StackableDB { Status PurgeOldBackups(uint32_t num_backups_to_keep); // deletes a specific backup Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up + // next time you create BackupableDB or RestoreBackupableDB. 
+ void StopBackup(); private: BackupEngine* backup_engine_; @@ -108,9 +125,10 @@ class RestoreBackupableDB { void GetBackupInfo(std::vector* backup_info); // restore from backup with backup_id - // IMPORTANT -- if you restore from some backup that is not the latest, - // and you start creating new backups from the new DB, all the backups - // that were newer than the backup you restored from will be deleted + // IMPORTANT -- if options_.share_table_files == true and you restore DB + // from some backup that is not the latest, and you start creating new + // backups from the new DB, all the backups that were newer than the + // backup you restored from will be deleted // // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. // If you try creating a new backup now, old backups 4 and 5 will be deleted diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 61e009cd31..7f18d3142f 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -20,6 +20,7 @@ #include #include #include +#include namespace rocksdb { @@ -31,6 +32,9 @@ class BackupEngine { Status CreateNewBackup(DB* db, bool flush_before_backup = false); Status PurgeOldBackups(uint32_t num_backups_to_keep); Status DeleteBackup(BackupID backup_id); + void StopBackup() { + stop_backup_.store(true, std::memory_order_release); + } void GetBackupInfo(std::vector* backup_info); Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir, @@ -106,13 +110,16 @@ class BackupEngine { return "private"; } inline std::string GetPrivateFileRel(BackupID backup_id, - const std::string &file = "") const { + bool tmp = false, + const std::string& file = "") const { assert(file.size() == 0 || file[0] != '/'); - return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file; + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + + (tmp ? ".tmp" : "") + "/" + file; } - inline std::string GetSharedFileRel(const std::string& file = "") const { + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { assert(file.size() == 0 || file[0] != '/'); - return "shared/" + file; + return "shared/" + file + (tmp ? ".tmp" : ""); } inline std::string GetLatestBackupFile(bool tmp = false) const { return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : "")); @@ -151,6 +158,7 @@ class BackupEngine { std::map backups_; std::unordered_map backuped_file_refs_; std::vector obsolete_backups_; + std::atomic stop_backup_; // options data BackupableDBOptions options_; @@ -161,13 +169,17 @@ class BackupEngine { }; BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) - : options_(options), - db_env_(db_env), - backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_) { + : stop_backup_(false), + options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? 
options.backup_env + : db_env_) { // create all the dirs we need backup_env_->CreateDirIfMissing(GetAbsolutePath()); - backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + if (!options_.share_table_files) { + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + } backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); backup_env_->CreateDirIfMissing(GetBackupMetaDir()); @@ -298,8 +310,9 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); - // create private dir - s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id))); + // create temporary private dir + s = backup_env_->CreateDir( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true))); // copy live_files for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { @@ -320,7 +333,7 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { // * if it's kDescriptorFile, limit the size to manifest_file_size s = BackupFile(new_backup_id, &new_backup, - type == kTableFile, /* shared */ + options_.share_table_files && type == kTableFile, db->GetName(), /* src_dir */ live_files[i], /* src_fname */ (type == kDescriptorFile) ? manifest_file_size : 0); @@ -342,6 +355,13 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { // we copied all the files, enable file deletions db->EnableFileDeletions(); + if (s.ok()) { + // move tmp private backup to real backup folder + s = backup_env_->RenameFile( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)), // tmp + GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); + } + if (s.ok()) { // persist the backup metadata on the disk s = new_backup.StoreToFile(options_.sync); @@ -561,6 +581,9 @@ Status BackupEngine::CopyFile(const std::string& src, Slice data; do { + if (stop_backup_.load(std::memory_order_acquire)) { + return Status::Incomplete("Backup stopped"); + } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? 
copy_file_buffer_size_ : size_limit; s = src_file->Read(buffer_to_read, &data, buf.get()); @@ -590,12 +613,16 @@ Status BackupEngine::BackupFile(BackupID backup_id, assert(src_fname.size() > 0 && src_fname[0] == '/'); std::string dst_relative = src_fname.substr(1); + std::string dst_relative_tmp; if (shared) { - dst_relative = GetSharedFileRel(dst_relative); + dst_relative_tmp = GetSharedFileRel(dst_relative, true); + dst_relative = GetSharedFileRel(dst_relative, false); } else { - dst_relative = GetPrivateFileRel(backup_id, dst_relative); + dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative); + dst_relative = GetPrivateFileRel(backup_id, false, dst_relative); } std::string dst_path = GetAbsolutePath(dst_relative); + std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp); Status s; uint64_t size; @@ -607,12 +634,15 @@ Status BackupEngine::BackupFile(BackupID backup_id, } else { Log(options_.info_log, "Copying %s", src_fname.c_str()); s = CopyFile(src_dir + src_fname, - dst_path, + dst_path_tmp, db_env_, backup_env_, options_.sync, &size, size_limit); + if (s.ok() && shared) { + s = backup_env_->RenameFile(dst_path_tmp, dst_path); + } } if (s.ok()) { backup->AddFile(dst_relative, size); @@ -671,14 +701,16 @@ void BackupEngine::GarbageCollection(bool full_scan) { &private_children); for (auto& child : private_children) { BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; sscanf(child.c_str(), "%u", &backup_id); - if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) { + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { // it's either not a number or it's still alive. continue continue; } // here we have to delete the dir and all its children std::string full_private_path = - GetAbsolutePath(GetPrivateFileRel(backup_id)); + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); std::vector subchildren; backup_env_->GetChildren(full_private_path, &subchildren); for (auto& subchild : subchildren) { @@ -813,7 +845,9 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) { BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) { - backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); + if (options.share_table_files) { + backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); + } } BackupableDB::~BackupableDB() { @@ -836,6 +870,10 @@ Status BackupableDB::DeleteBackup(BackupID backup_id) { return backup_engine_->DeleteBackup(backup_id); } +void BackupableDB::StopBackup() { + backup_engine_->StopBackup(); +} + // --- RestoreBackupableDB methods ------ RestoreBackupableDB::RestoreBackupableDB(Env* db_env, diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index c64f0170b8..de240558f9 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -305,7 +305,7 @@ class BackupableDBTest { CreateLoggerFromOptions(dbname_, backupdir_, env_, Options(), &logger_); backupable_options_.reset(new BackupableDBOptions( - backupdir_, test_backup_env_.get(), logger_.get(), true)); + backupdir_, test_backup_env_.get(), true, logger_.get(), true)); // delete old files in db DestroyDB(dbname_, Options()); @@ -317,7 +317,8 @@ class BackupableDBTest { return db; } - void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) { + void OpenBackupableDB(bool 
destroy_old_data = false, bool dummy = false, + bool share_table_files = true) { // reset all the defaults test_backup_env_->SetLimitWrittenFiles(1000000); test_db_env_->SetLimitWrittenFiles(1000000); @@ -331,6 +332,7 @@ class BackupableDBTest { ASSERT_OK(DB::Open(options_, dbname_, &db)); } backupable_options_->destroy_old_data = destroy_old_data; + backupable_options_->share_table_files = share_table_files; db_.reset(new BackupableDB(db, *backupable_options_)); } @@ -659,6 +661,38 @@ TEST(BackupableDBTest, DeleteNewerBackups) { CloseRestoreDB(); } +TEST(BackupableDBTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +TEST(BackupableDBTest, DeleteTmpFiles) { + OpenBackupableDB(); + CloseBackupableDB(); + std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp"; + std::string private_tmp_dir = backupdir_ + "/private/10.tmp"; + std::string private_tmp_file = private_tmp_dir + "/00003.sst"; + file_manager_->WriteToFile(shared_tmp, "tmp"); + file_manager_->CreateDir(private_tmp_dir); + file_manager_->WriteToFile(private_tmp_file, "tmp"); + ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); + OpenBackupableDB(); + CloseBackupableDB(); + ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir)); +} + } // anon namespace } // namespace rocksdb From afdd2d1a46bc128d69dc839e680d6a5e29e995e0 Mon Sep 17 00:00:00 2001 From: Yancey Date: Fri, 10 Jan 2014 17:56:35 +0800 Subject: [PATCH 07/30] fix compile warning --- util/arena_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/arena_test.cc b/util/arena_test.cc index 4a3d1bd433..ca6dfc99d6 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -62,8 +62,9 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { TEST(ArenaImplTest, ApproximateMemoryUsageTest) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; ArenaImpl arena(kBlockSize); - ASSERT_EQ(0, arena.ApproximateMemoryUsage()); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); auto num_blocks = kBlockSize / kEntrySize; From f8642dacdebde6ca5d779059daeaffa204ea454f Mon Sep 17 00:00:00 2001 From: ono_matope Date: Sat, 11 Jan 2014 05:12:07 +0900 Subject: [PATCH 08/30] Fix share_table_files condition in BackupEngine constructor. That makes BackupableDBTest.NoDoubleCopy test error. 
--- utilities/backupable/backupable_db.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 7f18d3142f..26bdd254b5 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -177,7 +177,7 @@ BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) // create all the dirs we need backup_env_->CreateDirIfMissing(GetAbsolutePath()); - if (!options_.share_table_files) { + if (options_.share_table_files) { backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); } backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); From a09ee1069d927b61ffd0d0e36b45b91e15275e7c Mon Sep 17 00:00:00 2001 From: Schalk-Willem Kruger Date: Fri, 10 Jan 2014 17:33:56 -0800 Subject: [PATCH 09/30] Improve RocksDB "get" performance by computing merge result in memtable Summary: Added an option (max_successive_merges) that can be used to specify the maximum number of successive merge operations on a key in the memtable. This can be used to improve performance of the "get" operation. If many successive merge operations are performed on a key, the performance of "get" operations on the key deteriorates, as the value has to be computed for each "get" operation by applying all the successive merge operations. FB Task ID: #3428853 Test Plan: make all check db_bench --benchmarks=readrandommergerandom counter_stress_test Reviewers: haobo, vamsi, dhruba, sdong Reviewed By: haobo CC: zshao Differential Revision: https://reviews.facebook.net/D14991 --- db/db_bench.cc | 97 ++++++++++++++++++++++++++++++- db/memtable.cc | 33 +++++++++++ db/memtable.h | 5 ++ db/merge_test.cc | 118 +++++++++++++++++++++++++++++++++++++- db/write_batch.cc | 58 ++++++++++++++++++- include/rocksdb/options.h | 11 ++++ util/options.cc | 5 +- 7 files changed, 320 insertions(+), 7 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index eb5d7cb421..e0ba58281a 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -94,6 +94,8 @@ DEFINE_string(benchmarks, "\tmergerandom -- same as updaterandom/appendrandom using merge" " operator. " "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" "\tseekrandom -- N random seeks\n" "\tcrc32c -- repeated crc32c of 4K of data\n" "\tacquireload -- load N*1000 times\n" @@ -112,6 +114,11 @@ DEFINE_int64(numdistinct, 1000, "read/write on fewer keys so that gets are more likely to find the" " key and puts are more likely to update the same key"); +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + DEFINE_int64(reads, -1, "Number of read operations to do. " "If negative, do FLAGS_num reads."); @@ -297,6 +304,11 @@ DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" "default value 90 means 90% operations out of all reads and writes" " operations are reads. In other words, 9 gets for every 1 put."); +DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" + " as percentage) for the ReadRandomMergeRandom workload. The" + " default value 70 means 70% out of all read and merge operations" + " are merges. In other words, 7 merges for every 3 gets."); + DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletes (used in RandomWithVerify only). 
RandomWithVerify " "calculates writepercent as (100 - FLAGS_readwritepercent - " @@ -446,6 +458,9 @@ DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync, DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop" " the delete if key not present"); +DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" + " operations on a key in the memtable"); + static bool ValidatePrefixSize(const char* flagname, int32_t value) { if (value < 0 || value>=2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", @@ -784,6 +799,7 @@ class Benchmark { long long reads_; long long writes_; long long readwrites_; + long long merge_keys_; int heap_counter_; char keyFormat_[100]; // will contain the format of key. e.g "%016d" void PrintHeader() { @@ -958,6 +974,7 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), + merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), heap_counter_(0) { std::vector files; FLAGS_env->GetChildren(FLAGS_db, &files); @@ -985,8 +1002,8 @@ class Benchmark { } unique_ptr GenerateKeyFromInt(long long v, const char* suffix = "") { - unique_ptr keyInStr(new char[kMaxKeySize]); - snprintf(keyInStr.get(), kMaxKeySize, keyFormat_, v, suffix); + unique_ptr keyInStr(new char[kMaxKeySize + 1]); + snprintf(keyInStr.get(), kMaxKeySize + 1, keyFormat_, v, suffix); return keyInStr; } @@ -1087,6 +1104,14 @@ class Benchmark { method = &Benchmark::ReadWhileWriting; } else if (name == Slice("readrandomwriterandom")) { method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::ReadRandomMergeRandom; + } } else if (name == Slice("updaterandom")) { method = &Benchmark::UpdateRandom; } else if (name == Slice("appendrandom")) { @@ -1421,6 +1446,7 @@ class Benchmark { FLAGS_merge_operator.c_str()); exit(1); } + options.max_successive_merges = FLAGS_max_successive_merges; // set universal style compaction configurations, if applicable if (FLAGS_universal_size_ratio != 0) { @@ -2375,13 +2401,16 @@ class Benchmark { // // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. void MergeRandom(ThreadState* thread) { RandomGenerator gen; // The number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); while (!duration.Done(1)) { - const long long k = thread->rand.Next() % FLAGS_num; + const long long k = thread->rand.Next() % merge_keys_; unique_ptr key = GenerateKeyFromInt(k); Status s = db_->Merge(write_options_, key.get(), @@ -2400,6 +2429,68 @@ class Benchmark { thread->stats.AddMessage(msg); } + // Read and merge random keys. The amount of reads and merges are controlled + // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct + // keys (and thus also the number of reads and merges on the same key) can be + // adjusted with FLAGS_merge_keys. + // + // As with MergeRandom, the merge operator to use should be defined by + // FLAGS_merge_operator. 
+ void ReadRandomMergeRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long num_hits = 0; + long long num_gets = 0; + long long num_merges = 0; + size_t max_length = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % merge_keys_; + unique_ptr key = GenerateKeyFromInt(k); + + bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; + + if (do_merge) { + Status s = db_->Merge(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + + num_merges++; + + } else { + Status s = db_->Get(options, key.get(), &value); + if (value.length() > max_length) + max_length = value.length(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + num_hits++; + } + + num_gets++; + + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "(reads:%lld merges:%lld total:%lld hits:%lld maxlength:%zu)", + num_gets, num_merges, readwrites_, num_hits, max_length); + thread->stats.AddMessage(msg); + } + + void Compact(ThreadState* thread) { db_->CompactRange(nullptr, nullptr); } diff --git a/db/memtable.cc b/db/memtable.cc index 675a314ff5..7881ce5bdb 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -326,4 +326,37 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, // Key doesn't exist return false; } + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::shared_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Compare( + Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + if (static_cast(tag & 0xff) != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + } // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h index 79d5ba2d0e..12ccf3d379 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -107,6 +107,11 @@ class MemTable { const Slice& key, const Slice& value); + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. 
+ size_t CountSuccessiveMergeEntries(const LookupKey& key); + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } diff --git a/db/merge_test.cc b/db/merge_test.cc index 0c14aff2ce..38acd8b29c 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -14,6 +14,7 @@ #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" +#include "db/write_batch_internal.h" #include "utilities/merge_operators.h" #include "util/testharness.h" #include "utilities/utility_db.h" @@ -21,13 +22,52 @@ using namespace std; using namespace rocksdb; +namespace { + int numMergeOperatorCalls; -std::shared_ptr OpenDb(const string& dbname, const bool ttl = false) { + void resetNumMergeOperatorCalls() { + numMergeOperatorCalls = 0; + } +} + +class CountMergeOperator : public AssociativeMergeOperator { + public: + CountMergeOperator() { + mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); + } + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + ++numMergeOperatorCalls; + return mergeOperator_->PartialMerge( + key, + *existing_value, + value, + new_value, + logger); + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + std::shared_ptr mergeOperator_; +}; + +std::shared_ptr OpenDb( + const string& dbname, + const bool ttl = false, + const unsigned max_successive_merges = 0) { DB* db; StackableDB* sdb; Options options; options.create_if_missing = true; - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.merge_operator = std::make_shared(); + options.max_successive_merges = max_successive_merges; Status s; DestroyDB(dbname, Options()); if (ttl) { @@ -243,6 +283,67 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, sizeof(merge_value)); + + // Create the batch + WriteBatch batch; + for (int i = 0; i < num_merges; ++i) { + batch.Merge(key, merge_value_slice); + } + + // Apply to memtable and count the number of merges + resetNumMergeOperatorCalls(); + { + Status s = db->Write(WriteOptions(), &batch); + assert(s.ok()); + } + assert(numMergeOperatorCalls == + num_merges - (num_merges % (max_num_merges + 1))); + + // Get the value + resetNumMergeOperatorCalls(); + string get_value_str; + { + Status s = db->Get(ReadOptions(), key, &get_value_str); + assert(s.ok()); + } + assert(get_value_str.size() == sizeof(uint64_t)); + uint64_t get_value = DecodeFixed64(&get_value_str[0]); + assert(get_value == num_merges * merge_value); + assert(numMergeOperatorCalls == (num_merges % (max_num_merges + 1))); +} + void runTest(int argc, const string& dbname, const bool 
use_ttl = false) { auto db = OpenDb(dbname, use_ttl); @@ -265,6 +366,19 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } DestroyDB(dbname, Options()); + db.reset(); + + { + cout << "Test merge in memtable... \n"; + unsigned maxMerge = 5; + auto db = OpenDb(dbname, use_ttl, maxMerge); + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + testSuccessiveMerge(counters, maxMerge, maxMerge * 2); + testSingleBatchSuccessiveMerge(db.get(), 5, 7); + DestroyDB(dbname, Options()); + } + } int main(int argc, char *argv[]) { diff --git a/db/write_batch.cc b/db/write_batch.cc index c04930bbf1..2cfc8bd7db 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -21,6 +21,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/memtable.h" @@ -203,7 +204,62 @@ class MemTableInserter : public WriteBatch::Handler { sequence_++; } virtual void Merge(const Slice& key, const Slice& value) { - mem_->Add(sequence_, kTypeMerge, key, value); + bool perform_merge = false; + + if (options_->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options_->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + db_->Get(read_options, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options_->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, + &get_value_slice, + operands, + &new_value, + options_->info_log.get())) { + // Failed to merge! + RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem_->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem_->Add(sequence_, kTypeMerge, key, value); + } + sequence_++; } virtual void Delete(const Slice& key) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index b7eaff37dc..b84bdcf389 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -643,6 +643,17 @@ struct Options { // Number of locks used for inplace update // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. 
+ // + // Default: 0 (disabled) + size_t max_successive_merges; }; // diff --git a/util/options.cc b/util/options.cc index 198d55384c..64cabc8ca1 100644 --- a/util/options.cc +++ b/util/options.cc @@ -101,7 +101,8 @@ Options::Options() table_factory( std::shared_ptr<TableFactory>(new BlockBasedTableFactory())), inplace_update_support(false), - inplace_update_num_locks(10000) { + inplace_update_num_locks(10000), + max_successive_merges(0) { assert(memtable_factory.get() != nullptr); } @@ -292,6 +293,8 @@ Options::Dump(Logger* log) const inplace_update_support); Log(log, " Options.inplace_update_num_locks: %zd", inplace_update_num_locks); + Log(log, " Options.max_successive_merges: %zd", + max_successive_merges); } // Options::Dump // From dd6ecdf3425209bc2971ae901bafa1ce26bc3037 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sat, 11 Jan 2014 09:25:42 -0800 Subject: [PATCH 10/30] Use ASSERT_EQ() instead of assert() in merge_test --- db/merge_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/merge_test.cc b/db/merge_test.cc index 38acd8b29c..887d8ad425 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -340,8 +340,8 @@ void testSingleBatchSuccessiveMerge( } assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); - assert(get_value == num_merges * merge_value); - assert(numMergeOperatorCalls == (num_merges % (max_num_merges + 1))); + ASSERT_EQ(get_value, num_merges * merge_value); + ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); } void runTest(int argc, const string& dbname, const bool use_ttl = false) { From c4548d5f1f46645d996c7640d591397769ccbaf4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 13 Jan 2014 15:01:34 -0800 Subject: [PATCH 11/30] WriteBatch to provide a way for user to query data size directly and only return constant reference of data in Data() Summary: WriteBatch::Data() is currently easy for users to misuse. Also, there is no cheap way for a user of WriteBatch to know the accumulated data size. This patch fixes the problem by: (1) returning a constant reference from Data() so it is obvious to the caller what it means, and (2) adding a function that returns the data size directly. Test Plan: make all check Reviewers: haobo, igor, kailiu Reviewed By: kailiu CC: zshao, leveldb Differential Revision: https://reviews.facebook.net/D15123 --- include/rocksdb/write_batch.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 798807045f..30abead502 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() { return rep_; } + std::string Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; From ac2fe728327be75c8c289d4e3ebf8587d88c518d Mon Sep 17 00:00:00 2001 From: kailiu Date: Mon, 13 Jan 2014 22:09:41 -0800 Subject: [PATCH 12/30] Compile dynamic library by default Summary: Per request, some users need to use the dynamic rocksdb library instead of the static one. However, the dynamic libraries currently have to be compiled manually, which is inconvenient. I made the dynamic libraries compile by default.
Test Plan: make clean; make; make clean; Reviewers: haobo, sdong, dhruba, igor Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15117 --- Makefile | 6 +++--- build_tools/build_detect_platform | 4 ++-- build_tools/fbcode.gcc481.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index ff8347957f..5170ac54a9 100644 --- a/Makefile +++ b/Makefile @@ -127,12 +127,12 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ endif # PLATFORM_SHARED_EXT -all: $(LIBRARY) $(PROGRAMS) +all: $(LIBRARY) $(PROGRAMS) $(SHARED) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871dc..8e83ae497e 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da5..e8c9f090b9 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" From 51dd21926c677ae4a63c8f45992903e7b30f0d13 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 14 Jan 2014 10:42:36 -0800 Subject: [PATCH 13/30] DB::Put() to estimate write batch data size needed and pre-allocate buffer Summary: In one of CPU profiles, we see some CPU costs of string::reserve() inside Batch.Put(). This patch should be able to reduce some of the costs by allocating sufficient buffer before hand. Since it is a trivial percentage of CPU costs, I didn't find a way to show the improvement in one of the benchmarks. I'll deploy it to same application and do the same CPU profiling to make sure those CPU costs are reduced. 
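To make the idea concrete, here is a minimal sketch of the pre-allocation pattern this patch applies inside DB::Put(); the helper name is hypothetical, and the 24-byte slack simply mirrors the header/count/type/length overhead noted in the diff below.

  #include "rocksdb/db.h"
  #include "rocksdb/write_batch.h"

  // Sketch only: PutPreallocated is not part of the RocksDB API.
  rocksdb::Status PutPreallocated(rocksdb::DB* db,
                                  const rocksdb::WriteOptions& opt,
                                  const rocksdb::Slice& key,
                                  const rocksdb::Slice& value) {
    // Reserve the batch buffer up front so Put() does not have to grow the
    // underlying string: 12 bytes of batch header plus per-record overhead.
    rocksdb::WriteBatch batch(key.size() + value.size() + 24);
    batch.Put(key, value);
    return db->Write(opt, &batch);
  }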
Test Plan: make all check Reviewers: haobo, kailiu, igor Reviewed By: haobo CC: leveldb, nkg- Differential Revision: https://reviews.facebook.net/D15135 --- db/db_impl.cc | 5 ++++- db/write_batch.cc | 3 ++- include/rocksdb/write_batch.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index e7f2abf99f..12e07868f3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3758,7 +3758,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) { // Default implementations of convenience methods that subclasses of DB // can call if they wish Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. + WriteBatch batch(key.size() + value.size() + 24); batch.Put(key, value); return Write(opt, &batch); } diff --git a/db/write_batch.cc b/db/write_batch.cc index 2cfc8bd7db..7a6106afab 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -36,7 +36,8 @@ namespace rocksdb { // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. static const size_t kHeader = 12; -WriteBatch::WriteBatch() { +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); Clear(); } diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 30abead502..e7ce160056 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -35,7 +35,7 @@ struct SliceParts; class WriteBatch { public: - WriteBatch(); + explicit WriteBatch(size_t reserved_bytes = 0); ~WriteBatch(); // Store the mapping "key->value" in the database. From fbbf0d1456f8d872d100f7cbfceb9f9b89249664 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 14 Jan 2014 11:04:27 -0800 Subject: [PATCH 14/30] Pre-calculate whether to slow down for too many level 0 files Summary: Currently in DBImpl::MakeRoomForWrite(), we do "versions_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger" to check whether the writer thread needs to slow down. However, versions_->NumLevelFiles(0) is slightly more expensive than we expected. By caching the result of the comparison when installing a new version, we can avoid this function call every time. Test Plan: make all check Manually trigger this behavior by applying universal compaction style and make sure inserts are made slow after there are certain number of files. Reviewers: haobo, kailiu, igor Reviewed By: kailiu CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D15141 --- db/db_impl.cc | 3 +-- db/version_set.cc | 4 ++++ db/version_set.h | 8 ++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 12e07868f3..ed5853336d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3166,8 +3166,7 @@ Status DBImpl::MakeRoomForWrite(bool force, break; } else if ( allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. 
Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each diff --git a/db/version_set.cc b/db/version_set.cc index 46cdfaa61c..7a1f5cbf81 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1148,6 +1148,7 @@ VersionSet::VersionSet(const std::string& dbname, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), + need_slowdown_for_num_level0_files(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), last_observed_manifest_size_(0), @@ -1199,6 +1200,9 @@ void VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; + need_slowdown_for_num_level0_files = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list diff --git a/db/version_set.h b/db/version_set.h index 75b529942f..85ff2ff369 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -250,6 +250,12 @@ class VersionSet { // Return the current version. Version* current() const { return current_; } + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -489,6 +495,8 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_ + bool need_slowdown_for_num_level0_files; + // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. std::string* compact_pointer_; From 1d9bac4d7f2e66f056f0ac21753f5c0e7379e1bf Mon Sep 17 00:00:00 2001 From: Naman Gupta Date: Fri, 15 Nov 2013 17:17:13 -0800 Subject: [PATCH 15/30] Use sanitized options while opening db Summary: We use SanitizeOptions() to set appropriate values for some options, based on other options. So we should use the sanitized options by default. Luckily it hasn't caused a bug yet, but it could result in one in the future.
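As a rough illustration of why the sanitized copy matters, here is a hypothetical sketch (not the actual SanitizeOptions() implementation; the wal_dir defaulting and the clamp shown are assumptions): sanitization can fill in or adjust fields based on other settings, so code inside DB::Open() should consult impl->options_ rather than the caller's `options`.

  // Hypothetical sketch of what a sanitizer may do; the specific defaults are assumptions.
  rocksdb::Options SanitizeForIllustration(const std::string& dbname,
                                           rocksdb::Options result) {
    if (result.wal_dir.empty()) {
      result.wal_dir = dbname;  // assumed: keep the WAL next to the data files
    }
    if (result.max_write_buffer_number < 2) {
      result.max_write_buffer_number = 2;  // assumed clamp
    }
    return result;  // the DB keeps and uses this copy, not the caller's struct
  }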
Test Plan: make check Reviewers: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14103 --- db/db_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index ed5853336d..b50eb4c446 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3808,13 +3808,13 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr<WritableFile> lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3830,7 +3830,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } impl->mutex_.Unlock(); - if (options.compaction_style == kCompactionStyleUniversal) { + if (impl->options_.compaction_style == kCompactionStyleUniversal) { int num_files; for (int i = 1; i < impl->NumberLevels(); i++) { num_files = impl->versions_->NumLevelFiles(i); From d702d8073e2572a19c806fa53e484a25863f6df4 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Tue, 14 Jan 2014 00:39:42 -0800 Subject: [PATCH 16/30] A script that automatically reformat affected lines Summary: Added a script that reformats only the affected lines in a given diff. I planned to make that file a pre-commit hook, but that looks a little more difficult than I thought. Since I don't want to spend too much time on this task right now, I eventually added a "make" command to achieve this with a few additional keystrokes. Also, the clang-format configuration is now solely inherited from Google's style -- there are still debates on some of the style issues, but we can address them later once we reach a consensus. Test Plan: Did some ugly format changes and ran "make format"; all affected lines were formatted as expected.
Reviewers: igor, sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15147 --- .clang-format | 42 ------------------- Makefile | 11 ++++- build_tools/format-diff.sh | 83 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 43 deletions(-) create mode 100755 build_tools/format-diff.sh diff --git a/.clang-format b/.clang-format index a1e9a48e40..7c279811ac 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... diff --git a/Makefile b/Makefile index 5170ac54a9..ebf7b96fe3 100644 --- a/Makefile +++ b/Makefile @@ -135,7 +135,7 @@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) $(SHARED) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format release: $(MAKE) clean @@ -196,6 +196,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -411,6 +414,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 0000000000..758135c9f8 --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" 
+ echo "You can download it by running: " + echo " curl https://fburl.com/clang-format-diff" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +# Check the format of recently changed lines, +diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detect lines that doesn't follow the format rules:\r" +# Add the color to the diff. lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 From 481c77e526e59accf98ef9a5527ab7fb0e40104b Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 14 Jan 2014 13:54:33 -0800 Subject: [PATCH 17/30] Move the compilation of the shared libraries to "make release" Compiling the shared libraries took a long time. Thus to speed up the development speed, it still makes sense to be separated from regular compilation. --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ebf7b96fe3..572e42e9ee 100644 --- a/Makefile +++ b/Makefile @@ -132,14 +132,16 @@ $(SHARED3): $(LIBOBJECTS) endif # PLATFORM_SHARED_EXT -all: $(LIBRARY) $(PROGRAMS) $(SHARED) +all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. 
release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) -j32 + OPT=-DNDEBUG $(MAKE) all -j32 + OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean From 7d9f21cf23d5951fe7654972ca99e0a17cffc177 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 14:49:31 -0800 Subject: [PATCH 18/30] BuildBatchGroup -- memcpy outside of lock Summary: When building batch group, don't actually build a new batch since it requires heavy-weight mem copy and malloc. Only store references to the batches and build the batch group without lock held. Test Plan: `make check` I am also planning to run performance tests. The workload that will benefit from this change is readwhilewriting. I will post the results once I have them. Reviewers: dhruba, haobo, kailiu Reviewed By: haobo CC: leveldb, xjin Differential Revision: https://reviews.facebook.net/D15063 --- db/db_impl.cc | 38 +++++++++++++++---------------- db/db_impl.h | 4 +++- db/db_test.cc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 20 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index b50eb4c446..37e8d75828 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -56,6 +56,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -2969,12 +2970,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. - WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -2982,6 +2979,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into mem_. 
{ mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3027,12 +3034,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3060,13 +3067,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3099,18 +3107,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write diff --git a/db/db_impl.h b/db/db_impl.h index d33efd19ea..d74b77aa4d 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -22,6 +22,7 @@ #include "port/port.h" #include "util/stats_logger.h" #include "memtablelist.h" +#include "util/autovector.h" namespace rocksdb { @@ -291,7 +292,8 @@ class DBImpl : public DB { // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? */, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); // Force current memtable contents to be flushed. 
Status FlushMemTable(const FlushOptions& options); diff --git a/db/db_test.cc b/db/db_test.cc index a0b3d9aaa4..560311ae3d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4333,6 +4333,69 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + + } while (ChangeOptions()); +} + namespace { typedef std::map KVMap; } From 055e6df45b24204feb34461754a482ef7ffc14b6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:27:09 -0800 Subject: [PATCH 19/30] VersionEdit not to take NumLevels() Summary: I will submit a sequence of diffs that are preparing master branch for column families. There are a lot of implicit assumptions in the code that are making column family implementation hard. If I make the change only in column family branch, it will make merging back to master impossible. Most of the diffs will be simple code refactorings, so I hope we can have fast turnaround time. Feel free to grab me in person to discuss any of them. This diff removes number of level check from VersionEdit. It is used only when VersionEdit is read, not written, but has to be set when it is written. I believe it is a right thing to make VersionEdit dumb and check consistency on the caller side. This will also make it much easier to implement Column Families, since different column families can have different number of levels. 
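Condensed sketch of the new contract (it simply restates the VersionSet::Recover() hunk further down): VersionEdit is now default-constructed, records the largest level it decoded, and the caller performs the consistency check against its own level count.

  // Inside VersionSet::Recover(); VersionSet can read edit.max_level_ directly.
  VersionEdit edit;  // no longer takes the number of levels
  Status s = edit.DecodeFrom(record);
  if (s.ok() && edit.max_level_ >= NumberLevels()) {
    s = Status::InvalidArgument("db has more levels than options.num_levels");
  }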
Test Plan: make check Reviewers: dhruba, haobo, sdong, kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15159 --- db/db_impl.cc | 30 ++++++++---------- db/db_impl_readonly.cc | 2 +- db/db_test.cc | 9 +++--- db/memtable.cc | 16 ++++------ db/memtable.h | 7 ++--- db/repair.cc | 5 ++- db/version_edit.cc | 10 +++--- db/version_edit.h | 11 +++---- db/version_edit_test.cc | 4 +-- db/version_set.cc | 47 ++++++++++++++++------------- db/version_set_reduce_num_levels.cc | 4 +-- 11 files changed, 66 insertions(+), 79 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 37e8d75828..4781ad85d5 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -252,8 +252,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -261,8 +261,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -408,7 +407,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -864,7 +863,7 @@ void DBImpl::PurgeObsoleteWALFiles() { // If externalTable is set, then apply recovered transactions // to that table. This is used for readonly mode. Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, - bool error_if_log_file_exist) { + bool error_if_log_file_exist) { mutex_.AssertHeld(); assert(db_lock_ == nullptr); @@ -1031,8 +1030,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1358,7 +1356,7 @@ void DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -3289,17 +3287,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - memtmp = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + memtmp = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(options_.max_write_buffer_number); } } @@ -3680,7 +3674,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(0, true); { MutexLock l(&mutex_); @@ -3802,7 +3796,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { return s; } impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dbb297e93a..04033b2fa3 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -86,7 +86,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; Status s = impl->Recover(&edit, impl->GetMemTable(), error_if_log_file_exist); impl->mutex_.Unlock(); diff --git a/db/db_test.cc b/db/db_test.cc index 560311ae3d..2ff47320aa 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -765,10 +765,9 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); Status s = TryReopen(&options); - ASSERT_EQ(s.IsCorruption(), true); + ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), - "Corruption: VersionEdit: db already has " - "more levels than options.num_levels"); + "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); @@ -4936,7 +4935,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4948,7 +4947,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index 7881ce5bdb..baff4fb340 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -33,24 +33,20 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? 
options.inplace_update_num_locks - : 0) { } + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0) {} MemTable::~MemTable() { assert(refs_ == 0); @@ -58,7 +54,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) diff --git a/db/memtable.h b/db/memtable.h index 12ccf3d379..24a2c852bd 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,11 +34,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. - explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); diff --git a/db/repair.cc b/db/repair.cc index 6db90c8653..29524233f0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -58,7 +58,7 @@ class Repairer { next_file_number_(1) { // TableCache can be small since we expect each table to be opened once. table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); - edit_ = new VersionEdit(options.num_levels); + edit_ = new VersionEdit(); } ~Repairer() { @@ -196,8 +196,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), - options_.num_levels); + MemTable* mem = new MemTable(icmp_, options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { diff --git a/db/version_edit.cc b/db/version_edit.cc index 9f23faba7f..42c07e7b07 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -33,6 +33,7 @@ enum Tag { void VersionEdit::Clear() { comparator_.clear(); + max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; @@ -107,14 +108,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { uint32_t v; - if (GetVarint32(input, &v) && - (int)v < number_levels_) { + if (GetVarint32(input, &v)) { *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } return true; } else { - if ((int)v >= number_levels_) { - *msg = "db already has more levels than options.num_levels"; - } return false; } } diff --git a/db/version_edit.h b/db/version_edit.h index 196914e2bb..a0546c9831 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -34,10 +34,7 @@ struct FileMetaData { class VersionEdit { public: - explicit VersionEdit(int number_levels) : - number_levels_(number_levels) { - Clear(); - } + VersionEdit() { Clear(); } ~VersionEdit() { } void Clear(); @@ -108,7 +105,7 @@ class VersionEdit { bool GetLevel(Slice* input, int* level, const char** msg); - int number_levels_; + int max_level_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -120,9 +117,9 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector< std::pair > compact_pointers_; + std::vector > compact_pointers_; DeletedFileSet deleted_files_; - std::vector< std::pair > new_files_; + std::vector > new_files_; }; } // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 4a00822f79..745ea90d09 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb 
{ static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(7); + VersionEdit parsed(); Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(7); + VersionEdit edit(); for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, diff --git a/db/version_set.cc b/db/version_set.cc index 7a1f5cbf81..91b3dcd3f0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -980,14 +980,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < edit->number_levels_; l++) { + for (int l = 0; !found && l < vset_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1000,7 +998,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < edit->number_levels_; l++) { + for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1213,7 +1211,7 @@ void VersionSet::AppendVersion(Version* v) { } Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log) { + bool new_descriptor_log) { mu->AssertHeld(); // queue our request @@ -1383,7 +1381,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, } void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1455,21 +1453,28 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } + if (!s.ok()) { + break; } - if (s.ok()) { - builder.Apply(&edit); + if (edit.max_level_ >= NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + break; + } + + builder.Apply(&edit); + if (edit.has_log_number_) { log_number = edit.log_number_; have_log_number = true; @@ -1577,7 +1582,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if 
(edit.has_comparator_ && @@ -1832,7 +1837,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? // Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); // Save compaction pointers @@ -2994,7 +2999,7 @@ Compaction::Compaction(int level, int out_level, uint64_t target_file_size, bottommost_level_(false), is_full_compaction_(false), level_ptrs_(std::vector(number_levels)) { - edit_ = new VersionEdit(number_levels_); + edit_ = new VersionEdit(); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; } diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index d13a4aed91..07062399b3 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -72,8 +72,8 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { num_levels_ = new_levels; compact_pointer_ = new std::string[new_levels]; Init(new_levels); - VersionEdit ve(new_levels); - st = LogAndApply(&ve , mu, true); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); return st; } From 7f3e417f59e9f398abca74ceef370b9861cb7523 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:32:37 -0800 Subject: [PATCH 20/30] Fix memtable construction in tests --- db/version_edit_test.cc | 4 ++-- db/write_batch_test.cc | 5 +++-- table/table_test.cc | 13 +++++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 745ea90d09..63aa32e8f6 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(); + VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(); + VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index ff9aa63eec..931d8f3f59 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/table/table_test.cc b/table/table_test.cc index 1f79fcdf9a..d404e0b2a2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -370,7 +370,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { @@ -378,7 +380,9 @@ class MemTableConstructor: public Constructor { } virtual Status FinishImpl(const Options& 
options, const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1268,10 +1272,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); From 62910202844a6abe5fb5eb23fa0195a3d34d3cf9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:41:30 -0800 Subject: [PATCH 21/30] Fix test --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index 2ff47320aa..6e7a2edc20 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4391,6 +4391,7 @@ TEST(DBTest, GroupCommitTest) { itr->Next(); } ASSERT_TRUE(!itr->Valid()); + delete itr; } while (ChangeOptions()); } From 1ed2404f27b351def723ecd59dd646007e500b3f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 15:54:11 -0800 Subject: [PATCH 22/30] Wrong number of levels is Invalid argument now, not corruption --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 6e7a2edc20..91970381fe 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3397,7 +3397,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } From d9cd7a063f919d4a57334932e57b31571ce87ddc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 14 Jan 2014 16:19:09 -0800 Subject: [PATCH 23/30] Fix CompactRange to apply filter to every key Summary: When doing CompactRange(), we should first flush the memtable and then calculate max_level_with_files. Also, we want to compact all the levels that have files, including level `max_level_with_files`. This patch fixed the unit test. Test Plan: Added a failing unit test and a fix, so it's not failing anymore. 
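A short usage sketch of the behaviour this fix guarantees (assuming db is an open rocksdb::DB*; the key range is illustrative): a manual CompactRange() now flushes the memtable first and also compacts the bottom-most level that has files, so a configured compaction filter sees every live key.

  rocksdb::Slice begin("a");
  rocksdb::Slice end("z");
  db->CompactRange(&begin, &end);       // compact only keys in ["a", "z"]
  db->CompactRange(nullptr, nullptr);   // or compact the whole key space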
Reviewers: dhruba, haobo, sdong Reviewed By: haobo CC: leveldb, xjin Differential Revision: https://reviews.facebook.net/D14421 --- db/db_impl.cc | 85 +++++++++++++++++++++++----------- db/db_impl.h | 12 ++++- db/db_test.cc | 56 +++++++++++++--------- db/version_set.cc | 51 +++++++++++++------- db/version_set.h | 16 +++++-- include/rocksdb/db.h | 1 + util/manual_compaction_test.cc | 75 +++++++++++++++++++++++++++--- 7 files changed, 216 insertions(+), 80 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 4781ad85d5..908ede5b4a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1278,8 +1278,11 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, return s; } -void DBImpl::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { +void DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); @@ -1290,9 +1293,15 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end, } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1591,13 +1600,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1625,11 +1638,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. // // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more that one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. @@ -1659,6 +1672,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? 
level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1878,23 +1900,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? "(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? "(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1959,13 +1985,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. - m->tmp_storage = manual_end; + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -1997,14 +2029,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. 
void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2148,14 +2180,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? - level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2197,7 +2226,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; diff --git a/db/db_impl.h b/db/db_impl.h index d74b77aa4d..476b2bf549 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -89,10 +89,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -406,7 +413,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? 
const InternalKey* begin; // nullptr means beginning of key range diff --git a/db/db_test.cc b/db/db_test.cc index 91970381fe..9c8a97f936 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3309,34 +3309,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { diff --git a/db/version_set.cc b/db/version_set.cc index 91b3dcd3f0..a411ea2108 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2715,6 +2715,7 @@ Compaction* VersionSet::PickCompaction() { bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level, int* parent_index) { std::vector inputs; + assert(level + 1 < NumberLevels()); current_->GetOverlappingInputs(level+1, smallest, largest, &inputs, *parent_index, parent_index); @@ -2776,7 +2777,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. int parent_index = -1; if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { + (c->level() != c->output_level() && + ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); delete c; @@ -2790,7 +2792,9 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // user-key with another file. void VersionSet::SetupOtherInputs(Compaction* c) { // If inputs are empty, then there is nothing to expand. 
- if (c->inputs_[0].empty()) { + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { return; } @@ -2918,11 +2922,13 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { obsolete_files_.clear(); } -Compaction* VersionSet::CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end) { +Compaction* VersionSet::CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { std::vector inputs; + bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. @@ -2930,7 +2936,7 @@ Compaction* VersionSet::CompactRange( begin = nullptr; end = nullptr; } - current_->GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -2939,24 +2945,26 @@ Compaction* VersionSet::CompactRange( // But we cannot do this for level-0 since level-0 files can overlap // and we must not pick one file and drop another older file if the // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->file_size; total += s; if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; inputs.resize(i + 1); break; } } } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), NumberLevels()); + Compaction* c = new Compaction(input_level, + output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level), + NumberLevels()); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2969,6 +2977,10 @@ Compaction* VersionSet::CompactRange( c->input_version_->Ref(); SetupOtherInputs(c); + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + // These files that are to be manaully compacted do not trample // upon other files because manual compactions are processed when // the system has a max of 1 background compaction thread. @@ -3016,7 +3028,10 @@ bool Compaction::IsTrivialMove() const { // Avoid a move if there is lots of overlapping grandparent data. // Otherwise, the move could create a parent file that will require // a very expensive merge later on. - return (num_input_files(0) == 1 && + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. 
+ return (level_ != out_level_ && + num_input_files(0) == 1 && num_input_files(1) == 0 && TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); } @@ -3109,7 +3124,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) { } bottommost_level_ = true; int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { + for (int i = output_level() + 1; i < num_levels; i++) { if (input_version_->vset_->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; diff --git a/db/version_set.h b/db/version_set.h index 85ff2ff369..2c91532b5c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -310,10 +310,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index dd17d9e9b5..4bf095756c 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -199,6 +199,7 @@ class DB { uint64_t* sizes) = 0; // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc index ebe1339e53..dd615f0570 100644 --- a/util/manual_compaction_test.cc +++ b/util/manual_compaction_test.cc @@ -9,9 +9,13 @@ #include #include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/testharness.h" +using namespace rocksdb; + namespace { const int kNumKeys = 1100000; @@ -26,12 +30,71 @@ std::string Key2(int i) { return Key1(i) + "_xxx"; } -class ManualCompactionTest { }; +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run. 
+ dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} TEST(ManualCompactionTest, Test) { - // Get rid of any state from an old run. - std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; - DestroyDB(dbpath, rocksdb::Options()); // Open database. Disable compression since it affects the creation // of layers and the code below is trying to test against a very @@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) { rocksdb::Options db_options; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; - ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); // create first key range rocksdb::WriteBatch batch; @@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) { // close database delete db; - DestroyDB(dbpath, rocksdb::Options()); + DestroyDB(dbname_, rocksdb::Options()); } } // anonymous namespace From 9b51af5a17f3cfd754575894e090dd867fb47740 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 12 Dec 2013 10:54:03 -0800 Subject: [PATCH 24/30] [RocksDB Performance Branch] DBImpl.NewInternalIterator() to reduce works inside mutex Summary: To reduce mutex contention caused by DBImpl.NewInternalIterator(), in this function, move all the iteration creation works out of mutex, only leaving object ref and get. Test Plan: make all check will run db_stress for a while too to make sure no problem. 
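In outline, the locking pattern this change moves to is: hold the DB mutex only long enough to Ref() the mutable memtable, the immutable memtables and the current Version, then build the child iterators and the merging iterator after unlocking. A simplified sketch of the new function body (same member names as the hunk below; the IterState cleanup/RegisterCleanup bookkeeping is omitted here):

    Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
                                          SequenceNumber* latest_snapshot) {
      MemTable* mutable_mem;
      std::vector<MemTable*> immutables;
      Version* version;

      mutex_.Lock();                      // short critical section: refs only
      *latest_snapshot = versions_->LastSequence();
      mem_->Ref();
      mutable_mem = mem_;
      imm_.GetMemTables(&immutables);
      for (MemTable* m : immutables) {
        m->Ref();
      }
      versions_->current()->Ref();
      version = versions_->current();
      mutex_.Unlock();

      // Iterator construction (memtables plus L0..Ln files) now runs unlocked.
      std::vector<Iterator*> list;
      list.push_back(mutable_mem->NewIterator(options));
      for (MemTable* m : immutables) {
        list.push_back(m->NewIterator(options));
      }
      version->AddIterators(options, storage_options_, &list);
      return NewMergingIterator(&internal_comparator_, &list[0], list.size());
    }
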
Reviewers: haobo, dhruba, kailiu Reviewed By: haobo CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14589 Conflicts: db/db_impl.cc --- db/db_impl.cc | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 908ede5b4a..07ac5c9d05 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2657,38 +2657,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) { Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); + MemTable* mutable_mem; + std::vector immutables; + Version* version; // Collect together all needed child iterators for mem - std::vector list; + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); mem_->Ref(); - list.push_back(mem_->NewIterator(options)); - - cleanup->mem.push_back(mem_); - + mutable_mem = mem_; // Collect together all needed child iterators for imm_ - std::vector immutables; imm_.GetMemTables(&immutables); for (unsigned int i = 0; i < immutables.size(); i++) { - MemTable* m = immutables[i]; - m->Ref(); + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { list.push_back(m->NewIterator(options)); cleanup->mem.push_back(m); } - - // Collect iterators for files in L0 - Ln - versions_->current()->AddIterators(options, storage_options_, &list); + version->AddIterators(options, storage_options_, &list); Iterator* internal_iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; - cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - mutex_.Unlock(); return internal_iter; } From c8f16221ed9e0f23b8f11f046e7dddaf6472d2ea Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 14 Jan 2014 18:03:56 -0800 Subject: [PATCH 25/30] Fix the return type of WriteBatch::Data(). Summary: Quick fix for https://reviews.facebook.net/D15123 Test Plan: Make check Reviewers: sdong, vkrest CC: leveldb Differential Revision: https://reviews.facebook.net/D15165 --- include/rocksdb/write_batch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index e7ce160056..2cfb731f63 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,7 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() const { return rep_; } + const std::string& Data() const { return rep_; } // Retrieve data size of the batch. size_t GetDataSize() const { return rep_.size(); } From 65a8a52b546cf5eec3c2895d220fd343353585d2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:15:43 -0800 Subject: [PATCH 26/30] Decrease reliance on VersionSet::NumberLevels() Summary: With column families VersionSet will not have a constant number of levels (each CF can have different options), so we'll need to eliminate call to VersionSet::NumberLevels() This diff decreases number of callsites, but we're not there yet. 
It associates number of levels with Version (each version is associated with single CF) instead of VersionSet. I have also slightly changed how VersionSet keeps track of manifest size. This diff also modifies constructor of Compaction such that it takes input_version and automatically Ref()s it. Before this was done outside of constructor. In next diffs I will continue to decrease number of callsites of VersionSet::NumberLevels() and also references to current_ Test Plan: make check Reviewers: haobo, dhruba, kailiu, sdong Reviewed By: sdong Differential Revision: https://reviews.facebook.net/D15171 --- db/db_impl.cc | 26 +-- db/db_stats_logger.cc | 5 +- db/version_set.cc | 254 +++++++++++++--------------- db/version_set.h | 27 ++- db/version_set_reduce_num_levels.cc | 5 +- util/ldb_cmd.cc | 2 +- 6 files changed, 153 insertions(+), 166 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 07ac5c9d05..cffcbdfef8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1316,7 +1316,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) { int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; + if (versions_->current()->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; @@ -2233,7 +2233,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -3207,7 +3207,7 @@ Status DBImpl::MakeRoomForWrite(bool force, { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), + SlowdownAmount(versions_->current()->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, options_.level0_stop_writes_trigger) ); @@ -3242,7 +3242,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files. 
DelayLoggingAndReset(); @@ -3372,6 +3372,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3386,7 +3387,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3401,7 +3402,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), + current->NumLevelFiles(level), versions_->NumLevelBytes(level) / 1048576.0); value->append(buf); } @@ -3446,7 +3447,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { ); value->append(buf); for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3728,7 +3729,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3853,12 +3854,11 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (impl->options_.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. 
Cannot " "open with universal compaction style."); @@ -3867,6 +3867,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe38..0fd6dd8057 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,8 +65,9 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); file_total_size += versions_->NumLevelBytes(i); } diff --git a/db/version_set.cc b/db/version_set.cc index a411ea2108..b4c1b22337 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -45,7 +45,7 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs > 0); @@ -265,7 +265,7 @@ void Version::AddIterators(const ReadOptions& options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -404,17 +404,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - offset_manifest_file_(0), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -453,7 +455,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -622,7 +624,7 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } @@ -857,7 +859,7 @@ bool Version::HasOverlappingUserKey( std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 
'd'] @@ -926,20 +928,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -962,7 +962,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -985,7 +985,7 @@ class VersionSet::Builder { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < vset_->NumberLevels(); l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -998,7 +998,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1081,7 +1081,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
const std::vector& base_files = base_->files_[level]; @@ -1128,8 +1128,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1149,9 +1148,9 @@ VersionSet::VersionSet(const std::string& dbname, need_slowdown_for_num_level0_files(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), - last_observed_manifest_size_(0), + manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { + storage_options_compactions_(storage_options_) { compact_pointer_ = new std::string[options_->num_levels]; Init(options_->num_levels); AppendVersion(new Version(this, current_version_number_++)); @@ -1200,7 +1199,7 @@ void VersionSet::AppendVersion(Version* v) { current_ = v; need_slowdown_for_num_level0_files = (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && - NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1250,7 +1249,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // No need to perform this check if a new Manifest is being created anyways. if (!descriptor_log_ || - last_observed_manifest_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > options_->max_manifest_file_size) { new_descriptor_log = true; manifest_file_number_ = NewFileNumber(); // Change manifest file no. } @@ -1264,7 +1263,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // because &w is ensuring that all new writes get queued. 
{ // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1340,14 +1339,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, LogFlush(options_->info_log); mu->Lock(); - // cache the manifest_file_size so that it can be used to rollover in the - // next call to LogAndApply - last_observed_manifest_size_ = new_manifest_file_size; } // Install the new version if (s.ok()) { - v->offset_manifest_file_ = new_manifest_file_size; + manifest_file_size_ = new_manifest_file_size; AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; @@ -1459,7 +1455,7 @@ Status VersionSet::Recover() { break; } - if (edit.max_level_ >= NumberLevels()) { + if (edit.max_level_ >= current_->NumberLevels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -1520,11 +1516,11 @@ Status VersionSet::Recover() { builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); Finalize(v, size_being_compacted); - v->offset_manifest_file_ = manifest_file_size; + manifest_file_size_ = manifest_file_size; AppendVersion(v); manifest_file_number_ = next_file; next_file_number_ = next_file + 1; @@ -1548,7 +1544,7 @@ Status VersionSet::Recover() { } Status VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex) { + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1652,7 +1648,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); Finalize(v, size_being_compacted); @@ -1683,7 +1679,7 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { + std::vector& size_being_compacted) { // Pre-sort level0 for Get() if (options_->compaction_style == kCompactionStyleUniversal) { std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); @@ -1696,7 +1692,7 @@ void VersionSet::Finalize(Version* v, int num_levels_to_check = (options_->compaction_style != kCompactionStyleUniversal) ? - NumberLevels() - 1 : 1; + v->NumberLevels() - 1 : 1; for (int level = 0; level < num_levels_to_check; level++) { @@ -1757,8 +1753,8 @@ void VersionSet::Finalize(Version* v, // sort all the levels based on their score. Higher scores get listed // first. Use bubble sort because the number of entries are small. - for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { + for (int i = 0; i < v->NumberLevels() - 2; i++) { + for (int j = i + 1; j < v->NumberLevels() - 1; j++) { if (v->compaction_score_[i] < v->compaction_score_[j]) { double score = v->compaction_score_[i]; int level = v->compaction_level_[i]; @@ -1793,8 +1789,9 @@ static bool compareSeqnoDescending(const VersionSet::Fsize& first, void VersionSet::UpdateFilesBySize(Version* v) { // No need to sort the highest level because it is never compacted. 
- int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? - NumberLevels() : NumberLevels() - 1; + int max_level = (options_->compaction_style == kCompactionStyleUniversal) + ? v->NumberLevels() + : v->NumberLevels() - 1; for (int level = 0; level < max_level; level++) { @@ -1850,7 +1847,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } // Save files - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current_->NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { const FileMetaData* f = files[i]; @@ -1864,15 +1861,9 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return current_->files_[level].size(); -} - const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < current_->NumberLevels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%d ", int(current_->files_[i].size())); @@ -1884,10 +1875,10 @@ const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { +const char* VersionSet::LevelDataSizeSummary(LevelSummaryStorage* scratch) + const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < current_->NumberLevels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%lu ", (unsigned long)NumLevelBytes(i)); @@ -1950,7 +1941,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -1987,7 +1978,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -1998,7 +1989,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2008,7 +1999,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { Version* v = current_; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { live->insert(files[i]->number); @@ -2018,7 +2009,7 @@ void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { int64_t VersionSet::NumLevelBytes(int level) const { assert(level >= 0); - 
assert(level < NumberLevels()); + assert(level < current_->NumberLevels()); assert(current_); return TotalFileSize(current_->files_[level]); } @@ -2026,7 +2017,7 @@ int64_t VersionSet::NumLevelBytes(int level) const { int64_t VersionSet::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { + for (int level = 1; level < current_->NumberLevels() - 1; level++) { for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, @@ -2200,7 +2191,7 @@ void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { // The total size of files that are currently being compacted // at at every level upto the penultimate level. void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { + for (int level = 0; level < NumberLevels() - 1; level++) { uint64_t total = 0; for (std::set::iterator it = compactions_in_progress_[level].begin(); @@ -2223,8 +2214,8 @@ void VersionSet::SizeBeingCompacted(std::vector& sizes) { // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* VersionSet::PickCompactionUniversalSizeAmp( - int level, double score) { +Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, + double score) { assert (level == 0); // percentage flexibilty while reducing size amplification @@ -2306,13 +2297,13 @@ Compaction* VersionSet::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - true); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); c->score_ = score; for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { int index = file_by_time[loop]; - f = current_->files_[level][index]; + f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: size amp picking file %lu[%d] with size %lu", @@ -2436,14 +2427,14 @@ Compaction* VersionSet::PickCompactionUniversalReadAmp( } } } - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - enable_compression); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { int index = file_by_time[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", (unsigned long)f->number, @@ -2505,11 +2496,11 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { } // The files are sorted from newest first to oldest last. - std::vector& file_by_time = current_->files_by_size_[level]; + std::vector& file_by_time = c->input_version_->files_by_size_[level]; // Is the earliest file part of this compaction? 
int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = current_->files_[level][last_index]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { c->bottommost_level_ = true; } @@ -2520,9 +2511,6 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { c->inputs_[0].size()); } - c->input_version_ = current_; - c->input_version_->Ref(); - // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -2531,7 +2519,8 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = (c->inputs_[0].size() == current_->files_[0].size()); + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); return c; } @@ -2548,27 +2537,28 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels()); + assert(level + 1 < current_->NumberLevels()); + c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); c->score_ = score; // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = current_->files_by_size_[level]; + std::vector& file_size = c->input_version_->files_by_size_[level]; // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = current_->next_file_to_compact_by_size_[level]; + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; // check to verify files are arranged in descending size assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= current_->files_[level][file_size[i+1]]->file_size)); + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. 
@@ -2604,7 +2594,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } // store where to start the iteration in the next call to PickCompaction - current_->next_file_to_compact_by_size_[level] = nextIndex; + c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; return c; } @@ -2655,11 +2645,12 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels(), true); + c = new Compaction(current_, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; - current_->file_to_compact_ = nullptr; + c->input_version_->file_to_compact_ = nullptr; ExpandWhileOverlapping(c); } } @@ -2669,9 +2660,6 @@ Compaction* VersionSet::PickCompaction() { return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); - // Two level 0 compaction won't run at the same time, so don't need to worry // about files on level 0 being compacted. if (level == 0) { @@ -2682,7 +2670,8 @@ Compaction* VersionSet::PickCompaction() { // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. c->inputs_[0].clear(); - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a @@ -2713,12 +2702,13 @@ Compaction* VersionSet::PickCompaction() { // Returns true if any one of the parent files are being compacted bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { + const InternalKey* largest, int level, + int* parent_index) { std::vector inputs; - assert(level + 1 < NumberLevels()); + assert(level + 1 < current_->NumberLevels()); - current_->GetOverlappingInputs(level+1, smallest, largest, - &inputs, *parent_index, parent_index); + current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -2766,8 +2756,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0], &smallest, &largest); c->inputs_[0].clear(); - current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0], - hint_index, &hint_index); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -2805,8 +2795,9 @@ void VersionSet::SetupOtherInputs(Compaction* c) { GetRange(c->inputs_[0], &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1], - c->parent_index_, &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -2819,8 +2810,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // can happen when one user key spans multiple files. 
if (!c->inputs_[1].empty()) { std::vector expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, - c->base_index_, nullptr); + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); const uint64_t expanded0_size = TotalFileSize(expanded0); @@ -2828,13 +2819,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !current_->HasOverlappingUserKey(&expanded0, level)) { + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(options_->info_log, @@ -2861,8 +2852,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { @@ -2880,10 +2871,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { c->edit_->SetCompactPointer(level, largest); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData *meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData* meta) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2897,8 +2886,7 @@ Status VersionSet::GetMetadataForFile( return Status::NotFound("File not present in any level"); } -void VersionSet::GetLiveFilesMetaData( - std::vector * metadata) { +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2960,11 +2948,9 @@ Compaction* VersionSet::CompactRange(int input_level, } } } - Compaction* c = new Compaction(input_level, - output_level, + Compaction* c = new Compaction(current_, input_level, output_level, MaxFileSizeForLevel(output_level), - MaxGrandParentOverlapBytes(input_level), - NumberLevels()); + MaxGrandParentOverlapBytes(input_level)); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2973,8 +2959,6 @@ Compaction* VersionSet::CompactRange(int input_level, return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); SetupOtherInputs(c); if (covering_the_whole_range) { @@ -2991,15 +2975,16 @@ Compaction* VersionSet::CompactRange(int input_level, return c; } -Compaction::Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction, bool enable_compression) +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t 
max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) : level_(level), out_level_(out_level), max_output_file_size_(target_file_size), maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(nullptr), - number_levels_(number_levels), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), seek_compaction_(seek_compaction), enable_compression_(enable_compression), grandparent_index_(0), @@ -3010,7 +2995,9 @@ Compaction::Compaction(int level, int out_level, uint64_t target_file_size, score_(0), bottommost_level_(false), is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); edit_ = new VersionEdit(); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; @@ -3125,7 +3112,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) { bottommost_level_ = true; int num_levels = input_version_->vset_->NumberLevels(); for (int i = output_level() + 1; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { + if (input_version_->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; } @@ -3143,9 +3130,8 @@ void Compaction::ResetNextCompactionIndex() { input_version_->ResetNextCompactionIndex(level_); } -static void InputSummary(std::vector& files, - char* output, - int len) { +static void InputSummary(std::vector& files, char* output, + int len) { int write = 0; for (unsigned int i = 0; i < files.size(); i++) { int sz = len - write; diff --git a/db/version_set.h b/db/version_set.h index 2c91532b5c..68c41b1602 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -135,7 +135,10 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; @@ -161,6 +164,7 @@ class Version { Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -197,9 +201,6 @@ class Version { double max_compaction_score_; // max score in l1 to ln-1 int max_compaction_score_level_; // level on which max score occurs - // The offset in the manifest file where this version is stored. - uint64_t offset_manifest_file_; - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; @@ -234,7 +235,7 @@ class VersionSet { // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log = false); + bool new_descriptor_log = false); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -271,9 +272,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - // Return the combined file size of all files at the specified level. 
int64_t NumLevelBytes(int level) const; @@ -400,7 +398,7 @@ class VersionSet { const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; } + uint64_t ManifestFileSize() const { return manifest_file_size_; } // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. @@ -524,9 +522,8 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // Store the manifest file size when it is checked. - // Save us the cost of checking file size twice in LogAndApply - uint64_t last_observed_manifest_size_; + // Current size of manifest file + uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -619,9 +616,9 @@ class Compaction { friend class Version; friend class VersionSet; - explicit Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction = false, bool enable_compression = true); + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); int level_; int out_level_; // levels to which output files are stored diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index 07062399b3..2ca689809d 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -25,7 +25,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { } Version* current_version = current_; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -36,7 +36,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -65,6 +65,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; delete[] compact_pointer_; delete[] max_file_size_; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 58d81460e9..65ecd61a26 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, } int max = -1; for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.NumLevelFiles(i)) { + if (versions.current()->NumLevelFiles(i)) { max = i; } } From 2f4eda78906e5922c519f3ba49e7a3fe1bdd1403 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:18:04 -0800 Subject: [PATCH 27/30] Move functions from VersionSet to Version Summary: There were some functions in VersionSet that had no reason to be there instead of Version. Moving them to Version will make column families implementation easier. 
The functions moved are: * NumLevelBytes * LevelSummary * LevelFileSummary * MaxNextLevelOverlappingBytes * AddLiveFiles (previously AddLiveFilesCurrentVersion()) * NeedSlowdownForNumLevel0Files The diff continues on (and depends on) D15171 Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong, emayanke Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15183 --- db/db_filesnapshot.cc | 2 +- db/db_impl.cc | 50 +++++++------- db/db_impl.h | 2 +- db/db_stats_logger.cc | 6 +- db/version_set.cc | 152 ++++++++++++++++++------------------------ db/version_set.h | 67 +++++++++---------- 6 files changed, 125 insertions(+), 154 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a3..04d6d0e17a 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index cffcbdfef8..e84817b9b2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1313,13 +1313,13 @@ void DBImpl::CompactRange(const Slice* begin, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->current()->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1826,6 +1826,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(options_.max_write_buffer_number, true); @@ -1939,13 +1944,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -2605,22 +2608,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - 
compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2701,7 +2703,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -3193,9 +3195,7 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NeedSlowdownForNumLevel0Files()) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -3403,7 +3403,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f\n", level, current->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3446,7 +3446,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current->NumberLevels(); level++) { int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + @@ -3468,8 +3468,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, bytes_read / 1048576.0, diff --git a/db/db_impl.h b/db/db_impl.h index 476b2bf549..214affac7c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -129,7 +129,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 0fd6dd8057..db86865ca0 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -68,11 +68,11 @@ void DBImpl::LogDBDeployStats() { Version* current = versions_->current(); for (int i = 0; i < current->NumberLevels(); i++) { file_total_num += current->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/version_set.cc b/db/version_set.cc index b4c1b22337..eb20650ba2 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -857,6 +857,67 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; for (int level = 0; level < num_levels_; level++) { @@ -1145,7 +1206,7 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), - need_slowdown_for_num_level0_files(false), + need_slowdown_for_num_level0_files_(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), manifest_file_size_(0), @@ -1197,7 +1258,7 @@ void 
VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; - need_slowdown_for_num_level0_files = + need_slowdown_for_num_level0_files_ = (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); @@ -1861,55 +1922,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < current_->NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(current_->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary(LevelSummaryStorage* scratch) - const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < current_->NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary( - FileSummaryStorage* scratch, int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. bool VersionSet::ManifestContains(const std::string& record) const { @@ -1997,40 +2009,6 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - Version* v = current_; - for (int level = 0; level < v->NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < current_->NumberLevels()); - assert(current_); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < current_->NumberLevels() - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, - &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. 
// REQUIRES: inputs is not empty @@ -2456,10 +2434,10 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { Log(options_->info_log, "Universal: nothing to do\n"); return nullptr; } - VersionSet::FileSummaryStorage tmp; + Version::FileSummaryStorage tmp; Log(options_->info_log, "Universal: candidate files(%lu): %s\n", current_->files_[level].size(), - LevelFileSummary(&tmp, 0)); + current_->LevelFileSummary(&tmp, 0)); // Check for size amplification first. Compaction* c = PickCompactionUniversalSizeAmp(level, score); diff --git a/db/version_set.h b/db/version_set.h index 68c41b1602..51f6d9b6ca 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -140,13 +140,34 @@ class Version { // REQUIRES: lock is held int NumLevelFiles(int level) const { return files_[level].size(); } + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); + // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } private: friend class Compaction; @@ -222,10 +243,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -254,7 +273,7 @@ class VersionSet { // A Flag indicating whether write needs to slowdown because of there are // too many number of level0 files. bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files; + return need_slowdown_for_num_level0_files_; } // Return the current manifest file number @@ -272,9 +291,6 @@ class VersionSet { } } - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -321,10 +337,6 @@ class VersionSet { const InternalKey* end, InternalKey** compaction_end); - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); - // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); @@ -368,35 +380,14 @@ class VersionSet { // Add all files listed in any live version to *live. 
void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. - const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; - // Return the size of the current manifest file uint64_t ManifestFileSize() const { return manifest_file_size_; } @@ -501,7 +492,9 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_ - bool need_slowdown_for_num_level0_files; + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. From 615d1ea2f48ee7ca730cee8f06778e2f06dd0fbd Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:22:34 -0800 Subject: [PATCH 28/30] Moving Compaction class to separate header file Summary: I'm sure we'll all agree that version_set.cc needs simplifying. This diff moves Compaction class to a separate file. The diff depends on D15171 and D15183 Test Plan: make check Reviewers: dhruba, haobo, kailiu, sdong Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15189 --- db/compaction.cc | 214 ++++++++++++++++++++++++++++++++++++++++++++++ db/compaction.h | 131 ++++++++++++++++++++++++++++ db/version_set.cc | 193 +---------------------------------------- db/version_set.h | 115 +------------------------ 4 files changed, 347 insertions(+), 306 deletions(-) create mode 100644 db/compaction.cc create mode 100644 db/compaction.h diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 0000000000..703e7aeaeb --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. + return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. + if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 0000000000..4cc0197da8 --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? 
+ bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index eb20650ba2..05e7c7053f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -2953,196 +2954,4 @@ Compaction* VersionSet::CompactRange(int input_level, return c; } -Compaction::Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - bool seek_compaction, bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(input_version_->NumberLevels()), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels_)) { - - input_version_->Ref(); - edit_ = new VersionEdit(); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } 
-} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - // If level_== out_level_, the purpose is to force compaction filter to be - // applied to that level, and thus cannot be a trivia move. - return (level_ != out_level_ && - num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. 
- if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = output_level() + 1; i < num_levels; i++) { - if (input_version_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, "[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 51f6d9b6ca..319067d1ab 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,6 +27,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" namespace rocksdb { @@ -546,118 +547,4 @@ class VersionSet { VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". 
- bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? - bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, - bool seek_compaction = false, bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). 
- std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb From 787f11bb3bbd1539de1cfece609af1131e4eae9a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Jan 2014 16:23:36 -0800 Subject: [PATCH 29/30] Move more functions from VersionSet to Version Summary: This moves functions: * VersionSet::Finalize() -> Version::UpdateCompactionStats() * VersionSet::UpdateFilesBySize() -> Version::UpdateFilesBySize() The diff depends on D15189, D15183 and D15171 Test Plan: make check Reviewers: kailiu, sdong, haobo, dhruba Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D15201 --- db/version_set.cc | 315 +++++++++++++++++++++++----------------------- db/version_set.h | 27 ++-- 2 files changed, 173 insertions(+), 169 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 05e7c7053f..64ebb14275 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -590,6 +590,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. 
+ const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -1344,8 +1497,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1580,7 +1733,7 @@ Status VersionSet::Recover() { // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); manifest_file_size_ = manifest_file_size; AppendVersion(v); @@ -1712,7 +1865,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(v); manifest_file_number_ = next_file; @@ -1740,158 +1893,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? - v->NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; - } - } - - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); - } - } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < v->NumberLevels() - 2; i++) { - for (int j = i + 1; j < v->NumberLevels() - 1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) - ? v->NumberLevels() - : v->NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? @@ -2586,7 +2587,7 @@ Compaction* VersionSet::PickCompaction() { // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels()-1); current_->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(current_, size_being_compacted); + current_->Finalize(size_being_compacted); // In universal style of compaction, compact L0 files back into L0. 
if (options_->compaction_style == kCompactionStyleUniversal) { diff --git a/db/version_set.h b/db/version_set.h index 319067d1ab..8651a6eb39 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -87,6 +87,11 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // Also pre-sorts level0 files for Get() + void Finalize(std::vector& size_being_compacted); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -170,6 +175,12 @@ class Version { // Returns the version nuber of this version uint64_t GetVersionNumber() const { return version_number_; } + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; + private: friend class Compaction; friend class VersionSet; @@ -182,6 +193,10 @@ class Version { bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, const Slice& internal_prefix, Iterator* level_iter) const; + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list @@ -417,16 +432,6 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - // used to sort files by size - typedef struct fsize { - int index; - FileMetaData* file; - } Fsize; - - // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(Version *v); - // Get the max file size in a given level. uint64_t MaxFileSizeForLevel(int level); @@ -449,8 +454,6 @@ class VersionSet { void Init(int num_levels); - void Finalize(Version* v, std::vector&); - void GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest); From eae1804f29585cc643dee798a52d71569d1d90de Mon Sep 17 00:00:00 2001 From: kailiu Date: Wed, 15 Jan 2014 18:17:58 -0800 Subject: [PATCH 30/30] Remove the unnecessary use of shared_ptr Summary: shared_ptr is slower than unique_ptr (which literally comes with no performance cost compare with raw pointers). In memtable and memtable rep, we use shared_ptr when we'd actually should use unique_ptr. According to igor's previous work, we are likely to make quite some performance gain from this diff. 
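As an illustration of the ownership change (not part of the patch itself), the sketch below contrasts the two styles for an iterator returned by a factory. Iterator, VectorIterator, and NewIterator are hypothetical stand-ins for MemTableRep::Iterator and MemTableRep::GetIterator(); the point is that a std::unique_ptr caller owns the object exclusively and pays no atomic reference-count cost, which is all a single-owner call site such as MemTableIterator needs.

#include <memory>

// Hypothetical stand-ins for MemTableRep::Iterator and a concrete rep iterator.
struct Iterator {
  virtual ~Iterator() {}
  virtual void SeekToFirst() = 0;
};

struct VectorIterator : public Iterator {
  virtual void SeekToFirst() override {}
};

// Before the patch the factory would return std::shared_ptr<Iterator>, paying
// for an atomic reference count on every copy. After the patch it returns a
// raw pointer and the single caller wraps it once in std::unique_ptr.
Iterator* NewIterator() { return new VectorIterator(); }

int main() {
  std::unique_ptr<Iterator> iter(NewIterator());  // sole owner, no refcounting
  iter->SeekToFirst();
  return 0;
}  // iter is deleted exactly once when it goes out of scope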
Test Plan: make check Reviewers: dhruba, igor, sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D15213 --- db/memtable.cc | 20 ++++++------- db/memtable.h | 2 +- db/version_set.cc | 2 +- include/rocksdb/memtablerep.h | 24 +++++++--------- util/hash_skiplist_rep.cc | 54 ++++++++++++++--------------------- util/hash_skiplist_rep.h | 4 +-- util/skiplistrep.cc | 10 +++---- util/vectorrep.cc | 14 ++++----- 8 files changed, 58 insertions(+), 72 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index baff4fb340..7eb4eb165a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -85,11 +85,11 @@ class MemTableIterator: public Iterator { MemTableIterator(MemTableRep* table, const ReadOptions& options) : iter_() { if (options.prefix) { - iter_ = table->GetPrefixIterator(*options.prefix); + iter_.reset(table->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_ = table->GetDynamicPrefixIterator(); + iter_.reset(table->GetDynamicPrefixIterator()); } else { - iter_ = table->GetIterator(); + iter_.reset(table->GetIterator()); } } @@ -110,7 +110,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::shared_ptr iter_; + std::unique_ptr iter_; std::string tmp_; // For passing to EncodeKey // No copying allowed @@ -161,8 +161,8 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { Slice memkey = key.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); bool merge_in_progress = s->IsMergeInProgress(); @@ -267,8 +267,8 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(lkey.user_key())); + std::unique_ptr iter( + table_->GetIterator(lkey.user_key())); iter->Seek(memkey.data()); if (iter->Valid()) { @@ -329,8 +329,8 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // A total ordered iterator is costly for some memtablerep (prefix aware // reps). By passing in the user key, we allow efficient iterator creation. // The iterator only needs to be ordered within the same user key. - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); size_t num_successive_merges = 0; diff --git a/db/memtable.h b/db/memtable.h index 24a2c852bd..1b9005800e 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -143,7 +143,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/version_set.cc b/db/version_set.cc index 64ebb14275..22135b947f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -698,7 +698,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first, return false; } -} // anonymous namespace +} // anonymous namespace void Version::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. 
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index fcb782d415..2fca8d1610 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -111,27 +111,23 @@ class MemTableRep {
   };

   // Return an iterator over the keys in this representation.
-  virtual std::shared_ptr<Iterator> GetIterator() = 0;
+  virtual Iterator* GetIterator() = 0;

   // Return an iterator over at least the keys with the specified user key. The
   // iterator may also allow access to other keys, but doesn't have to. Default:
   // GetIterator().
-  virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) {
-    return GetIterator();
-  }
+  virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); }

   // Return an iterator over at least the keys with the specified prefix. The
   // iterator may also allow access to other keys, but doesn't have to. Default:
   // GetIterator().
-  virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) {
+  virtual Iterator* GetPrefixIterator(const Slice& prefix) {
     return GetIterator();
   }

   // Return an iterator that has a special Seek semantics. The result of
   // a Seek might only include keys with the same prefix as the target key.
-  virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
-    return GetIterator();
-  }
+  virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); }

  protected:
   // When *key is an internal key concatenated with the value, returns the
@@ -144,8 +140,8 @@ class MemTableRep {
 class MemTableRepFactory {
  public:
   virtual ~MemTableRepFactory() { };
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator&, Arena*) = 0;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
+                                         Arena*) = 0;
   virtual const char* Name() const = 0;
 };

@@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory {
   const size_t count_;
 public:
   explicit VectorRepFactory(size_t count = 0) : count_(count) { }
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator&, Arena*) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
+                                         Arena*) override;
   virtual const char* Name() const override {
     return "VectorRepFactory";
   }
@@ -171,8 +167,8 @@ public:

 // This uses a skip list to store keys. It is the default.
 class SkipListFactory : public MemTableRepFactory {
 public:
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator&, Arena*) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
+                                         Arena*) override;
   virtual const char* Name() const override {
     return "SkipListFactory";
   }
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
index c669769e09..e9fe1573aa 100644
--- a/util/hash_skiplist_rep.cc
+++ b/util/hash_skiplist_rep.cc
@@ -31,17 +31,15 @@ class HashSkipListRep : public MemTableRep {

   virtual ~HashSkipListRep();

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
-      const Slice& slice) override;
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator(
-      const Slice& prefix) override;
-
-  virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
+  virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
       override;
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
  private:
   friend class DynamicIterator;

   typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket;
@@ -208,18 +206,15 @@ class HashSkipListRep : public MemTableRep {
     virtual void SeekToLast() { }
    private:
   };
-
-  std::shared_ptr<EmptyIterator> empty_iterator_;
 };

 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
-    Arena* arena, const SliceTransform* transform, size_t bucket_size)
-  : bucket_size_(bucket_size),
-    transform_(transform),
-    compare_(compare),
-    arena_(arena),
-    empty_iterator_(std::make_shared<EmptyIterator>()) {
-
+                                 Arena* arena, const SliceTransform* transform,
+                                 size_t bucket_size)
+    : bucket_size_(bucket_size),
+      transform_(transform),
+      compare_(compare),
+      arena_(arena) {
   buckets_ = new port::AtomicPointer[bucket_size];

   for (size_t i = 0; i < bucket_size_; ++i) {
@@ -263,7 +258,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() {
   return sizeof(buckets_);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
+MemTableRep::Iterator* HashSkipListRep::GetIterator() {
   auto list = new Bucket(compare_, arena_);
   for (size_t i = 0; i < bucket_size_; ++i) {
     auto bucket = GetBucket(i);
@@ -274,35 +269,30 @@ std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
       }
     }
   }
-  return std::make_shared<Iterator>(list);
+  return new Iterator(list);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetPrefixIterator(
-    const Slice& prefix) {
+MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) {
   auto bucket = GetBucket(prefix);
   if (bucket == nullptr) {
-    return empty_iterator_;
+    return new EmptyIterator();
   }
-  return std::make_shared<Iterator>(bucket, false);
+  return new Iterator(bucket, false);
 }

-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator(
-    const Slice& slice) {
+MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
   return GetPrefixIterator(transform_->Transform(slice));
 }

-std::shared_ptr<MemTableRep::Iterator>
-  HashSkipListRep::GetDynamicPrefixIterator() {
-  return std::make_shared<DynamicIterator>(*this);
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
 }

 } // anon namespace

-std::shared_ptr<MemTableRep>
-HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare,
-                                          Arena *arena) {
-  return std::make_shared<HashSkipListRep>(compare, arena, transform_,
-                                           bucket_count_);
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_);
 }

 MemTableRepFactory* NewHashSkipListRepFactory(
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index b946cf05ef..7b8414c887 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -21,8 +21,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
   virtual ~HashSkipListRepFactory() { delete transform_; }

-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator& compare, Arena* arena) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
+                                         Arena* arena) override;

   virtual const char* Name() const override {
     return "HashSkipListRepFactory";
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
index 955d754b15..a5b072ad16 100644
--- a/util/skiplistrep.cc
+++ b/util/skiplistrep.cc
@@ -90,15 +90,15 @@ public:
   // Unhide default implementations of GetIterator
   using MemTableRep::GetIterator;

-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override {
-    return std::make_shared<SkipListRep::Iterator>(&skip_list_);
+  virtual MemTableRep::Iterator* GetIterator() override {
+    return new SkipListRep::Iterator(&skip_list_);
   }
 };
 }

-std::shared_ptr<MemTableRep> SkipListFactory::CreateMemTableRep (
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::shared_ptr<MemTableRep>(new SkipListRep(compare, arena));
+MemTableRep* SkipListFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new SkipListRep(compare, arena);
 }

 } // namespace rocksdb
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
index 8d3ccc9dfb..87fae4bc72 100644
--- a/util/vectorrep.cc
+++ b/util/vectorrep.cc
@@ -88,7 +88,7 @@ class VectorRep : public MemTableRep {
   using MemTableRep::GetIterator;

   // Return an iterator over the keys in this representation.
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;

  private:
   friend class Iterator;
@@ -228,22 +228,22 @@ void VectorRep::Iterator::SeekToLast() {
   }
 }

-std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() {
+MemTableRep::Iterator* VectorRep::GetIterator() {
   ReadLock l(&rwlock_);
   // Do not sort here. The sorting would be done the first time
   // a Seek is performed on the iterator.
   if (immutable_) {
-    return std::make_shared<Iterator>(this, bucket_, compare_);
+    return new Iterator(this, bucket_, compare_);
   } else {
     std::shared_ptr<Bucket> tmp;
     tmp.reset(new Bucket(*bucket_)); // make a copy
-    return std::make_shared<Iterator>(nullptr, tmp, compare_);
+    return new Iterator(nullptr, tmp, compare_);
   }
 }
 } // anon namespace

-std::shared_ptr<MemTableRep> VectorRepFactory::CreateMemTableRep(
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::make_shared<VectorRep>(compare, arena, count_);
+MemTableRep* VectorRepFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new VectorRep(compare, arena, count_);
 }
 } // namespace rocksdb
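One behavioral consequence of the hash_skiplist_rep.cc hunk is easy to miss: the
cached empty_iterator_ member is gone, so a prefix miss in GetPrefixIterator()
now allocates a fresh EmptyIterator that the caller owns and must release,
exactly like any other iterator. The call sites updated in db/memtable.cc handle
this by wrapping every returned iterator immediately. A rough consumer-side
sketch of that idiom, using hypothetical Iter/VecIter/NewVecIter names rather
than the real RocksDB APIs:

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

// Illustrative iterator interface standing in for MemTableRep::Iterator.
class Iter {
 public:
  virtual ~Iter() = default;
  virtual bool Valid() const = 0;
  virtual void Next() = 0;
  virtual const std::string& key() const = 0;
};

class VecIter : public Iter {
 public:
  explicit VecIter(const std::vector<std::string>* v) : v_(v) {}
  bool Valid() const override { return pos_ < v_->size(); }
  void Next() override { ++pos_; }
  const std::string& key() const override { return (*v_)[pos_]; }

 private:
  const std::vector<std::string>* v_;
  std::size_t pos_ = 0;
};

// Factory-style helper returning a raw pointer, like the patched GetIterator()
// overloads; the caller takes ownership.
Iter* NewVecIter(const std::vector<std::string>* v) { return new VecIter(v); }

int main() {
  std::vector<std::string> keys = {"a", "b", "c"};
  // Call-site pattern after the patch: wrap the raw pointer right away,
  // mirroring iter_.reset(table->GetIterator()) in MemTableIterator.
  std::unique_ptr<Iter> it(NewVecIter(&keys));
  for (; it->Valid(); it->Next()) {
    // use it->key()
  }
  return 0;  // the iterator is released exactly once here
}

Wrapping at the call site keeps cleanup automatic while paying only the cost of
a plain pointer, which is the gain the summary of this patch is after.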