From bd998a5213b8e05c4e130acd2ec5c01f67abcc6e Mon Sep 17 00:00:00 2001 From: Bart Trojanowski Date: Tue, 3 Dec 2013 16:27:12 -0500 Subject: [PATCH 01/40] fix missing gflags library On Debian/testing and RHEL6 builds would fail due to undefined references to google::FlagRegisterer::FlagRegisterer. It would seem that -lgflags was missing from the build script. --- build_tools/build_detect_platform | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index dfe89963a8..5d2434539a 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -184,6 +184,7 @@ EOF EOF if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" fi # Test whether zlib library is installed From 0a5ec49895feef55b00f38a6bf2ecf031c120956 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 6 Dec 2013 16:10:43 -0800 Subject: [PATCH 02/40] Make DBWithTTL more like StackableDB Summary: Now DBWithTTL takes DB* and can behave more like StackableDB. 
This saves us a lot of duplicate work by defining interfaces Test Plan: ttl_test with ASAN - OK Reviewers: emayanke Reviewed By: emayanke CC: leveldb Differential Revision: https://reviews.facebook.net/D14481 --- .../string_append/stringappend_test.cc | 7 +- utilities/ttl/db_ttl.cc | 147 ++++-------------- utilities/ttl/db_ttl.h | 81 ++-------- 3 files changed, 46 insertions(+), 189 deletions(-) diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 216dbe84e7..81af64622e 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -41,9 +41,7 @@ std::shared_ptr OpenTtlDb(char delim_char) { Options options; options.create_if_missing = true; options.merge_operator.reset(new StringAppendTESTOperator(delim_char)); - Status s; - db = new DBWithTTL(123456, options, kDbName, s, false); - ASSERT_OK(s); + ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 123456)); return std::shared_ptr(db); } @@ -53,6 +51,7 @@ class StringLists { public: //Constructor: specifies the rocksdb db + /* implicit */ StringLists(std::shared_ptr db) : db_(db), merge_option_(), @@ -75,7 +74,7 @@ class StringLists { // Returns the list of strings associated with key (or "" if does not exist) bool Get(const std::string& key, std::string* const result){ - assert(result != NULL); // we should have a place to store the result + assert(result != nullptr); // we should have a place to store the result auto s = db_->Get(get_option_, key, result); if (s.ok()) { diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index ee4a948b9d..5b704930b2 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -10,40 +10,27 @@ namespace rocksdb { -// Open the db inside DBWithTTL because options needs pointer to its ttl -DBWithTTL::DBWithTTL(const int32_t ttl, - const Options& options, - const std::string& dbname, - Status& 
st, - bool read_only) - : StackableDB(nullptr) { - Options options_to_open = options; - - if (options.compaction_filter) { - ttl_comp_filter_.reset( - new TtlCompactionFilter(ttl, options.compaction_filter)); - options_to_open.compaction_filter = ttl_comp_filter_.get(); +void DBWithTTL::SanitizeOptions(int32_t ttl, Options* options) { + if (options->compaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, options->compaction_filter); } else { - options_to_open.compaction_filter_factory = - std::shared_ptr( - new TtlCompactionFilterFactory( - ttl, options.compaction_filter_factory)); + options->compaction_filter_factory = + std::shared_ptr(new TtlCompactionFilterFactory( + ttl, options->compaction_filter_factory)); } - if (options.merge_operator) { - options_to_open.merge_operator.reset( - new TtlMergeOperator(options.merge_operator)); - } - - if (read_only) { - st = DB::OpenForReadOnly(options_to_open, dbname, &db_); - } else { - st = DB::Open(options_to_open, dbname, &db_); + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator)); } } +// Open the db inside DBWithTTL because options needs pointer to its ttl +DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {} + DBWithTTL::~DBWithTTL() { - delete db_; + delete GetOptions().compaction_filter; } Status UtilityDB::OpenTtlDB( @@ -53,9 +40,19 @@ Status UtilityDB::OpenTtlDB( int32_t ttl, bool read_only) { Status st; - *dbptr = new DBWithTTL(ttl, options, dbname, st, read_only); - if (!st.ok()) { - delete *dbptr; + Options options_to_open = options; + DBWithTTL::SanitizeOptions(ttl, &options_to_open); + DB* db; + + if (read_only) { + st = DB::OpenForReadOnly(options_to_open, dbname, &db); + } else { + st = DB::Open(options_to_open, dbname, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTL(db); + } else { + delete db; } return st; } @@ -122,10 +119,8 @@ Status DBWithTTL::StripTS(std::string* str) { return st; } -Status DBWithTTL::Put( - 
const WriteOptions& opt, - const Slice& key, - const Slice& val) { +Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, + const Slice& val) { WriteBatch batch; batch.Put(key, val); return Write(opt, &batch); @@ -166,10 +161,6 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options, return ret; } -Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) { - return db_->Delete(wopts, key); -} - Status DBWithTTL::Merge(const WriteOptions& opt, const Slice& key, const Slice& value) { @@ -221,86 +212,6 @@ Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { return new TtlIterator(db_->NewIterator(opts)); } -const Snapshot* DBWithTTL::GetSnapshot() { - return db_->GetSnapshot(); -} - -void DBWithTTL::ReleaseSnapshot(const Snapshot* snapshot) { - db_->ReleaseSnapshot(snapshot); -} - -bool DBWithTTL::GetProperty(const Slice& property, std::string* value) { - return db_->GetProperty(property, value); -} - -void DBWithTTL::GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - db_->GetApproximateSizes(r, n, sizes); -} - -void DBWithTTL::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { - db_->CompactRange(begin, end, reduce_level, target_level); -} - -int DBWithTTL::NumberLevels() { - return db_->NumberLevels(); -} - -int DBWithTTL::MaxMemCompactionLevel() { - return db_->MaxMemCompactionLevel(); -} - -int DBWithTTL::Level0StopWriteTrigger() { - return db_->Level0StopWriteTrigger(); -} - -Env* DBWithTTL::GetEnv() const { - return db_->GetEnv(); -} - -const Options& DBWithTTL::GetOptions() const { - return db_->GetOptions(); -} - -Status DBWithTTL::Flush(const FlushOptions& fopts) { - return db_->Flush(fopts); -} - -Status DBWithTTL::DisableFileDeletions() { - return db_->DisableFileDeletions(); -} - -Status DBWithTTL::EnableFileDeletions() { - return db_->EnableFileDeletions(); -} - -Status DBWithTTL::GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable) { - return 
db_->GetLiveFiles(vec, mfs, flush_memtable); -} - -SequenceNumber DBWithTTL::GetLatestSequenceNumber() const { - return db_->GetLatestSequenceNumber(); -} - -Status DBWithTTL::GetSortedWalFiles(VectorLogPtr& files) { - return db_->GetSortedWalFiles(files); -} - -Status DBWithTTL::DeleteFile(std::string name) { - return db_->DeleteFile(name); -} - -Status DBWithTTL::GetDbIdentity(std::string& identity) { - return db_->GetDbIdentity(identity); -} - -Status DBWithTTL::GetUpdatesSince( - SequenceNumber seq_number, - unique_ptr* iter) { - return db_->GetUpdatesSince(seq_number, iter); -} - void DBWithTTL::TEST_Destroy_DBWithTtl() { ((DBImpl*) db_)->TEST_Destroy_DBImpl(); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index c5270764e0..2fdc664e21 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -14,82 +14,33 @@ namespace rocksdb { class DBWithTTL : public StackableDB { public: - DBWithTTL(const int32_t ttl, - const Options& options, - const std::string& dbname, - Status& st, - bool read_only); + static void SanitizeOptions(int32_t ttl, Options* options); + + explicit DBWithTTL(DB* db); virtual ~DBWithTTL(); - virtual Status Put(const WriteOptions& o, - const Slice& key, - const Slice& val); + virtual Status Put(const WriteOptions& o, const Slice& key, + const Slice& val) override; - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override; - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values); + virtual std::vector MultiGet( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override; virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) override; - virtual Status Delete(const WriteOptions& wopts, const Slice& key); + virtual Status Merge(const WriteOptions& 
options, const Slice& key, + const Slice& value) override; - virtual Status Merge(const WriteOptions& options, - const Slice& key, - const Slice& value); + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; - - virtual Status Write(const WriteOptions& opts, WriteBatch* updates); - - virtual Iterator* NewIterator(const ReadOptions& opts); - - virtual const Snapshot* GetSnapshot(); - - virtual void ReleaseSnapshot(const Snapshot* snapshot); - - virtual bool GetProperty(const Slice& property, std::string* value); - - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes); - - virtual void CompactRange(const Slice* begin, const Slice* end, - bool reduce_level = false, int target_level = -1); - - virtual int NumberLevels(); - - virtual int MaxMemCompactionLevel(); - - virtual int Level0StopWriteTrigger(); - - virtual Env* GetEnv() const; - - virtual const Options& GetOptions() const; - - virtual Status Flush(const FlushOptions& fopts); - - virtual Status DisableFileDeletions(); - - virtual Status EnableFileDeletions(); - - virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable = true); - - virtual Status GetSortedWalFiles(VectorLogPtr& files); - - virtual Status DeleteFile(std::string name); - - virtual Status GetDbIdentity(std::string& identity); - - virtual SequenceNumber GetLatestSequenceNumber() const; - - virtual Status GetUpdatesSince(SequenceNumber seq_number, - unique_ptr* iter); + virtual Iterator* NewIterator(const ReadOptions& opts) override; // Simulate a db crash, no elegant closing of database. 
void TEST_Destroy_DBWithTtl(); @@ -113,10 +64,6 @@ class DBWithTTL : public StackableDB { static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 - - private: - DB* db_; - unique_ptr ttl_comp_filter_; }; class TtlIterator : public Iterator { From 07c8448845b772af153c91b2eef9752a0bf93b66 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 6 Dec 2013 16:22:38 -0800 Subject: [PATCH 03/40] Enable regression tests to be run on other branches Summary: When running regression tests on other branches, this will push values to entity rocksdb_build.$git_branch Test Plan: Ran regression test on regression branch, observed values send to ODS in entity rocksdb_build.regression Reviewers: kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D14493 --- build_tools/regression_build_test.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 1c44e5ad27..eb89f6a9ea 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -26,6 +26,12 @@ function cleanup { } trap cleanup EXIT +git_branch=$(git rev-parse --abbrev-ref HEAD) +if [ $git_branch == "master" ]; then + git_branch="" +else + git_branch="."$git_branch +fi make clean OPT=-DNDEBUG make db_bench -j$(nproc) @@ -150,7 +156,7 @@ function send_to_ods { echo >&2 "ERROR: Key $key doesn't have a value." 
return fi - curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_branch&key=$key&value=$value" \ --connect-timeout 60 } From 9644e0e0c7892dcbf4c27a34890a74097be3a17a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 6 Dec 2013 17:11:09 -0800 Subject: [PATCH 04/40] Print stack trace on assertion failure Summary: This will help me a lot! When we hit an assertion in unittest, we get the whole stack trace now. Also, changed stack trace a bit, we now include actual demangled C++ class::function symbols! Test Plan: Added ASSERT_TRUE(false) to a test, observed a stack trace Reviewers: haobo, dhruba, kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D14499 --- port/stack_trace.cc | 32 ++++++++++++++++---------------- util/stack_trace.h | 3 +++ util/testharness.h | 2 ++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/port/stack_trace.cc b/port/stack_trace.cc index a98f26eacf..aa01fd0cf3 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -31,12 +31,7 @@ static const char* GetExecutableName() } } -static void StackTraceHandler(int sig) { - // reset to default handler - signal(sig, SIG_DFL); - - fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); - +void PrintStack(int first_frames_to_skip) { const int kMaxFrames = 100; void *frames[kMaxFrames]; @@ -45,11 +40,8 @@ static void StackTraceHandler(int sig) { auto executable = GetExecutableName(); - const int kSkip = 2; // skip the top two signal handler related frames - - for (int i = kSkip; i < num_frames; ++i) - { - fprintf(stderr, "#%-2d %p ", i - kSkip, frames[i]); + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); if (symbols) { fprintf(stderr, "%s ", symbols[i]); } @@ -57,22 +49,29 @@ static void StackTraceHandler(int sig) { // 
out source to addr2line, for the address translation const int kLineMax = 256; char cmd[kLineMax]; - sprintf(cmd,"addr2line %p -e %s 2>&1", frames[i] , executable); + sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable); auto f = popen(cmd, "r"); if (f) { char line[kLineMax]; while (fgets(line, sizeof(line), f)) { - fprintf(stderr, "%s", line); + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); } pclose(f); - } else { - fprintf(stderr, "\n"); } } else { - fprintf(stderr, "\n"); + fprintf(stderr, " %p", frames[i]); } + fprintf(stderr, "\n"); } +} +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); // re-signal to default handler (so we still get core dump if needed...) raise(sig); } @@ -96,6 +95,7 @@ void InstallStackTraceHandler() { namespace rocksdb { void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} } diff --git a/util/stack_trace.h b/util/stack_trace.h index 888304462e..3b06e1df06 100644 --- a/util/stack_trace.h +++ b/util/stack_trace.h @@ -11,4 +11,7 @@ namespace rocksdb { // Currently supports linux only. No-op otherwise. 
void InstallStackTraceHandler(); +// Prints stack, skips skip_first_frames frames +void PrintStack(int first_frames_to_skip = 0); + } // namespace rocksdb diff --git a/util/testharness.h b/util/testharness.h index 936ee8b6c4..f15917816e 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -15,6 +15,7 @@ #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" +#include "util/stack_trace.h" namespace rocksdb { namespace test { @@ -58,6 +59,7 @@ class Tester { ~Tester() { if (!ok_) { fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + PrintStack(2); exit(1); } } From 26bc40a89a077339ee3f76ea1f88a49e194525bf Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 9 Dec 2013 10:36:39 -0800 Subject: [PATCH 05/40] Fixing git branch detection in Jenkins Branch detection did not work in Jenkins. I realized that it set GIT_BRANCH env variable to point to the current branch, so let's try using this for branch detection. --- build_tools/regression_build_test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index eb89f6a9ea..b0140ef48f 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -26,11 +26,11 @@ function cleanup { } trap cleanup EXIT -git_branch=$(git rev-parse --abbrev-ref HEAD) -if [ $git_branch == "master" ]; then - git_branch="" +git_br=$(basename $GIT_BRANCH) +if [ $git_br == "master" ]; then + git_br="" else - git_branch="."$git_branch + git_br="."$git_br fi make clean @@ -156,7 +156,7 @@ function send_to_ods { echo >&2 "ERROR: Key $key doesn't have a value." 
return fi - curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_branch&key=$key&value=$value" \ + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ --connect-timeout 60 } From fb9fce4fc30b368fcd701d6566e441ff93767326 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 9 Dec 2013 14:06:52 -0800 Subject: [PATCH 06/40] [RocksDB] BackupableDB Summary: In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you. Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes. There are multiple things you can configure: 1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like. 2. sync - if true, it *guarantees* backup consistency on machine reboot 3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files. 4. 
You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup. 5. More things you can find in BackupableDBOptions Here is the directory structure I use: backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot 0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files All the files are ref counted and deleted immediatelly when they get out of scope. Some other stuff in this diff: 1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do. 2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB. Test Plan: I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff. 
Also, `make asan_check` Reviewers: dhruba, haobo, emayanke Reviewed By: dhruba CC: leveldb, haobo Differential Revision: https://reviews.facebook.net/D14295 --- Makefile | 4 + db/db_impl.cc | 4 + db/db_impl.h | 1 + db/db_test.cc | 5 + include/rocksdb/db.h | 4 + include/utilities/backupable_db.h | 128 ++++ include/utilities/stackable_db.h | 4 + util/coding.cc | 11 + util/coding.h | 2 + utilities/backupable/backupable_db.cc | 821 +++++++++++++++++++++ utilities/backupable/backupable_db_test.cc | 625 ++++++++++++++++ 11 files changed, 1609 insertions(+) create mode 100644 include/utilities/backupable_db.h create mode 100644 utilities/backupable/backupable_db.cc create mode 100644 utilities/backupable/backupable_db_test.cc diff --git a/Makefile b/Makefile index be7758de96..b320b58901 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,7 @@ TESTS = \ skiplist_test \ stringappend_test \ ttl_test \ + backupable_db_test \ version_edit_test \ version_set_test \ write_batch_test\ @@ -272,6 +273,9 @@ perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_impl.cc b/db/db_impl.cc index 9061adecc8..697d0017b5 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3176,6 +3176,10 @@ Status DBImpl::MakeRoomForWrite(bool force) { return s; } +const std::string& DBImpl::GetName() const { + return dbname_; +} + Env* DBImpl::GetEnv() const { return env_; } diff --git a/db/db_impl.h b/db/db_impl.h index 
d7a346b6ea..0591839403 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -67,6 +67,7 @@ class DBImpl : public DB { virtual int NumberLevels(); virtual int MaxMemCompactionLevel(); virtual int Level0StopWriteTrigger(); + virtual const std::string& GetName() const; virtual Env* GetEnv() const; virtual const Options& GetOptions() const; virtual Status Flush(const FlushOptions& options); diff --git a/db/db_test.cc b/db/db_test.cc index fea7f1e1b4..8cfdedd5e3 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4444,6 +4444,10 @@ class ModelDB: public DB { return -1; } + virtual const std::string& GetName() const { + return name_; + } + virtual Env* GetEnv() const { return nullptr; } @@ -4521,6 +4525,7 @@ class ModelDB: public DB { }; const Options options_; KVMap map_; + std::string name_ = ""; }; static std::string RandomKey(Random* rnd, int minimum = 0) { diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 7396f84454..c4c5aa87fe 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -228,6 +228,10 @@ class DB { // Number of files in level-0 that would stop writes. virtual int Level0StopWriteTrigger() = 0; + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + // Get Env object from the DB virtual Env* GetEnv() const = 0; diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h new file mode 100644 index 0000000000..b90c3e93a3 --- /dev/null +++ b/include/utilities/backupable_db.h @@ -0,0 +1,128 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#include +#include +#include + +namespace rocksdb { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. 
+ // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + explicit BackupableDBOptions(const std::string& _backup_dir, + Env* _backup_env = nullptr, + Logger* _info_log = nullptr, + bool _sync = true, + bool _destroy_old_data = false) : + backup_dir(_backup_dir), + backup_env(_backup_env), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data) { } +}; + +class BackupEngine; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + BackupInfo() {} + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} +}; + +// Stack your DB with BackupableDB to be able to backup the DB +class BackupableDB : public StackableDB { + public: + // BackupableDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + BackupableDB(DB* db, const BackupableDBOptions& options); + virtual ~BackupableDB(); + + // Captures the state of the database in the latest backup + // NOT a thread safe call + Status CreateNewBackup(bool flush_before_backup = false); + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +// Use this class to access information about backups and restore from them +class RestoreBackupableDB { + public: + RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options); + ~RestoreBackupableDB(); + + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + + // restore from backup with backup_id + // IMPORTANT -- if you restore from some backup that is not the latest, + // you 
HAVE to delete all the newer backups immediately, before creating + // new backup on the restored database. Otherwise, your new backups + // will be corrupted. + // TODO should we enforce this somehow? + Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir); + + // restore from the latest backup + Status RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +} // rocksdb namespace diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index e74bf353b4..2d86a611b7 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -103,6 +103,10 @@ class StackableDB : public DB { return db_->Level0StopWriteTrigger(); } + virtual const std::string& GetName() const override { + return db_->GetName(); + } + virtual Env* GetEnv() const override { return db_->GetEnv(); } diff --git a/util/coding.cc b/util/coding.cc index 2d70647fb7..ce67fa4866 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -217,6 +217,17 @@ Slice GetLengthPrefixedSlice(const char* data) { return Slice(p, len); } +Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); + return ret; +} + void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, uint32_t bits, uint64_t value) { assert((offset + bits + 7)/8 <= dstlen); diff --git a/util/coding.h b/util/coding.h index 3fd892f791..4477dc799e 100644 --- a/util/coding.h +++ b/util/coding.h @@ -40,6 +40,8 @@ extern bool GetVarint64(Slice* input, uint64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); extern Slice GetLengthPrefixedSlice(const char* data); +extern Slice GetSliceUntil(Slice* slice, char delimiter); + // Pointer-based variants of GetVarint... These either store a value // in *v and return a pointer just past the parsed value, or return // nullptr on error. These routines only look at bytes in the range diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc new file mode 100644 index 0000000000..3e87b02c54 --- /dev/null +++ b/utilities/backupable/backupable_db.cc @@ -0,0 +1,821 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "utilities/backupable_db.h" +#include "db/filename.h" +#include "util/coding.h" +#include "rocksdb/transaction_log.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// -------- BackupEngine class --------- +class BackupEngine { + public: + BackupEngine(Env* db_env, const BackupableDBOptions& options); + ~BackupEngine(); + Status CreateNewBackup(DB* db, bool flush_before_backup = false); + Status PurgeOldBackups(uint32_t num_backups_to_keep); + Status DeleteBackup(BackupID backup_id); + + void GetBackupInfo(std::vector* backup_info); + Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir, + const std::string &wal_dir); + Status RestoreDBFromLatestBackup(const std::string &db_dir, + const std::string &wal_dir) { + return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir); + } + + private: + class BackupMeta { + public: + BackupMeta(const std::string& meta_filename, + std::unordered_map* file_refs, Env* env) + : timestamp_(0), size_(0), meta_filename_(meta_filename), + file_refs_(file_refs), env_(env) {} + + ~BackupMeta() {} + + void RecordTimestamp() { + env_->GetCurrentTime(×tamp_); + } + int64_t GetTimestamp() const { + return timestamp_; + } + uint64_t GetSize() const { + return size_; + } + + void AddFile(const std::string& filename, uint64_t size); + void Delete(); + + bool Empty() { + return files_.empty(); + } + + const std::vector& GetFiles() { + return files_; + } + + Status LoadFromFile(const std::string& backup_dir); + Status StoreToFile(bool sync); + + private: + int64_t timestamp_; + uint64_t size_; + std::string const meta_filename_; + // files with relative paths (without "/" prefix!!) 
+ std::vector files_; + std::unordered_map* file_refs_; + Env* env_; + }; // BackupMeta + + inline std::string GetAbsolutePath( + const std::string &relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateDirRel() const { + return "private"; + } + inline std::string GetPrivateFileRel(BackupID backup_id, + const std::string &file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return "shared/" + file; + } + inline std::string GetLatestBackupFile(bool tmp = false) const { + return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : "")); + } + inline std::string GetBackupMetaDir() const { + return GetAbsolutePath("meta"); + } + inline std::string GetBackupMetaFile(BackupID backup_id) const { + return GetBackupMetaDir() + "/" + std::to_string(backup_id); + } + + Status GetLatestBackupFileContents(uint32_t* latest_backup); + Status PutLatestBackupFileContents(uint32_t latest_backup); + // if size_limit == 0, there is no size limit, copy everything + Status CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + uint64_t* size = nullptr, + uint64_t size_limit = 0); + // if size_limit == 0, there is no size limit, copy everything + Status BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, // starts with "/" + uint64_t size_limit = 0); + // Will delete all the files we don't need anymore + // If full_scan == true, it will do the full scan of files/ directory + // and delete all the files that are not referenced from backuped_file_refs_ + void GarbageCollection(bool full_scan); + + // backup state data 
+ BackupID latest_backup_id_; + std::map backups_; + std::unordered_map backuped_file_refs_; + std::vector obsolete_backups_; + + // options data + BackupableDBOptions options_; + Env* db_env_; + Env* backup_env_; + + // constants + static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB + static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB +}; + +BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) + : options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_) { + + // create all the dirs we need + backup_env_->CreateDirIfMissing(GetAbsolutePath()); + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); + backup_env_->CreateDirIfMissing(GetBackupMetaDir()); + + std::vector backup_meta_files; + backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files); + // create backups_ structure + for (auto& file : backup_meta_files) { + BackupID backup_id = 0; + sscanf(file.c_str(), "%u", &backup_id); + if (backup_id == 0 || file != std::to_string(backup_id)) { + // invalid file name, delete that + backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + backups_.insert(std::make_pair( + backup_id, BackupMeta(GetBackupMetaFile(backup_id), + &backuped_file_refs_, backup_env_))); + } + + if (options_.destroy_old_data) { // Destory old data + for (auto& backup : backups_) { + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + backups_.clear(); + // start from beginning + latest_backup_id_ = 0; + // GarbageCollection() will do the actual deletion + } else { // Load data from storage + // load the backups if any + for (auto& backup : backups_) { + Status s = backup.second.LoadFromFile(options_.backup_dir); + if (!s.ok()) { + Log(options_.info_log, "Backup %u corrupted - deleting 
-- %s", + backup.first, s.ToString().c_str()); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + // delete obsolete backups from the structure + for (auto ob : obsolete_backups_) { + backups_.erase(ob); + } + + Status s = GetLatestBackupFileContents(&latest_backup_id_); + // If latest backup file is corrupted or non-existent + // set latest backup as the biggest backup we have + // or 0 if we have no backups + if (!s.ok() || + backups_.find(latest_backup_id_) == backups_.end()) { + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first; + } + } + + // delete any backups that claim to be later than latest + for (auto itr = backups_.upper_bound(latest_backup_id_); + itr != backups_.end();) { + itr->second.Delete(); + obsolete_backups_.push_back(itr->first); + itr = backups_.erase(itr); + } + + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(true); + Log(options_.info_log, + "Initialized BackupEngine, the latest backup is %u.", + latest_backup_id_); +} + +BackupEngine::~BackupEngine() { + LogFlush(options_.info_log); +} + +Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { + Status s; + std::vector live_files; + VectorLogPtr live_wal_files; + uint64_t manifest_file_size = 0; + + s = db->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup); + } + // if we didn't flush before backup, we need to also get WAL files + if (s.ok() && !flush_before_backup) { + // returns file names prefixed with "/" + s = db->GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + db->EnableFileDeletions(); + return s; + } + + BackupID new_backup_id = latest_backup_id_ + 1; + assert(backups_.find(new_backup_id) == backups_.end()); + auto ret = backups_.insert(std::make_pair( + new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), + 
&backuped_file_refs_, backup_env_))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup.RecordTimestamp(); + + Log(options_.info_log, "Started the backup process -- creating backup %u", + new_backup_id); + + // create private dir + s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id))); + + // copy live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + assert(ok); + // we should only get sst, manifest and current files here + assert(type == kTableFile || + type == kDescriptorFile || + type == kCurrentFile); + + // rules: + // * if it's kTableFile, than it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + s = BackupFile(new_backup_id, + &new_backup, + type == kTableFile, /* shared */ + db->GetName(), /* src_dir */ + live_files[i], /* src_fname */ + (type == kDescriptorFile) ? manifest_file_size : 0); + } + + // copy WAL files + for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) { + if (live_wal_files[i]->Type() == kAliveLogFile) { + // we only care about live log files + // copy the file into backup_dir/files// + s = BackupFile(new_backup_id, + &new_backup, + false, /* not shared */ + db->GetOptions().wal_dir, + live_wal_files[i]->PathName()); + } + } + + // we copied all the files, enable file deletions + db->EnableFileDeletions(); + + if (s.ok()) { + // persist the backup metadata on the disk + s = new_backup.StoreToFile(options_.sync); + } + if (s.ok()) { + // install the newly created backup meta! 
(atomic) + s = PutLatestBackupFileContents(new_backup_id); + } + if (!s.ok()) { + // clean all the files we might have created + Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + backups_.erase(new_backup_id); + GarbageCollection(true); + return s; + } + + // here we know that we succeeded and installed the new backup + // in the LATEST_BACKUP file + latest_backup_id_ = new_backup_id; + Log(options_.info_log, "Backup DONE. All is good"); + return s; +} + +Status BackupEngine::PurgeOldBackups(uint32_t num_backups_to_keep) { + Log(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + while (num_backups_to_keep < backups_.size()) { + Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); + backups_.begin()->second.Delete(); + obsolete_backups_.push_back(backups_.begin()->first); + backups_.erase(backups_.begin()); + } + GarbageCollection(false); + return Status::OK(); +} + +Status BackupEngine::DeleteBackup(BackupID backup_id) { + Log(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup == backups_.end()) { + return Status::NotFound("Backup not found"); + } + backup->second.Delete(); + obsolete_backups_.push_back(backup_id); + backups_.erase(backup); + GarbageCollection(false); + return Status::OK(); +} + +void BackupEngine::GetBackupInfo(std::vector* backup_info) { + backup_info->reserve(backups_.size()); + for (auto& backup : backups_) { + if (!backup.second.Empty()) { + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + } + } +} + +Status BackupEngine::RestoreDBFromBackup(BackupID backup_id, + const std::string &db_dir, + const std::string &wal_dir) { + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup.Empty()) { + return Status::NotFound("Backup not found"); + 
} + + Log(options_.info_log, "Restoring backup id %u\n", backup_id); + + // just in case. Ignore errors + db_env_->CreateDirIfMissing(db_dir); + db_env_->CreateDirIfMissing(wal_dir); + + // delete log files that might have been already in wal_dir. + // This is important since they might get replayed to the restored DB, + // which will then differ from the backuped DB + std::vector wal_dir_children; + db_env_->GetChildren(wal_dir, &wal_dir_children); // ignore errors + for (auto f : wal_dir_children) { + db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors + } + + Status s; + for (auto& file : backup.GetFiles()) { + std::string dst; + // 1. extract the filename + size_t slash = file.find_last_of('/'); + // file will either be shared/ or private// + assert(slash != std::string::npos); + dst = file.substr(slash + 1); + + // 2. find the filetype + uint64_t number; + FileType type; + bool ok = ParseFileName(dst, &number, &type); + if (!ok) { + return Status::Corruption("Backup corrupted"); + } + // 3. Construct the final path + // kLogFile lives in wal_dir and all the rest live in db_dir + dst = ((type == kLogFile) ? wal_dir : db_dir) + + "/" + dst; + + Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); + s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false); + if (!s.ok()) { + break; + } + } + + Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str()); + return s; +} + +// latest backup id is an ASCII representation of latest backup id +Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) { + Status s; + unique_ptr file; + s = backup_env_->NewSequentialFile(GetLatestBackupFile(), + &file, + EnvOptions()); + if (!s.ok()) { + return s; + } + + char* buf = new char[10]; + Slice data(buf, 0); + + s = file->Read(10, &data, buf); + + if (!s.ok() || data.size() == 0) { + delete[] buf; + return s.ok() ? 
Status::Corruption("Latest backup file corrupted") : s; + } + + sscanf(data.data(), "%u", latest_backup); + if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) { + s = Status::Corruption("Latest backup file corrupted"); + } + delete[] buf; + return Status::OK(); +} + +// this operation HAS to be atomic +// writing 4 bytes to the file is atomic alright, but we should *never* +// do something like 1. delete file, 2. write new file +// We write to a tmp file and then atomically rename +Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) { + Status s; + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = backup_env_->NewWritableFile(GetLatestBackupFile(true), + &file, + env_options); + if (!s.ok()) { + backup_env_->DeleteFile(GetLatestBackupFile(true)); + return s; + } + + char* file_contents = new char[10]; + int len = sprintf(file_contents, "%u\n", latest_backup); + s = file->Append(Slice(file_contents, len)); + if (s.ok() && options_.sync) { + file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + if (s.ok()) { + // atomically replace real file with new tmp + s = backup_env_->RenameFile(GetLatestBackupFile(true), + GetLatestBackupFile(false)); + } + return s; +} + +Status BackupEngine::CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + uint64_t* size, + uint64_t size_limit) { + Status s; + unique_ptr dst_file; + unique_ptr src_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + if (size != nullptr) { + *size = 0; + } + + // Check if size limit is set. 
if not, set it to very big number + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + s = src_env->NewSequentialFile(src, &src_file, env_options); + if (s.ok()) { + s = dst_env->NewWritableFile(dst, &dst_file, env_options); + } + if (!s.ok()) { + return s; + } + + char* buf = new char[copy_file_buffer_size_]; + Slice data(buf, 0); + + do { + size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? + copy_file_buffer_size_ : size_limit; + s = src_file->Read(buffer_to_read, &data, buf); + size_limit -= data.size(); + if (size != nullptr) { + *size += data.size(); + } + if (s.ok()) { + s = dst_file->Append(data); + } + } while (s.ok() && data.size() > 0 && size_limit > 0); + + if (s.ok() && sync) { + s = dst_file->Sync(); + } + + return s; +} + +// src_fname will always start with "/" +Status BackupEngine::BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, + uint64_t size_limit) { + + assert(src_fname.size() > 0 && src_fname[0] == '/'); + std::string dst_relative = src_fname.substr(1); + if (shared) { + dst_relative = GetSharedFileRel(dst_relative); + } else { + dst_relative = GetPrivateFileRel(backup_id, dst_relative); + } + std::string dst_path = GetAbsolutePath(dst_relative); + Status s; + uint64_t size; + + // if it's shared, we also need to check if it exists -- if it does, + // no need to copy it again + if (shared && backup_env_->FileExists(dst_path)) { + backup_env_->GetFileSize(dst_path, &size); // Ignore error + Log(options_.info_log, "%s already present", src_fname.c_str()); + } else { + Log(options_.info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(src_dir + src_fname, + dst_path, + db_env_, + backup_env_, + options_.sync, + &size, + size_limit); + } + if (s.ok()) { + backup->AddFile(dst_relative, size); + } + return s; +} + +void BackupEngine::GarbageCollection(bool full_scan) { + Log(options_.info_log, "Starting garbage collection"); + 
std::vector to_delete; + for (auto& itr : backuped_file_refs_) { + if (itr.second == 0) { + Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_refs_.erase(td); + } + if (!full_scan) { + // take care of private dirs -- if full_scan == true, then full_scan will + // take care of them + for (auto backup_id : obsolete_backups_) { + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); + } + obsolete_backups_.clear(); + } + + if (full_scan) { + Log(options_.info_log, "Starting full scan garbage collection"); + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_refs_.find(rel_fname) == backuped_file_refs_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); + } + } + } + + // delete obsolete private files + std::vector private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + sscanf(child.c_str(), "%u", &backup_id); + if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) { + // it's either not a number or it's still alive. 
continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + } + } +} + +// ------- BackupMeta class -------- + +void BackupEngine::BackupMeta::AddFile(const std::string& filename, + uint64_t size) { + size_ += size; + files_.push_back(filename); + auto itr = file_refs_->find(filename); + if (itr == file_refs_->end()) { + file_refs_->insert(std::make_pair(filename, 1)); + } else { + ++itr->second; // increase refcount if already present + } +} + +void BackupEngine::BackupMeta::Delete() { + for (auto& file : files_) { + auto itr = file_refs_->find(file); + assert(itr != file_refs_->end()); + --(itr->second); // decrease refcount + } + files_.clear(); + // delete meta file + env_->DeleteFile(meta_filename_); + timestamp_ = 0; +} + +// each backup meta file is of the format: +// +// +// +// +// ... +// TODO: maybe add checksum? +Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { + assert(Empty()); + Status s; + unique_ptr backup_meta_file; + s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions()); + if (!s.ok()) { + return s; + } + + char* buf = new char[max_backup_meta_file_size_ + 1]; + Slice data(buf, 0); + s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf); + + if (!s.ok() || data.size() == max_backup_meta_file_size_) { + delete[] buf; + return s.ok() ? 
Status::IOError("File size too big") : s; + } + buf[data.size()] = 0; + + uint32_t num_files = 0; + int bytes_read = 0; + sscanf(data.data(), "%ld%n", ×tamp_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%u%n", &num_files, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { + std::string filename = GetSliceUntil(&data, '\n').ToString(); + uint64_t size; + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + AddFile(filename, size); + } + + delete[] buf; + return s; +} + +Status BackupEngine::BackupMeta::StoreToFile(bool sync) { + Status s; + unique_ptr backup_meta_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file, + env_options); + if (!s.ok()) { + return s; + } + + char* buf = new char[max_backup_meta_file_size_]; + int len = 0, buf_size = max_backup_meta_file_size_; + len += snprintf(buf, buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf + len, buf_size - len, "%zu\n", files_.size()); + for (size_t i = 0; i < files_.size(); ++i) { + len += snprintf(buf + len, buf_size - len, "%s\n", files_[i].c_str()); + } + + s = backup_meta_file->Append(Slice(buf, (size_t)len)); + if (s.ok() && sync) { + s = backup_meta_file->Sync(); + } + if (s.ok()) { + s = backup_meta_file->Close(); + } + if (s.ok()) { + s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_); + } + return s; +} + +// --- BackupableDB methods -------- + +BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) : + StackableDB(db), + backup_engine_(new BackupEngine(db->GetEnv(), options)) {} + +BackupableDB::~BackupableDB() { + delete backup_engine_; +} + +Status BackupableDB::CreateNewBackup(bool flush_before_backup) { + return backup_engine_->CreateNewBackup(this, flush_before_backup); +} + +void BackupableDB::GetBackupInfo(std::vector* backup_info) { + 
backup_engine_->GetBackupInfo(backup_info); +} + +Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status BackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +// --- RestoreBackupableDB methods ------ + +RestoreBackupableDB::RestoreBackupableDB(Env* db_env, + const BackupableDBOptions& options) + : backup_engine_(new BackupEngine(db_env, options)) {} + +RestoreBackupableDB::~RestoreBackupableDB() { + delete backup_engine_; +} + +void +RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status RestoreBackupableDB::RestoreDBFromBackup(BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir); +} + +Status +RestoreBackupableDB::RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir); +} + +Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc new file mode 100644 index 0000000000..31a5abf87d --- /dev/null +++ b/utilities/backupable/backupable_db_test.cc @@ -0,0 +1,625 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/transaction_log.h" +#include "utilities/utility_db.h" +#include "utilities/backupable_db.h" +#include "util/testharness.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/auto_roll_logger.h" + +#include +#include + +namespace rocksdb { + +namespace { + +using std::unique_ptr; + +class DummyDB : public StackableDB { + public: + /* implicit */ + DummyDB(const Options& options, const std::string& dbname) + : StackableDB(nullptr), options_(options), dbname_(dbname), + deletions_enabled_(true) {} + + virtual const std::string& GetName() const override { + return dbname_; + } + + virtual Env* GetEnv() const override { + return options_.env; + } + + virtual const Options& GetOptions() const override { + return options_; + } + + virtual Status EnableFileDeletions() override { + ASSERT_TRUE(!deletions_enabled_); + deletions_enabled_ = true; + return Status::OK(); + } + + virtual Status DisableFileDeletions() override { + ASSERT_TRUE(deletions_enabled_); + deletions_enabled_ = false; + return Status::OK(); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + ASSERT_TRUE(!deletions_enabled_); + vec = live_files_; + *mfs = 100; + return Status::OK(); + } + + class DummyLogFile : public LogFile { + public: + /* implicit */ + DummyLogFile(const std::string& path, bool alive = true) + : path_(path), alive_(alive) {} + + virtual std::string PathName() const override { + return path_; + } + + virtual uint64_t LogNumber() const { + // what business do you have calling this method? + ASSERT_TRUE(false); + return 0; + } + + virtual WalFileType Type() const override { + return alive_ ? 
kAliveLogFile : kArchivedLogFile; + } + + virtual SequenceNumber StartSequence() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + virtual uint64_t SizeFileBytes() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + private: + std::string path_; + bool alive_; + }; // DummyLogFile + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + ASSERT_TRUE(!deletions_enabled_); + files.resize(wal_files_.size()); + for (size_t i = 0; i < files.size(); ++i) { + files[i].reset( + new DummyLogFile(wal_files_[i].first, wal_files_[i].second)); + } + return Status::OK(); + } + + std::vector live_files_; + // pair + std::vector> wal_files_; + private: + Options options_; + std::string dbname_; + bool deletions_enabled_; +}; // DummyDB + +class TestEnv : public EnvWrapper { + public: + explicit TestEnv(Env* t) : EnvWrapper(t) {} + + class DummySequentialFile : public SequentialFile { + virtual Status Read(size_t n, Slice* result, char* scratch) { + size_t read_size = (n > size_left) ? size_left : n; + *result = Slice(scratch, read_size); + size_left -= read_size; + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + size_left = (n > size_left) ? 
size_left - n : 0; + return Status::OK(); + } + private: + size_t size_left = 200; + }; + + Status NewSequentialFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + opened_files_.push_back(f); + if (dummy_sequential_file_) { + r->reset(new TestEnv::DummySequentialFile()); + return Status::OK(); + } else { + return EnvWrapper::NewSequentialFile(f, r, options); + } + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + if (limit_written_files_ <= 0) { + return Status::IOError("Sorry, can't do this"); + } + limit_written_files_--; + return EnvWrapper::NewWritableFile(f, r, options); + } + + void AssertOpenedFiles(std::vector& should_have_opened) { + sort(should_have_opened.begin(), should_have_opened.end()); + sort(opened_files_.begin(), opened_files_.end()); + ASSERT_TRUE(opened_files_ == should_have_opened); + } + + void ClearOpenedFiles() { + opened_files_.clear(); + } + + void SetLimitWrittenFiles(uint64_t limit) { + limit_written_files_ = limit; + } + + void SetDummySequentialFile(bool dummy_sequential_file) { + dummy_sequential_file_ = dummy_sequential_file; + } + + private: + bool dummy_sequential_file_ = false; + std::vector opened_files_; + uint64_t limit_written_files_ = 1000000; +}; // TestEnv + +class FileManager : public EnvWrapper { + public: + explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} + + Status DeleteRandomFileInDir(const std::string dir) { + std::vector children; + GetChildren(dir, &children); + if (children.size() <= 2) { // . and .. + return Status::NotFound(""); + } + while (true) { + int i = rnd_.Next() % children.size(); + if (children[i] != "." 
&& children[i] != "..") { + return DeleteFile(dir + "/" + children[i]); + } + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + uint64_t size; + Status s = GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = NewRandomRWFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { + std::string tmp; + // write one random byte to a random position + s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + } + return s; + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + private: + Random rnd_; +}; // FileManager + +// utility functions +static void FillDB(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + } +} + +static void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +static void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +class BackupableDBTest { + public: + BackupableDBTest() { + // set 
up files + dbname_ = test::TmpDir() + "/backupable_db"; + backupdir_ = test::TmpDir() + "/backupable_db_backup"; + + // set up envs + env_ = Env::Default(); + test_db_env_.reset(new TestEnv(env_)); + test_backup_env_.reset(new TestEnv(env_)); + file_manager_.reset(new FileManager(env_)); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 19; // 512KB + options_.env = test_db_env_.get(); + options_.wal_dir = dbname_; + // set up backup db options + CreateLoggerFromOptions(dbname_, backupdir_, env_, + Options(), &logger); + backupable_options_.reset(new BackupableDBOptions( + backupdir_, test_backup_env_.get(), logger.get(), true)); + + // delete old files in db + DestroyDB(dbname_, Options()); + } + + void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) { + // reset all the defaults + test_backup_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetDummySequentialFile(dummy); + + DB* db; + if (dummy) { + dummy_db_ = new DummyDB(options_, dbname_); + db = dummy_db_; + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + backupable_options_->destroy_old_data = destroy_old_data; + db_.reset(new BackupableDB(db, *backupable_options_)); + } + + void CloseBackupableDB() { + db_.reset(nullptr); + } + + void OpenRestoreDB() { + backupable_options_->destroy_old_data = false; + restore_db_.reset( + new RestoreBackupableDB(test_db_env_.get(), *backupable_options_)); + } + + void CloseRestoreDB() { + restore_db_.reset(nullptr); + } + + // restores backup backup_id and asserts the existence of + // [start_exist, end_exist> and not-existence of + // [end_exist, end> + // + // if backup_id == 0, it means restore from latest + // if end == 0, don't check AssertEmpty + void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist, + uint32_t end_exist, uint32_t end = 0) { + bool opened_restore = false; + if 
(restore_db_.get() == nullptr) { + opened_restore = true; + OpenRestoreDB(); + } + if (backup_id > 0) { + ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_)); + } else { + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); + } + OpenBackupableDB(); + AssertExists(db_.get(), start_exist, end_exist); + if (end != 0) { + AssertEmpty(db_.get(), end_exist, end); + } + CloseBackupableDB(); + if (opened_restore) { + CloseRestoreDB(); + } + } + + // files + std::string dbname_; + std::string backupdir_; + + // envs + Env* env_; + unique_ptr test_db_env_; + unique_ptr test_backup_env_; + unique_ptr file_manager_; + + // all the dbs! + DummyDB* dummy_db_; // BackupableDB owns dummy_db_ + unique_ptr db_; + unique_ptr restore_db_; + + // options + Options options_; + unique_ptr backupable_options_; + std::shared_ptr logger; +}; // BackupableDBTest + +void AppendPath(const std::string& path, std::vector& v) { + for (auto& f : v) { + f = path + f; + } +} + +// this will make sure that backup does not copy the same file twice +TEST(BackupableDBTest, NoDoubleCopy) { + OpenBackupableDB(true, true); + + // should write 5 DB files + LATEST_BACKUP + one meta file + test_backup_env_->SetLimitWrittenFiles(7); + test_db_env_->ClearOpenedFiles(); + test_db_env_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + std::vector should_have_openened = dummy_db_->live_files_; + should_have_openened.push_back("/00011.log"); + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + // should write 4 new DB files + LATEST_BACKUP + one meta file + // should not write/copy 00010.sst, since it's already there! 
+ test_backup_env_->SetLimitWrittenFiles(6); + test_db_env_->ClearOpenedFiles(); + dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + // should not open 00010.sst - it's already there + should_have_openened = { "/00015.sst", "/CURRENT", + "/MANIFEST-01", "/00011.log" }; + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + ASSERT_OK(db_->DeleteBackup(1)); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(false, + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size; + test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); + ASSERT_EQ(100, size); + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); + ASSERT_EQ(200, size); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine +// 3. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +TEST(BackupableDBTest, CorruptionsTest) { + const int keys_iteration = 20000; + Random rnd(6); + Status s; + + OpenBackupableDB(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + + // ---------- case 1. 
- fail a write ----------- + // try creating backup 6, but fail a write + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + test_backup_env_->SetLimitWrittenFiles(2); + // should fail + s = db_->CreateNewBackup(!!(rnd.Next() % 2)); + ASSERT_TRUE(!s.ok()); + test_backup_env_->SetLimitWrittenFiles(1000000); + // latest backup should have all the keys + CloseBackupableDB(); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + + // ---------- case 2. - corrupt/delete latest backup ----------- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2)); + AssertBackupConsistency(0, 0, keys_iteration * 5); + ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP")); + AssertBackupConsistency(0, 0, keys_iteration * 5); + // create backup 6, point LATEST_BACKUP to 5 + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + ASSERT_OK(db_->CreateNewBackup(false)); + CloseBackupableDB(); + ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5")); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + // assert that all 6 data is gone! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false); + + // --------- case 3. 
corrupted backup meta or missing backuped file ---- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3)); + // since 5 meta is now corrupted, latest backup should be 4 + AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + CloseRestoreDB(); + ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4")); + // 4 is corrupted, 3 is the latest backup now + AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + CloseRestoreDB(); + ASSERT_TRUE(!s.ok()); + + // new backup should be 4! + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 3, keys_iteration * 4); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + CloseBackupableDB(); + AssertBackupConsistency(4, 0, keys_iteration * 4, keys_iteration * 5); +} + +// open DB, write, close DB, backup, restore, repeat +TEST(BackupableDBTest, OfflineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 20000; + const int max_key = keys_iteration * 4 + 10; + // first iter -- flush before backup + // second iter -- don't flush before backup + for (int iter = 0; iter < 2; ++iter) { + // delete old data + DestroyDB(dbname_, Options()); + bool destroy_data = true; + + // every iteration -- + // 1. insert new data in the DB + // 2. backup the DB + // 3. destroy the db + // 4. 
restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenBackupableDB(destroy_data); + destroy_data = false; + FillDB(db_.get(), keys_iteration * i, fill_up_to); + ASSERT_OK(db_->CreateNewBackup(iter == 0)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + OpenBackupableDB(); + AssertEmpty(db_.get(), 0, fill_up_to); + CloseBackupableDB(); + + // ---- restore the DB ---- + OpenRestoreDB(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(restore_db_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseRestoreDB(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST(BackupableDBTest, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 20000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); + // delete old data + DestroyDB(dbname_, Options()); + + OpenBackupableDB(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! 
+ OpenRestoreDB(); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + FillDB(db_.get(), keys_iteration * i, fill_up_to); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + // close and destroy + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + OpenBackupableDB(); + AssertEmpty(db_.get(), 0, max_key); + CloseBackupableDB(); + + // ---- restore every backup and verify all the data is there ---- + OpenRestoreDB(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->PurgeOldBackups(2)); + + std::vector backup_info; + restore_db_->GetBackupInfo(&backup_info); + ASSERT_EQ(2, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseRestoreDB(); +} + +} // anon namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} From 784e62f98dcf0b1dbcf3dedc3ad4c26bac5fae7e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 9 Dec 2013 16:44:47 -0800 Subject: [PATCH 07/40] Fix unused variable warning --- utilities/backupable/backupable_db.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 
3e87b02c54..498606045a 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -273,7 +273,10 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { uint64_t number; FileType type; bool ok = ParseFileName(live_files[i], &number, &type); - assert(ok); + if (!ok) { + assert(false); + return Status::Corruption("Can't parse file name. This is very bad"); + } // we should only get sst, manifest and current files here assert(type == kTableFile || type == kDescriptorFile || From f6012ab826fa1e94e3232726303a6ce82446924f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dog=CC=86an=20C=CC=A7ec=CC=A7en?= Date: Sun, 8 Dec 2013 20:49:11 +0200 Subject: [PATCH 08/40] Fix shared lib build --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index be7758de96..4ff6c1cd32 100644 --- a/Makefile +++ b/Makefile @@ -126,7 +126,7 @@ $(SHARED2): $(SHARED3) endif $(SHARED3): - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) endif # PLATFORM_SHARED_EXT From 6c4e110c8cd04588c40ea646b57b48e433a7c482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dog=CC=86an=20C=CC=A7ec=CC=A7en?= Date: Tue, 10 Dec 2013 10:45:07 +0200 Subject: [PATCH 09/40] Rename leveldb to rocksdb in C api --- db/c.cc | 326 ++++++++++++++++++++++---------------------- db/c_test.c | 212 ++++++++++++++-------------- include/rocksdb/c.h | 238 ++++++++++++++++---------------- 3 files changed, 388 insertions(+), 388 deletions(-) diff --git a/db/c.cc b/db/c.cc index 0d99c44dd7..021122301b 100644 --- a/db/c.cc +++ b/db/c.cc @@ -48,21 +48,21 @@ using std::shared_ptr; extern "C" { -struct leveldb_t { DB* rep; }; -struct leveldb_iterator_t { Iterator* rep; }; -struct leveldb_writebatch_t { WriteBatch rep; }; -struct 
leveldb_snapshot_t { const Snapshot* rep; }; -struct leveldb_readoptions_t { ReadOptions rep; }; -struct leveldb_writeoptions_t { WriteOptions rep; }; -struct leveldb_options_t { Options rep; }; -struct leveldb_seqfile_t { SequentialFile* rep; }; -struct leveldb_randomfile_t { RandomAccessFile* rep; }; -struct leveldb_writablefile_t { WritableFile* rep; }; -struct leveldb_filelock_t { FileLock* rep; }; -struct leveldb_logger_t { shared_ptr rep; }; -struct leveldb_cache_t { shared_ptr rep; }; +struct rocksdb_t { DB* rep; }; +struct rocksdb_iterator_t { Iterator* rep; }; +struct rocksdb_writebatch_t { WriteBatch rep; }; +struct rocksdb_snapshot_t { const Snapshot* rep; }; +struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_writeoptions_t { WriteOptions rep; }; +struct rocksdb_options_t { Options rep; }; +struct rocksdb_seqfile_t { SequentialFile* rep; }; +struct rocksdb_randomfile_t { RandomAccessFile* rep; }; +struct rocksdb_writablefile_t { WritableFile* rep; }; +struct rocksdb_filelock_t { FileLock* rep; }; +struct rocksdb_logger_t { shared_ptr rep; }; +struct rocksdb_cache_t { shared_ptr rep; }; -struct leveldb_comparator_t : public Comparator { +struct rocksdb_comparator_t : public Comparator { void* state_; void (*destructor_)(void*); int (*compare_)( @@ -71,7 +71,7 @@ struct leveldb_comparator_t : public Comparator { const char* b, size_t blen); const char* (*name_)(void*); - virtual ~leveldb_comparator_t() { + virtual ~rocksdb_comparator_t() { (*destructor_)(state_); } @@ -88,7 +88,7 @@ struct leveldb_comparator_t : public Comparator { virtual void FindShortSuccessor(std::string* key) const { } }; -struct leveldb_filterpolicy_t : public FilterPolicy { +struct rocksdb_filterpolicy_t : public FilterPolicy { void* state_; void (*destructor_)(void*); const char* (*name_)(void*); @@ -102,7 +102,7 @@ struct leveldb_filterpolicy_t : public FilterPolicy { const char* key, size_t length, const char* filter, size_t filter_length); - virtual 
~leveldb_filterpolicy_t() { + virtual ~rocksdb_filterpolicy_t() { (*destructor_)(state_); } @@ -129,7 +129,7 @@ struct leveldb_filterpolicy_t : public FilterPolicy { } }; -struct leveldb_env_t { +struct rocksdb_env_t { Env* rep; bool is_default; }; @@ -154,27 +154,27 @@ static char* CopyString(const std::string& str) { return result; } -leveldb_t* leveldb_open( - const leveldb_options_t* options, +rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr) { DB* db; if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { return NULL; } - leveldb_t* result = new leveldb_t; + rocksdb_t* result = new rocksdb_t; result->rep = db; return result; } -void leveldb_close(leveldb_t* db) { +void rocksdb_close(rocksdb_t* db) { delete db->rep; delete db; } -void leveldb_put( - leveldb_t* db, - const leveldb_writeoptions_t* options, +void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, const char* val, size_t vallen, char** errptr) { @@ -182,26 +182,26 @@ void leveldb_put( db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); } -void leveldb_delete( - leveldb_t* db, - const leveldb_writeoptions_t* options, +void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, char** errptr) { SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); } -void leveldb_write( - leveldb_t* db, - const leveldb_writeoptions_t* options, - leveldb_writebatch_t* batch, +void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); } -char* leveldb_get( - leveldb_t* db, - const leveldb_readoptions_t* options, +char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr) { @@ -220,30 +220,30 @@ char* leveldb_get( return 
result; } -leveldb_iterator_t* leveldb_create_iterator( - leveldb_t* db, - const leveldb_readoptions_t* options) { - leveldb_iterator_t* result = new leveldb_iterator_t; +rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; result->rep = db->rep->NewIterator(options->rep); return result; } -const leveldb_snapshot_t* leveldb_create_snapshot( - leveldb_t* db) { - leveldb_snapshot_t* result = new leveldb_snapshot_t; +const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; result->rep = db->rep->GetSnapshot(); return result; } -void leveldb_release_snapshot( - leveldb_t* db, - const leveldb_snapshot_t* snapshot) { +void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { db->rep->ReleaseSnapshot(snapshot->rep); delete snapshot; } -char* leveldb_property_value( - leveldb_t* db, +char* rocksdb_property_value( + rocksdb_t* db, const char* propname) { std::string tmp; if (db->rep->GetProperty(Slice(propname), &tmp)) { @@ -254,8 +254,8 @@ char* leveldb_property_value( } } -void leveldb_approximate_sizes( - leveldb_t* db, +void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, const size_t* range_limit_key_len, @@ -269,8 +269,8 @@ void leveldb_approximate_sizes( delete[] ranges; } -void leveldb_compact_range( - leveldb_t* db, +void rocksdb_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, const char* limit_key, size_t limit_key_len) { Slice a, b; @@ -280,92 +280,92 @@ void leveldb_compact_range( (limit_key ? 
(b = Slice(limit_key, limit_key_len), &b) : NULL)); } -void leveldb_destroy_db( - const leveldb_options_t* options, +void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr) { SaveError(errptr, DestroyDB(name, options->rep)); } -void leveldb_repair_db( - const leveldb_options_t* options, +void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr) { SaveError(errptr, RepairDB(name, options->rep)); } -void leveldb_iter_destroy(leveldb_iterator_t* iter) { +void rocksdb_iter_destroy(rocksdb_iterator_t* iter) { delete iter->rep; delete iter; } -unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) { +unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) { return iter->rep->Valid(); } -void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) { +void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) { iter->rep->SeekToFirst(); } -void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) { +void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) { iter->rep->SeekToLast(); } -void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) { +void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) { iter->rep->Seek(Slice(k, klen)); } -void leveldb_iter_next(leveldb_iterator_t* iter) { +void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); } -void leveldb_iter_prev(leveldb_iterator_t* iter) { +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); } -const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) { +const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { Slice s = iter->rep->key(); *klen = s.size(); return s.data(); } -const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) { +const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) { Slice s = iter->rep->value(); *vlen = s.size(); return s.data(); } -void 
leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { +void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) { SaveError(errptr, iter->rep->status()); } -leveldb_writebatch_t* leveldb_writebatch_create() { - return new leveldb_writebatch_t; +rocksdb_writebatch_t* rocksdb_writebatch_create() { + return new rocksdb_writebatch_t; } -void leveldb_writebatch_destroy(leveldb_writebatch_t* b) { +void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; } -void leveldb_writebatch_clear(leveldb_writebatch_t* b) { +void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); } -void leveldb_writebatch_put( - leveldb_writebatch_t* b, +void rocksdb_writebatch_put( + rocksdb_writebatch_t* b, const char* key, size_t klen, const char* val, size_t vlen) { b->rep.Put(Slice(key, klen), Slice(val, vlen)); } -void leveldb_writebatch_delete( - leveldb_writebatch_t* b, +void rocksdb_writebatch_delete( + rocksdb_writebatch_t* b, const char* key, size_t klen) { b->rep.Delete(Slice(key, klen)); } -void leveldb_writebatch_iterate( - leveldb_writebatch_t* b, +void rocksdb_writebatch_iterate( + rocksdb_writebatch_t* b, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)) { @@ -388,132 +388,132 @@ void leveldb_writebatch_iterate( b->rep.Iterate(&handler); } -leveldb_options_t* leveldb_options_create() { - return new leveldb_options_t; +rocksdb_options_t* rocksdb_options_create() { + return new rocksdb_options_t; } -void leveldb_options_destroy(leveldb_options_t* options) { +void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; } -void leveldb_options_set_comparator( - leveldb_options_t* opt, - leveldb_comparator_t* cmp) { +void rocksdb_options_set_comparator( + rocksdb_options_t* opt, + rocksdb_comparator_t* cmp) { opt->rep.comparator = cmp; } -void leveldb_options_set_filter_policy( - leveldb_options_t* opt, - 
leveldb_filterpolicy_t* policy) { +void rocksdb_options_set_filter_policy( + rocksdb_options_t* opt, + rocksdb_filterpolicy_t* policy) { opt->rep.filter_policy = policy; } -void leveldb_options_set_create_if_missing( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_create_if_missing( + rocksdb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; } -void leveldb_options_set_error_if_exists( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_error_if_exists( + rocksdb_options_t* opt, unsigned char v) { opt->rep.error_if_exists = v; } -void leveldb_options_set_paranoid_checks( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_paranoid_checks( + rocksdb_options_t* opt, unsigned char v) { opt->rep.paranoid_checks = v; } -void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { +void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? env->rep : NULL); } -void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) { +void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { if (l) { opt->rep.info_log = l->rep; } } -void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) { +void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } -void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } -void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) { +void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) { if (c) { opt->rep.block_cache = c->rep; } } -void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) { +void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) { opt->rep.block_size = s; } -void 
leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) { +void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) { opt->rep.block_restart_interval = n; } -void leveldb_options_set_target_file_size_base( - leveldb_options_t* opt, uint64_t n) { +void rocksdb_options_set_target_file_size_base( + rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; } -void leveldb_options_set_target_file_size_multiplier( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t* opt, int n) { opt->rep.target_file_size_multiplier = n; } -void leveldb_options_set_max_bytes_for_level_base( - leveldb_options_t* opt, uint64_t n) { +void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t* opt, uint64_t n) { opt->rep.max_bytes_for_level_base = n; } -void leveldb_options_set_max_bytes_for_level_multiplier( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_bytes_for_level_multiplier( + rocksdb_options_t* opt, int n) { opt->rep.max_bytes_for_level_multiplier = n; } -void leveldb_options_set_expanded_compaction_factor( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_expanded_compaction_factor( + rocksdb_options_t* opt, int n) { opt->rep.expanded_compaction_factor = n; } -void leveldb_options_set_max_grandparent_overlap_factor( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_grandparent_overlap_factor( + rocksdb_options_t* opt, int n) { opt->rep.max_grandparent_overlap_factor = n; } -void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) { +void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } -void leveldb_options_set_level0_file_num_compaction_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_file_num_compaction_trigger = n; } -void 
leveldb_options_set_level0_slowdown_writes_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_slowdown_writes_trigger = n; } -void leveldb_options_set_level0_stop_writes_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_stop_writes_trigger = n; } -void leveldb_options_set_max_mem_compaction_level( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_mem_compaction_level( + rocksdb_options_t* opt, int n) { opt->rep.max_mem_compaction_level = n; } -void leveldb_options_set_compression(leveldb_options_t* opt, int t) { +void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { opt->rep.compression = static_cast(t); } -void leveldb_options_set_compression_per_level(leveldb_options_t* opt, +void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, int* level_values, size_t num_levels) { opt->rep.compression_per_level.resize(num_levels); @@ -523,43 +523,43 @@ void leveldb_options_set_compression_per_level(leveldb_options_t* opt, } } -void leveldb_options_set_compression_options( - leveldb_options_t* opt, int w_bits, int level, int strategy) { +void rocksdb_options_set_compression_options( + rocksdb_options_t* opt, int w_bits, int level, int strategy) { opt->rep.compression_opts.window_bits = w_bits; opt->rep.compression_opts.level = level; opt->rep.compression_opts.strategy = strategy; } -void leveldb_options_set_disable_data_sync( - leveldb_options_t* opt, bool disable_data_sync) { +void rocksdb_options_set_disable_data_sync( + rocksdb_options_t* opt, bool disable_data_sync) { opt->rep.disableDataSync = disable_data_sync; } -void leveldb_options_set_use_fsync( - leveldb_options_t* opt, bool use_fsync) { +void rocksdb_options_set_use_fsync( + rocksdb_options_t* opt, bool use_fsync) { opt->rep.use_fsync = use_fsync; } -void 
leveldb_options_set_db_stats_log_interval( - leveldb_options_t* opt, int db_stats_log_interval) { +void rocksdb_options_set_db_stats_log_interval( + rocksdb_options_t* opt, int db_stats_log_interval) { opt->rep.db_stats_log_interval = db_stats_log_interval; } -void leveldb_options_set_db_log_dir( - leveldb_options_t* opt, const char* db_log_dir) { +void rocksdb_options_set_db_log_dir( + rocksdb_options_t* opt, const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; } -void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) { +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { opt->rep.WAL_ttl_seconds = ttl; } -void leveldb_options_set_WAL_size_limit_MB( - leveldb_options_t* opt, uint64_t limit) { +void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t* opt, uint64_t limit) { opt->rep.WAL_size_limit_MB = limit; } -leveldb_comparator_t* leveldb_comparator_create( +rocksdb_comparator_t* rocksdb_comparator_create( void* state, void (*destructor)(void*), int (*compare)( @@ -567,7 +567,7 @@ leveldb_comparator_t* leveldb_comparator_create( const char* a, size_t alen, const char* b, size_t blen), const char* (*name)(void*)) { - leveldb_comparator_t* result = new leveldb_comparator_t; + rocksdb_comparator_t* result = new rocksdb_comparator_t; result->state_ = state; result->destructor_ = destructor; result->compare_ = compare; @@ -575,11 +575,11 @@ leveldb_comparator_t* leveldb_comparator_create( return result; } -void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; } -leveldb_filterpolicy_t* leveldb_filterpolicy_create( +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( void* state, void (*destructor)(void*), char* (*create_filter)( @@ -592,7 +592,7 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create( const char* key, size_t length, const char* filter, size_t filter_length), const char* (*name)(void*)) { - 
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t; + rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t; result->state_ = state; result->destructor_ = destructor; result->create_ = create_filter; @@ -601,15 +601,15 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create( return result; } -void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) { +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { delete filter; } -leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) { - // Make a leveldb_filterpolicy_t, but override all of its methods so +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so // they delegate to a NewBloomFilterPolicy() instead of user // supplied C functions. - struct Wrapper : public leveldb_filterpolicy_t { + struct Wrapper : public rocksdb_filterpolicy_t { const FilterPolicy* rep_; ~Wrapper() { delete rep_; } const char* Name() const { return rep_->Name(); } @@ -628,62 +628,62 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) { return wrapper; } -leveldb_readoptions_t* leveldb_readoptions_create() { - return new leveldb_readoptions_t; +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; } -void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) { +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; } -void leveldb_readoptions_set_verify_checksums( - leveldb_readoptions_t* opt, +void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.verify_checksums = v; } -void leveldb_readoptions_set_fill_cache( - leveldb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.fill_cache = v; } -void leveldb_readoptions_set_snapshot( - leveldb_readoptions_t* opt, - const 
leveldb_snapshot_t* snap) { +void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { opt->rep.snapshot = (snap ? snap->rep : NULL); } -leveldb_writeoptions_t* leveldb_writeoptions_create() { - return new leveldb_writeoptions_t; +rocksdb_writeoptions_t* rocksdb_writeoptions_create() { + return new rocksdb_writeoptions_t; } -void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) { +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; } -void leveldb_writeoptions_set_sync( - leveldb_writeoptions_t* opt, unsigned char v) { +void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.sync = v; } -leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) { - leveldb_cache_t* c = new leveldb_cache_t; +rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } -void leveldb_cache_destroy(leveldb_cache_t* cache) { +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } -leveldb_env_t* leveldb_create_default_env() { - leveldb_env_t* result = new leveldb_env_t; +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; result->rep = Env::Default(); result->is_default = true; return result; } -void leveldb_env_destroy(leveldb_env_t* env) { +void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; } diff --git a/db/c_test.c b/db/c_test.c index abbe1ddd5f..8c5e8e5348 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -62,30 +62,30 @@ static void Free(char** ptr) { } static void CheckGet( - leveldb_t* db, - const leveldb_readoptions_t* options, + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, const char* expected) { char* err = NULL; size_t val_len; char* val; - val = leveldb_get(db, options, key, strlen(key), &val_len, &err); + val = rocksdb_get(db, options, key, 
strlen(key), &val_len, &err); CheckNoError(err); CheckEqual(expected, val, val_len); Free(&val); } -static void CheckIter(leveldb_iterator_t* iter, +static void CheckIter(rocksdb_iterator_t* iter, const char* key, const char* val) { size_t len; const char* str; - str = leveldb_iter_key(iter, &len); + str = rocksdb_iter_key(iter, &len); CheckEqual(key, str, len); - str = leveldb_iter_value(iter, &len); + str = rocksdb_iter_value(iter, &len); CheckEqual(val, str, len); } -// Callback from leveldb_writebatch_iterate() +// Callback from rocksdb_writebatch_iterate() static void CheckPut(void* ptr, const char* k, size_t klen, const char* v, size_t vlen) { @@ -104,7 +104,7 @@ static void CheckPut(void* ptr, (*state)++; } -// Callback from leveldb_writebatch_iterate() +// Callback from rocksdb_writebatch_iterate() static void CheckDel(void* ptr, const char* k, size_t klen) { int* state = (int*) ptr; CheckCondition(*state == 2); @@ -155,117 +155,117 @@ unsigned char FilterKeyMatch( } int main(int argc, char** argv) { - leveldb_t* db; - leveldb_comparator_t* cmp; - leveldb_cache_t* cache; - leveldb_env_t* env; - leveldb_options_t* options; - leveldb_readoptions_t* roptions; - leveldb_writeoptions_t* woptions; + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_readoptions_t* roptions; + rocksdb_writeoptions_t* woptions; char* err = NULL; int run = -1; snprintf(dbname, sizeof(dbname), - "%s/leveldb_c_test-%d", + "%s/rocksdb_c_test-%d", GetTempDir(), ((int) geteuid())); StartPhase("create_objects"); - cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); - env = leveldb_create_default_env(); - cache = leveldb_cache_create_lru(100000); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + env = rocksdb_create_default_env(); + cache = rocksdb_cache_create_lru(100000); - options = leveldb_options_create(); - leveldb_options_set_comparator(options, cmp); - 
leveldb_options_set_error_if_exists(options, 1); - leveldb_options_set_cache(options, cache); - leveldb_options_set_env(options, env); - leveldb_options_set_info_log(options, NULL); - leveldb_options_set_write_buffer_size(options, 100000); - leveldb_options_set_paranoid_checks(options, 1); - leveldb_options_set_max_open_files(options, 10); - leveldb_options_set_block_size(options, 1024); - leveldb_options_set_block_restart_interval(options, 8); - leveldb_options_set_compression(options, leveldb_no_compression); - leveldb_options_set_compression_options(options, -14, -1, 0); - int compression_levels[] = {leveldb_no_compression, leveldb_no_compression, - leveldb_no_compression, leveldb_no_compression}; - leveldb_options_set_compression_per_level(options, compression_levels, 4); + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_cache(options, cache); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + rocksdb_options_set_block_size(options, 1024); + rocksdb_options_set_block_restart_interval(options, 8); + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); - roptions = leveldb_readoptions_create(); - leveldb_readoptions_set_verify_checksums(roptions, 1); - leveldb_readoptions_set_fill_cache(roptions, 0); + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 0); - woptions = 
leveldb_writeoptions_create(); - leveldb_writeoptions_set_sync(woptions, 1); + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); StartPhase("destroy"); - leveldb_destroy_db(options, dbname, &err); + rocksdb_destroy_db(options, dbname, &err); Free(&err); StartPhase("open_error"); - db = leveldb_open(options, dbname, &err); + db = rocksdb_open(options, dbname, &err); CheckCondition(err != NULL); Free(&err); StartPhase("open"); - leveldb_options_set_create_if_missing(options, 1); - db = leveldb_open(options, dbname, &err); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); CheckGet(db, roptions, "foo", NULL); StartPhase("put"); - leveldb_put(db, woptions, "foo", 3, "hello", 5, &err); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); StartPhase("compactall"); - leveldb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_compact_range(db, NULL, 0, NULL, 0); CheckGet(db, roptions, "foo", "hello"); StartPhase("compactrange"); - leveldb_compact_range(db, "a", 1, "z", 1); + rocksdb_compact_range(db, "a", 1, "z", 1); CheckGet(db, roptions, "foo", "hello"); StartPhase("writebatch"); { - leveldb_writebatch_t* wb = leveldb_writebatch_create(); - leveldb_writebatch_put(wb, "foo", 3, "a", 1); - leveldb_writebatch_clear(wb); - leveldb_writebatch_put(wb, "bar", 3, "b", 1); - leveldb_writebatch_put(wb, "box", 3, "c", 1); - leveldb_writebatch_delete(wb, "bar", 3); - leveldb_write(db, woptions, wb, &err); + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); CheckGet(db, roptions, "bar", NULL); CheckGet(db, 
roptions, "box", "c"); int pos = 0; - leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); CheckCondition(pos == 3); - leveldb_writebatch_destroy(wb); + rocksdb_writebatch_destroy(wb); } StartPhase("iter"); { - leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_first(iter); - CheckCondition(leveldb_iter_valid(iter)); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); CheckIter(iter, "box", "c"); - leveldb_iter_next(iter); + rocksdb_iter_next(iter); CheckIter(iter, "foo", "hello"); - leveldb_iter_prev(iter); + rocksdb_iter_prev(iter); CheckIter(iter, "box", "c"); - leveldb_iter_prev(iter); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_last(iter); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); CheckIter(iter, "foo", "hello"); - leveldb_iter_seek(iter, "b", 1); + rocksdb_iter_seek(iter, "b", 1); CheckIter(iter, "box", "c"); - leveldb_iter_get_error(iter, &err); + rocksdb_iter_get_error(iter, &err); CheckNoError(err); - leveldb_iter_destroy(iter); + rocksdb_iter_destroy(iter); } StartPhase("approximate_sizes"); @@ -279,39 +279,39 @@ int main(int argc, char** argv) { size_t start_len[2] = { 1, 21 }; const char* limit[2] = { "k00000000000000010000", "z" }; size_t limit_len[2] = { 21, 1 }; - leveldb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_set_sync(woptions, 0); for (i = 0; i < n; i++) { snprintf(keybuf, sizeof(keybuf), "k%020d", i); snprintf(valbuf, sizeof(valbuf), "v%020d", i); - leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), &err); CheckNoError(err); } - leveldb_approximate_sizes(db, 
2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } StartPhase("property"); { - char* prop = leveldb_property_value(db, "nosuchprop"); + char* prop = rocksdb_property_value(db, "nosuchprop"); CheckCondition(prop == NULL); - prop = leveldb_property_value(db, "rocksdb.stats"); + prop = rocksdb_property_value(db, "rocksdb.stats"); CheckCondition(prop != NULL); Free(&prop); } StartPhase("snapshot"); { - const leveldb_snapshot_t* snap; - snap = leveldb_create_snapshot(db); - leveldb_delete(db, woptions, "foo", 3, &err); + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); CheckNoError(err); - leveldb_readoptions_set_snapshot(roptions, snap); + rocksdb_readoptions_set_snapshot(roptions, snap); CheckGet(db, roptions, "foo", "hello"); - leveldb_readoptions_set_snapshot(roptions, NULL); + rocksdb_readoptions_set_snapshot(roptions, NULL); CheckGet(db, roptions, "foo", NULL); - leveldb_release_snapshot(db, snap); + rocksdb_release_snapshot(db, snap); } StartPhase("repair"); @@ -320,44 +320,44 @@ int main(int argc, char** argv) { // files (https://reviews.facebook.net/D6123) would leave // around deleted files and the repair process will find // those files and put them back into the database. 
- leveldb_compact_range(db, NULL, 0, NULL, 0); - leveldb_close(db); - leveldb_options_set_create_if_missing(options, 0); - leveldb_options_set_error_if_exists(options, 0); - leveldb_repair_db(options, dbname, &err); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_close(db); + rocksdb_options_set_create_if_missing(options, 0); + rocksdb_options_set_error_if_exists(options, 0); + rocksdb_repair_db(options, dbname, &err); CheckNoError(err); - db = leveldb_open(options, dbname, &err); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); CheckGet(db, roptions, "foo", NULL); CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "box", "c"); - leveldb_options_set_create_if_missing(options, 1); - leveldb_options_set_error_if_exists(options, 1); + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_error_if_exists(options, 1); } StartPhase("filter"); for (run = 0; run < 2; run++) { // First run uses custom filter, second run uses bloom filter CheckNoError(err); - leveldb_filterpolicy_t* policy; + rocksdb_filterpolicy_t* policy; if (run == 0) { - policy = leveldb_filterpolicy_create( + policy = rocksdb_filterpolicy_create( NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName); } else { - policy = leveldb_filterpolicy_create_bloom(10); + policy = rocksdb_filterpolicy_create_bloom(10); } // Create new database - leveldb_close(db); - leveldb_destroy_db(options, dbname, &err); - leveldb_options_set_filter_policy(options, policy); - db = leveldb_open(options, dbname, &err); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_filter_policy(options, policy); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); - leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); CheckNoError(err); - leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); CheckNoError(err); 
- leveldb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_compact_range(db, NULL, 0, NULL, 0); fake_filter_result = 1; CheckGet(db, roptions, "foo", "foovalue"); @@ -372,18 +372,18 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "foo", "foovalue"); CheckGet(db, roptions, "bar", "barvalue"); } - leveldb_options_set_filter_policy(options, NULL); - leveldb_filterpolicy_destroy(policy); + rocksdb_options_set_filter_policy(options, NULL); + rocksdb_filterpolicy_destroy(policy); } StartPhase("cleanup"); - leveldb_close(db); - leveldb_options_destroy(options); - leveldb_readoptions_destroy(roptions); - leveldb_writeoptions_destroy(woptions); - leveldb_cache_destroy(cache); - leveldb_comparator_destroy(cmp); - leveldb_env_destroy(env); + rocksdb_close(db); + rocksdb_options_destroy(options); + rocksdb_readoptions_destroy(roptions); + rocksdb_writeoptions_destroy(woptions); + rocksdb_cache_destroy(cache); + rocksdb_comparator_destroy(cmp); + rocksdb_env_destroy(env); fprintf(stderr, "PASS\n"); return 0; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 787bcf4315..e093e9448d 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -54,171 +54,171 @@ extern "C" { /* Exported types */ -typedef struct leveldb_t leveldb_t; -typedef struct leveldb_cache_t leveldb_cache_t; -typedef struct leveldb_comparator_t leveldb_comparator_t; -typedef struct leveldb_env_t leveldb_env_t; -typedef struct leveldb_filelock_t leveldb_filelock_t; -typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t; -typedef struct leveldb_iterator_t leveldb_iterator_t; -typedef struct leveldb_logger_t leveldb_logger_t; -typedef struct leveldb_options_t leveldb_options_t; -typedef struct leveldb_randomfile_t leveldb_randomfile_t; -typedef struct leveldb_readoptions_t leveldb_readoptions_t; -typedef struct leveldb_seqfile_t leveldb_seqfile_t; -typedef struct leveldb_snapshot_t leveldb_snapshot_t; -typedef struct leveldb_writablefile_t leveldb_writablefile_t; -typedef struct 
leveldb_writebatch_t leveldb_writebatch_t; -typedef struct leveldb_writeoptions_t leveldb_writeoptions_t; +typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_env_t rocksdb_env_t; +typedef struct rocksdb_filelock_t rocksdb_filelock_t; +typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t; +typedef struct rocksdb_iterator_t rocksdb_iterator_t; +typedef struct rocksdb_logger_t rocksdb_logger_t; +typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; /* DB operations */ -extern leveldb_t* leveldb_open( - const leveldb_options_t* options, +extern rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); -extern void leveldb_close(leveldb_t* db); +extern void rocksdb_close(rocksdb_t* db); -extern void leveldb_put( - leveldb_t* db, - const leveldb_writeoptions_t* options, +extern void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, const char* val, size_t vallen, char** errptr); -extern void leveldb_delete( - leveldb_t* db, - const leveldb_writeoptions_t* options, +extern void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, char** errptr); -extern void leveldb_write( - leveldb_t* db, - const leveldb_writeoptions_t* options, - leveldb_writebatch_t* batch, +extern void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); /* Returns NULL 
if not found. A malloc()ed array otherwise. Stores the length of the array in *vallen. */ -extern char* leveldb_get( - leveldb_t* db, - const leveldb_readoptions_t* options, +extern char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr); -extern leveldb_iterator_t* leveldb_create_iterator( - leveldb_t* db, - const leveldb_readoptions_t* options); +extern rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options); -extern const leveldb_snapshot_t* leveldb_create_snapshot( - leveldb_t* db); +extern const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); -extern void leveldb_release_snapshot( - leveldb_t* db, - const leveldb_snapshot_t* snapshot); +extern void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot); /* Returns NULL if property name is unknown. Else returns a pointer to a malloc()-ed null-terminated value. */ -extern char* leveldb_property_value( - leveldb_t* db, +extern char* rocksdb_property_value( + rocksdb_t* db, const char* propname); -extern void leveldb_approximate_sizes( - leveldb_t* db, +extern void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, const size_t* range_limit_key_len, uint64_t* sizes); -extern void leveldb_compact_range( - leveldb_t* db, +extern void rocksdb_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, const char* limit_key, size_t limit_key_len); /* Management operations */ -extern void leveldb_destroy_db( - const leveldb_options_t* options, +extern void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); -extern void leveldb_repair_db( - const leveldb_options_t* options, +extern void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); /* 
Iterator */ -extern void leveldb_iter_destroy(leveldb_iterator_t*); -extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*); -extern void leveldb_iter_seek_to_first(leveldb_iterator_t*); -extern void leveldb_iter_seek_to_last(leveldb_iterator_t*); -extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen); -extern void leveldb_iter_next(leveldb_iterator_t*); -extern void leveldb_iter_prev(leveldb_iterator_t*); -extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen); -extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen); -extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr); +extern void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen); +extern void rocksdb_iter_next(rocksdb_iterator_t*); +extern void rocksdb_iter_prev(rocksdb_iterator_t*); +extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen); +extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen); +extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr); /* Write batch */ -extern leveldb_writebatch_t* leveldb_writebatch_create(); -extern void leveldb_writebatch_destroy(leveldb_writebatch_t*); -extern void leveldb_writebatch_clear(leveldb_writebatch_t*); -extern void leveldb_writebatch_put( - leveldb_writebatch_t*, +extern rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_put( + rocksdb_writebatch_t*, const char* key, size_t klen, const char* val, size_t vlen); -extern void leveldb_writebatch_delete( - leveldb_writebatch_t*, 
+extern void rocksdb_writebatch_delete( + rocksdb_writebatch_t*, const char* key, size_t klen); -extern void leveldb_writebatch_iterate( - leveldb_writebatch_t*, +extern void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)); /* Options */ -extern leveldb_options_t* leveldb_options_create(); -extern void leveldb_options_destroy(leveldb_options_t*); -extern void leveldb_options_set_comparator( - leveldb_options_t*, - leveldb_comparator_t*); -extern void leveldb_options_set_compression_per_level( - leveldb_options_t* opt, +extern rocksdb_options_t* rocksdb_options_create(); +extern void rocksdb_options_destroy(rocksdb_options_t*); +extern void rocksdb_options_set_comparator( + rocksdb_options_t*, + rocksdb_comparator_t*); +extern void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, int* level_values, size_t num_levels); -extern void leveldb_options_set_filter_policy( - leveldb_options_t*, - leveldb_filterpolicy_t*); -extern void leveldb_options_set_create_if_missing( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_error_if_exists( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_paranoid_checks( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*); -extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*); -extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t); -extern void leveldb_options_set_max_open_files(leveldb_options_t*, int); -extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*); -extern void leveldb_options_set_block_size(leveldb_options_t*, size_t); -extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int); -extern void leveldb_options_set_compression_options( - leveldb_options_t* opt, int w_bits, int 
level, int strategy); +extern void rocksdb_options_set_filter_policy( + rocksdb_options_t*, + rocksdb_filterpolicy_t*); +extern void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); +extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); +extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); +extern void rocksdb_options_set_compression_options( + rocksdb_options_t* opt, int w_bits, int level, int strategy); enum { - leveldb_no_compression = 0, - leveldb_snappy_compression = 1 + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1 }; -extern void leveldb_options_set_compression(leveldb_options_t*, int); +extern void rocksdb_options_set_compression(rocksdb_options_t*, int); /* Comparator */ -extern leveldb_comparator_t* leveldb_comparator_create( +extern rocksdb_comparator_t* rocksdb_comparator_create( void* state, void (*destructor)(void*), int (*compare)( @@ -226,11 +226,11 @@ extern leveldb_comparator_t* leveldb_comparator_create( const char* a, size_t alen, const char* b, size_t blen), const char* (*name)(void*)); -extern void leveldb_comparator_destroy(leveldb_comparator_t*); +extern void rocksdb_comparator_destroy(rocksdb_comparator_t*); /* Filter policy */ -extern leveldb_filterpolicy_t* leveldb_filterpolicy_create( +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( void* state, void (*destructor)(void*), char* 
(*create_filter)( @@ -243,40 +243,40 @@ extern leveldb_filterpolicy_t* leveldb_filterpolicy_create( const char* key, size_t length, const char* filter, size_t filter_length), const char* (*name)(void*)); -extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*); +extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*); -extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom( +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom( int bits_per_key); /* Read options */ -extern leveldb_readoptions_t* leveldb_readoptions_create(); -extern void leveldb_readoptions_destroy(leveldb_readoptions_t*); -extern void leveldb_readoptions_set_verify_checksums( - leveldb_readoptions_t*, +extern rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*); +extern void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); -extern void leveldb_readoptions_set_fill_cache( - leveldb_readoptions_t*, unsigned char); -extern void leveldb_readoptions_set_snapshot( - leveldb_readoptions_t*, - const leveldb_snapshot_t*); +extern void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, + const rocksdb_snapshot_t*); /* Write options */ -extern leveldb_writeoptions_t* leveldb_writeoptions_create(); -extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*); -extern void leveldb_writeoptions_set_sync( - leveldb_writeoptions_t*, unsigned char); +extern rocksdb_writeoptions_t* rocksdb_writeoptions_create(); +extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*); +extern void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); /* Cache */ -extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity); -extern void leveldb_cache_destroy(leveldb_cache_t* cache); +extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity); 
+extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); /* Env */ -extern leveldb_env_t* leveldb_create_default_env(); -extern void leveldb_env_destroy(leveldb_env_t*); +extern rocksdb_env_t* rocksdb_create_default_env(); +extern void rocksdb_env_destroy(rocksdb_env_t*); #ifdef __cplusplus } /* end extern "C" */ From 7cf5728440229c65ddf65c7f8063d0387ee18aa5 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 10 Dec 2013 10:35:06 -0800 Subject: [PATCH 10/40] Cleaning up BackupableDB + fix valgrind errors Summary: Valgrind complained about BackupableDB. This fixes valgrind errors. Also, I cleaned up some code. Test Plan: valgrind does not complain anymore Reviewers: dhruba Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14529 --- utilities/backupable/backupable_db.cc | 40 ++++++++++------------ utilities/backupable/backupable_db_test.cc | 14 ++++++-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 498606045a..f68e821d0e 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -82,6 +82,8 @@ class BackupEngine { std::vector files_; std::unordered_map* file_refs_; Env* env_; + + static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB }; // BackupMeta inline std::string GetAbsolutePath( @@ -144,8 +146,6 @@ class BackupEngine { Env* db_env_; Env* backup_env_; - // constants - static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB }; @@ -438,21 +438,19 @@ Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) { return s; } - char* buf = new char[10]; - Slice data(buf, 0); - + char buf[11]; + Slice data; s = file->Read(10, &data, buf); - if (!s.ok() || data.size() == 0) { - delete[] buf; return s.ok() ? 
Status::Corruption("Latest backup file corrupted") : s; } + buf[data.size()] = 0; + *latest_backup = 0; sscanf(data.data(), "%u", latest_backup); if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) { s = Status::Corruption("Latest backup file corrupted"); } - delete[] buf; return Status::OK(); } @@ -473,7 +471,7 @@ Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) { return s; } - char* file_contents = new char[10]; + char file_contents[10]; int len = sprintf(file_contents, "%u\n", latest_backup); s = file->Append(Slice(file_contents, len)); if (s.ok() && options_.sync) { @@ -519,13 +517,13 @@ Status BackupEngine::CopyFile(const std::string& src, return s; } - char* buf = new char[copy_file_buffer_size_]; - Slice data(buf, 0); + unique_ptr buf(new char[copy_file_buffer_size_]); + Slice data; do { size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? copy_file_buffer_size_ : size_limit; - s = src_file->Read(buffer_to_read, &data, buf); + s = src_file->Read(buffer_to_read, &data, buf.get()); size_limit -= data.size(); if (size != nullptr) { *size += data.size(); @@ -700,12 +698,11 @@ Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { return s; } - char* buf = new char[max_backup_meta_file_size_ + 1]; - Slice data(buf, 0); - s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf); + unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); + Slice data; + s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get()); if (!s.ok() || data.size() == max_backup_meta_file_size_) { - delete[] buf; return s.ok() ? 
Status::IOError("File size too big") : s; } buf[data.size()] = 0; @@ -724,7 +721,6 @@ Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { AddFile(filename, size); } - delete[] buf; return s; } @@ -739,15 +735,15 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) { return s; } - char* buf = new char[max_backup_meta_file_size_]; + unique_ptr buf(new char[max_backup_meta_file_size_]); int len = 0, buf_size = max_backup_meta_file_size_; - len += snprintf(buf, buf_size, "%" PRId64 "\n", timestamp_); - len += snprintf(buf + len, buf_size - len, "%zu\n", files_.size()); + len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); for (size_t i = 0; i < files_.size(); ++i) { - len += snprintf(buf + len, buf_size - len, "%s\n", files_[i].c_str()); + len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str()); } - s = backup_meta_file->Append(Slice(buf, (size_t)len)); + s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); if (s.ok() && sync) { s = backup_meta_file->Sync(); } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 31a5abf87d..58fb036e52 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -124,8 +124,13 @@ class TestEnv : public EnvWrapper { explicit TestEnv(Env* t) : EnvWrapper(t) {} class DummySequentialFile : public SequentialFile { + public: + DummySequentialFile() : SequentialFile(), rnd_(5) {} virtual Status Read(size_t n, Slice* result, char* scratch) { size_t read_size = (n > size_left) ? 
size_left : n; + for (size_t i = 0; i < read_size; ++i) { + scratch[i] = rnd_.Next() & 255; + } *result = Slice(scratch, read_size); size_left -= read_size; return Status::OK(); @@ -137,6 +142,7 @@ class TestEnv : public EnvWrapper { } private: size_t size_left = 200; + Random rnd_; }; Status NewSequentialFile(const std::string& f, @@ -291,9 +297,9 @@ class BackupableDBTest { options_.wal_dir = dbname_; // set up backup db options CreateLoggerFromOptions(dbname_, backupdir_, env_, - Options(), &logger); + Options(), &logger_); backupable_options_.reset(new BackupableDBOptions( - backupdir_, test_backup_env_.get(), logger.get(), true)); + backupdir_, test_backup_env_.get(), logger_.get(), true)); // delete old files in db DestroyDB(dbname_, Options()); @@ -377,7 +383,7 @@ class BackupableDBTest { // options Options options_; unique_ptr backupable_options_; - std::shared_ptr logger; + std::shared_ptr logger_; }; // BackupableDBTest void AppendPath(const std::string& path, std::vector& v) { @@ -432,6 +438,8 @@ TEST(BackupableDBTest, NoDoubleCopy) { ASSERT_EQ(100, size); test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); ASSERT_EQ(200, size); + + CloseBackupableDB(); } // test various kind of corruptions that may happen: From cbe7ffef9a2f63c44eb4f8d27cae2e5313b91069 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 10 Dec 2013 10:48:49 -0800 Subject: [PATCH 11/40] fix comparison between signed and unsigned --- utilities/backupable/backupable_db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 58fb036e52..738e11b914 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -435,9 +435,9 @@ TEST(BackupableDBTest, NoDoubleCopy) { // MANIFEST file size should be only 100 uint64_t size; test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); - ASSERT_EQ(100, 
size); + ASSERT_EQ(100UL, size); test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); - ASSERT_EQ(200, size); + ASSERT_EQ(200UL, size); CloseBackupableDB(); } From 4815468be40121750521e94cf1c657cdab90137d Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 10 Dec 2013 10:52:47 -0800 Subject: [PATCH 12/40] Fix another sign and unsign comparison in test --- utilities/backupable/backupable_db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 738e11b914..69ffa6e562 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -614,7 +614,7 @@ TEST(BackupableDBTest, OnlineIntegrationTest) { std::vector backup_info; restore_db_->GetBackupInfo(&backup_info); - ASSERT_EQ(2, backup_info.size()); + ASSERT_EQ(2UL, backup_info.size()); // check backup 3 AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); From 19f5463d3f13d39d27df26672149b26520c124e1 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 10 Dec 2013 10:57:46 -0800 Subject: [PATCH 13/40] Don't LogFlush() in foreground threads Summary: So fflush() takes a lock which is heavyweight. I added flush_pending_, but more importantly, I removed LogFlush() from foreground threads. Test Plan: ./db_test Reviewers: dhruba, haobo Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14535 --- db/db_impl.cc | 3 --- util/posix_logger.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 697d0017b5..20ab82a0c3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2643,7 +2643,6 @@ Status DBImpl::GetImpl(const ReadOptions& options, delete m; for (MemTable* v: to_delete) delete v; - LogFlush(options_.info_log); // Note, tickers are atomic now - no lock protection needed any more. 
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); RecordTick(options_.statistics.get(), BYTES_READ, value->size()); @@ -2729,7 +2728,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, delete m; for (MemTable* v: to_delete) delete v; - LogFlush(options_.info_log); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead); @@ -2877,7 +2875,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } - LogFlush(options_.info_log); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); diff --git a/util/posix_logger.h b/util/posix_logger.h index 0a09bd1ebc..99d7ed9997 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -36,15 +36,19 @@ class PosixLogger : public Logger { const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; + bool flush_pending_; public: PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) : file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)), - last_flush_micros_(0), env_(env) { } + last_flush_micros_(0), env_(env), flush_pending_(false) { } virtual ~PosixLogger() { fclose(file_); } virtual void Flush() { - fflush(file_); + if (flush_pending_) { + flush_pending_ = false; + fflush(file_); + } last_flush_micros_ = env_->NowMicros(); } virtual void Logv(const char* format, va_list ap) { @@ -124,6 +128,7 @@ class PosixLogger : public Logger { #endif size_t sz = fwrite(base, 1, write_size, file_); + flush_pending_ = true; assert(sz == write_size); if (sz > 0) { log_size_ += write_size; @@ -131,6 +136,7 @@ class PosixLogger : public Logger { uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = 
false; fflush(file_); last_flush_micros_ = now_micros; } From 204bb9cffd7ea5dd067b9e72650a4ca991da3867 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 10 Dec 2013 10:59:00 -0800 Subject: [PATCH 14/40] Get rid of LogFlush() in InternalIterator --- db/db_impl.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 20ab82a0c3..da7a1a4b95 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2559,7 +2559,6 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); mutex_.Unlock(); - LogFlush(options_.info_log); return internal_iter; } From 5e4ab767cf5b9df49de6a879d90cd18d5ca88f5e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 10 Dec 2013 20:49:28 -0800 Subject: [PATCH 15/40] BackupableDB delete backups with newer seq number Summary: We now delete backups with newer sequence number, so the clients don't have to handle confusing situations when they restore from backup. Test Plan: added a unit test Reviewers: dhruba Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14547 --- include/utilities/backupable_db.h | 13 +++-- utilities/backupable/backupable_db.cc | 56 +++++++++++++++++-- utilities/backupable/backupable_db_test.cc | 65 +++++++++++++++++----- 3 files changed, 110 insertions(+), 24 deletions(-) diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h index b90c3e93a3..335e028576 100644 --- a/include/utilities/backupable_db.h +++ b/include/utilities/backupable_db.h @@ -78,6 +78,9 @@ class BackupableDB : public StackableDB { public: // BackupableDBOptions have to be the same as the ones used in a previous // incarnation of the DB + // + // BackupableDB ownes the pointer `DB* db` now. 
You should not delete it or + // use it after the invocation of BackupableDB BackupableDB(DB* db, const BackupableDBOptions& options); virtual ~BackupableDB(); @@ -106,10 +109,12 @@ class RestoreBackupableDB { // restore from backup with backup_id // IMPORTANT -- if you restore from some backup that is not the latest, - // you HAVE to delete all the newer backups immediately, before creating - // new backup on the restored database. Otherwise, your new backups - // will be corrupted. - // TODO should we enforce this somehow? + // and you start creating new backups from the new DB, all the backups + // that were newer than the backup you restored from will be deleted + // + // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. + // If you try creating a new backup now, old backups 4 and 5 will be deleted + // and new backup with ID 4 will be created. Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, const std::string& wal_dir); diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index f68e821d0e..7d605c9684 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -40,6 +40,8 @@ class BackupEngine { return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir); } + void DeleteBackupsNewerThan(uint64_t sequence_number); + private: class BackupMeta { public: @@ -59,6 +61,12 @@ class BackupEngine { uint64_t GetSize() const { return size_; } + void SetSequenceNumber(uint64_t sequence_number) { + sequence_number_ = sequence_number; + } + uint64_t GetSequenceNumber() { + return sequence_number_; + } void AddFile(const std::string& filename, uint64_t size); void Delete(); @@ -76,6 +84,9 @@ class BackupEngine { private: int64_t timestamp_; + // sequence number is only approximate, should not be used + // by clients + uint64_t sequence_number_; uint64_t size_; std::string const meta_filename_; // files with relative paths (without "/" prefix!!) 
@@ -232,11 +243,31 @@ BackupEngine::~BackupEngine() { LogFlush(options_.info_log); } +void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) { + for (auto backup : backups_) { + if (backup.second.GetSequenceNumber() > sequence_number) { + Log(options_.info_log, + "Deleting backup %u because sequence number (%lu) is newer than %lu", + backup.first, backup.second.GetSequenceNumber(), sequence_number); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + for (auto ob : obsolete_backups_) { + backups_.erase(backups_.find(ob)); + } + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first; + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(false); +} + Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { Status s; std::vector live_files; VectorLogPtr live_wal_files; uint64_t manifest_file_size = 0; + uint64_t sequence_number = db->GetLatestSequenceNumber(); s = db->DisableFileDeletions(); if (s.ok()) { @@ -261,6 +292,7 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { assert(ret.second == true); auto& new_backup = ret.first->second; new_backup.RecordTimestamp(); + new_backup.SetSequenceNumber(sequence_number); Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); @@ -603,8 +635,8 @@ void BackupEngine::GarbageCollection(bool full_scan) { Log(options_.info_log, "Deleting private dir %s -- %s", private_dir.c_str(), s.ToString().c_str()); } - obsolete_backups_.clear(); } + obsolete_backups_.clear(); if (full_scan) { Log(options_.info_log, "Starting full scan garbage collection"); @@ -684,6 +716,7 @@ void BackupEngine::BackupMeta::Delete() { // each backup meta file is of the format: // +// // // // @@ -711,14 +744,24 @@ Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { int bytes_read = 0; sscanf(data.data(), "%ld%n", ×tamp_, &bytes_read); 
data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%lu%n", &sequence_number_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' sscanf(data.data(), "%u%n", &num_files, &bytes_read); data.remove_prefix(bytes_read + 1); // +1 for '\n' + std::vector> files; + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { std::string filename = GetSliceUntil(&data, '\n').ToString(); uint64_t size; s = env_->GetFileSize(backup_dir + "/" + filename, &size); - AddFile(filename, size); + files.push_back(std::make_pair(filename, size)); + } + + if (s.ok()) { + for (auto file : files) { + AddFile(file.first, file.second); + } } return s; @@ -738,6 +781,8 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) { unique_ptr buf(new char[max_backup_meta_file_size_]); int len = 0, buf_size = max_backup_meta_file_size_; len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n", + sequence_number_); len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); for (size_t i = 0; i < files_.size(); ++i) { len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str()); @@ -758,9 +803,10 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) { // --- BackupableDB methods -------- -BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) : - StackableDB(db), - backup_engine_(new BackupEngine(db->GetEnv(), options)) {} +BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) + : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) { + backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); +} BackupableDB::~BackupableDB() { delete backup_engine_; diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 69ffa6e562..af4af0d02e 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -7,6 +7,7 
@@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/types.h" #include "rocksdb/transaction_log.h" #include "utilities/utility_db.h" #include "utilities/backupable_db.h" @@ -29,7 +30,11 @@ class DummyDB : public StackableDB { /* implicit */ DummyDB(const Options& options, const std::string& dbname) : StackableDB(nullptr), options_(options), dbname_(dbname), - deletions_enabled_(true) {} + deletions_enabled_(true), sequence_number_(0) {} + + virtual SequenceNumber GetLatestSequenceNumber() const { + return ++sequence_number_; + } virtual const std::string& GetName() const override { return dbname_; @@ -117,6 +122,7 @@ class DummyDB : public StackableDB { Options options_; std::string dbname_; bool deletions_enabled_; + mutable SequenceNumber sequence_number_; }; // DummyDB class TestEnv : public EnvWrapper { @@ -292,7 +298,7 @@ class BackupableDBTest { // set up db options options_.create_if_missing = true; options_.paranoid_checks = true; - options_.write_buffer_size = 1 << 19; // 512KB + options_.write_buffer_size = 1 << 17; // 128KB options_.env = test_db_env_.get(); options_.wal_dir = dbname_; // set up backup db options @@ -305,6 +311,12 @@ class BackupableDBTest { DestroyDB(dbname_, Options()); } + DB* OpenDB() { + DB* db; + ASSERT_OK(DB::Open(options_, dbname_, &db)); + return db; + } + void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) { // reset all the defaults test_backup_env_->SetLimitWrittenFiles(1000000); @@ -354,12 +366,12 @@ class BackupableDBTest { } else { ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); } - OpenBackupableDB(); - AssertExists(db_.get(), start_exist, end_exist); + DB* db = OpenDB(); + AssertExists(db, start_exist, end_exist); if (end != 0) { - AssertEmpty(db_.get(), end_exist, end); + AssertEmpty(db, end_exist, end); } - CloseBackupableDB(); + delete db; if 
(opened_restore) { CloseRestoreDB(); } @@ -450,7 +462,7 @@ TEST(BackupableDBTest, NoDoubleCopy) { // not be able to open that backup, but all other backups should be // fine TEST(BackupableDBTest, CorruptionsTest) { - const int keys_iteration = 20000; + const int keys_iteration = 5000; Random rnd(6); Status s; @@ -516,7 +528,7 @@ TEST(BackupableDBTest, CorruptionsTest) { // open DB, write, close DB, backup, restore, repeat TEST(BackupableDBTest, OfflineIntegrationTest) { // has to be a big number, so that it triggers the memtable flush - const int keys_iteration = 20000; + const int keys_iteration = 5000; const int max_key = keys_iteration * 4 + 10; // first iter -- flush before backup // second iter -- don't flush before backup @@ -542,9 +554,9 @@ TEST(BackupableDBTest, OfflineIntegrationTest) { DestroyDB(dbname_, Options()); // ---- make sure it's empty ---- - OpenBackupableDB(); - AssertEmpty(db_.get(), 0, fill_up_to); - CloseBackupableDB(); + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; // ---- restore the DB ---- OpenRestoreDB(); @@ -563,7 +575,7 @@ TEST(BackupableDBTest, OfflineIntegrationTest) { // open DB, write, backup, write, backup, close, restore TEST(BackupableDBTest, OnlineIntegrationTest) { // has to be a big number, so that it triggers the memtable flush - const int keys_iteration = 20000; + const int keys_iteration = 5000; const int max_key = keys_iteration * 4 + 10; Random rnd(7); // delete old data @@ -591,9 +603,9 @@ TEST(BackupableDBTest, OnlineIntegrationTest) { DestroyDB(dbname_, Options()); // ---- make sure it's empty ---- - OpenBackupableDB(); - AssertEmpty(db_.get(), 0, max_key); - CloseBackupableDB(); + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; // ---- restore every backup and verify all the data is there ---- OpenRestoreDB(); @@ -624,6 +636,29 @@ TEST(BackupableDBTest, OnlineIntegrationTest) { CloseRestoreDB(); } +TEST(BackupableDBTest, DeleteNewerBackups) { + // create backups 1, 2, 3, 4, 5 + 
OpenBackupableDB(true); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), 100 * i, 100 * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + // backup 3 is fine + AssertBackupConsistency(3, 0, 300, 500); + // this should delete backups 4 and 5 + OpenBackupableDB(); + CloseBackupableDB(); + // backups 4 and 5 don't exist + OpenRestoreDB(); + Status s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + ASSERT_TRUE(s.IsNotFound()); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(s.IsNotFound()); + CloseRestoreDB(); +} + } // anon namespace } // namespace rocksdb From 43c386b72ee834c88a1a22500ce1fc36a8208277 Mon Sep 17 00:00:00 2001 From: James Golick Date: Tue, 10 Dec 2013 22:34:19 -0800 Subject: [PATCH 16/40] only try to use fallocate if it's actually present on the system --- build_tools/build_detect_platform | 12 ++++++++++++ util/env_posix.cc | 8 ++++---- util/posix_logger.h | 2 +- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 59e2e46195..96a1fb3319 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -189,6 +189,18 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" fi + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + # Test whether Snappy library is installed # http://code.google.com/p/snappy/ $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < Date: Wed, 11 Dec 2013 08:33:29 -0800 Subject: [PATCH 17/40] [RocksDB perf] Cache speedup Summary: I have ran a get benchmark where all the data is in the cache and observed that most of the time is spent on waiting for lock in LRUCache. This is an effort to optimize LRUCache. 
Test Plan: The data was loaded with fillseq. Then, I ran a benchmark: /db_bench --db=/tmp/rocksdb_stat_bench --num=1000000 --benchmarks=readrandom --statistics=1 --use_existing_db=1 --threads=16 --disable_seek_compaction=1 --cache_size=20000000000 --cache_numshardbits=8 --table_cache_numshardbits=8 I ran the benchmark three times. Here are the results: AFTER THE PATCH: 798072, 803998, 811807 BEFORE THE PATCH: 782008, 815593, 763017 Reviewers: dhruba, haobo, kailiu Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14571 --- util/cache.cc | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/util/cache.cc b/util/cache.cc index deec528640..8fa03626b1 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "rocksdb/cache.h" #include "port/port.h" @@ -111,8 +111,8 @@ class HandleTable { } void Resize() { - uint32_t new_length = 4; - while (new_length < elems_) { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { new_length *= 2; } LRUHandle** new_list = new LRUHandle*[new_length]; @@ -255,18 +255,20 @@ Cache::Handle* LRUCache::Insert( LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); - std::list last_reference_list; + std::vector last_reference_list; + last_reference_list.reserve(1); + + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); { MutexLock l(&mutex_); - e->value = value; - e->deleter = deleter; - e->charge = charge; - e->key_length = key.size(); - e->hash = hash; - e->refs = 2; // One from LRUCache, one for the returned handle - memcpy(e->key_data, key.data(), key.size()); LRU_Append(e); LRUHandle* old = table_.Insert(e); From 0304e3d2ff78ba0f173289d3762a880677d84301 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: 
Tue, 10 Dec 2013 20:03:27 -0800 Subject: [PATCH 18/40] When flushing mem tables, create iterators out of mutex Summary: creating new iterators of mem tables can be expensive. Move them out of mutex. DBImpl::WriteLevel0Table()'s mems seems to be a local vector and is only used by flushing. memtables to flush are also immutable, so it should be safe to do so. Test Plan: make all check Reviewers: haobo, dhruba, kailiu Reviewed By: dhruba CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14577 Conflicts: db/db_impl.cc --- db/db_impl.cc | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index da7a1a4b95..53d2acdb09 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1053,27 +1053,26 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, *filenumber = meta.number; pending_outputs_.insert(meta.number); - std::vector list; - for (MemTable* m : mems) { - Log(options_.info_log, - "Flushing memtable with log file: %lu\n", - (unsigned long)m->GetLogNumber()); - list.push_back(m->NewIterator()); - } - Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], - list.size()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mems[0]->GetFirstSequenceNumber(); - Log(options_.info_log, - "Level-0 flush table #%lu: started", - (unsigned long)meta.number); - Version* base = versions_->current(); base->Ref(); // it is likely that we do not need this reference Status s; { mutex_.Unlock(); + std::vector list; + for (MemTable* m : mems) { + Log(options_.info_log, + "Flushing memtable with log file: %lu\n", + (unsigned long)m->GetLogNumber()); + list.push_back(m->NewIterator()); + } + Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0], + list.size()); + Log(options_.info_log, + "Level-0 flush table #%lu: started", + (unsigned long)meta.number); // We skip compression if universal 
compression is used and the size // threshold is set for compression. bool enable_compression = (options_.compaction_style @@ -1084,15 +1083,15 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, user_comparator(), newest_snapshot, earliest_seqno_in_memtable, enable_compression); LogFlush(options_.info_log); + delete iter; + Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); mutex_.Lock(); } base->Unref(); - Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", - (unsigned long) meta.number, - (unsigned long) meta.file_size, - s.ToString().c_str()); - delete iter; // re-acquire the most current version base = versions_->current(); From bc5dd19b141b1faaba28ac8b122dc5d3d6fa1f56 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 9 Dec 2013 14:28:26 -0800 Subject: [PATCH 19/40] [RocksDB Performance Branch] Avoid sorting in Version::Get() by presorting them in VersionSet::Builder::SaveTo() Summary: Pre-sort files in VersionSet::Builder::SaveTo() so that when getting the value, no need to sort them. It can avoid the costs of vector operations and sorting in Version::Get(). Test Plan: make all check Reviewers: haobo, kailiu, dhruba Reviewed By: dhruba CC: nkg-, igor, leveldb Differential Revision: https://reviews.facebook.net/D14409 --- db/version_set.cc | 84 +++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index adee80d04f..2ebb64adf0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -450,17 +450,12 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). 
- std::vector important_files; for (int level = 0; level < vset_->NumberLevels(); level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; // Get the list of files to search in this level FileMetaData* const* files = &files_[level][0]; - important_files.clear(); - if (level == 0) { - important_files.reserve(num_files); - } // Some files may overlap each other. We find // all files that overlap user_key and process them in order from @@ -478,44 +473,42 @@ void Version::Get(const ReadOptions& options, start_index = FindFile(vset_->icmp_, files_[level], ikey); } - // Traverse the list, finding all overlapping files. - for (uint32_t i = start_index; i < num_files; i++) { - FileMetaData* f = files[i]; - if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && - ucmp->Compare(user_key, f->largest.user_key()) <= 0) { - important_files.push_back(f); - } else if (level > 0) { - // If on Level-n (n>=1) then the files are sorted. - // So we can stop looking when we are past the ikey. 
- break; - } - } - - if (important_files.empty()) continue; - - if (level == 0) { - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { - std::sort(important_files.begin(), important_files.end(), NewestFirstBySeqNo); - } else { - std::sort(important_files.begin(), important_files.end(), NewestFirst); - } - } else { - // Sanity check to make sure that the files are correctly sorted -#ifndef NDEBUG - num_files = important_files.size(); - for (uint32_t i = 1; i < num_files; ++i) { - FileMetaData* a = important_files[i-1]; - FileMetaData* b = important_files[i]; - int comp_sign = vset_->icmp_.Compare(a->largest, b->smallest); - assert(comp_sign < 0); - } -#endif - } - // Traverse each relevant file to find the desired key - num_files = important_files.size(); - for (uint32_t i = 0; i < num_files; ++i) { - FileMetaData* f = important_files[i]; +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + for (uint32_t i = start_index; i < num_files; ++i) { + FileMetaData* f = files[i]; + if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 || + ucmp->Compare(user_key, f->largest.user_key()) > 0) { + // Only process overlapping files. + if (level > 0) { + // If on Level-n (n>=1) then the files are sorted. + // So we can stop looking when we are past the ikey. + break; + } + // TODO: do we want to check file ranges for level0 files at all? + // For new SST format where Get() is fast, we might want to consider + // to avoid those two comparisons, if it can filter out too few files. + continue; + } +#ifndef NDEBUG + // Sanity check to make sure that the files are correctly sorted + if (prev_file) { + if (level != 0) { + int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous one. 
+ if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + assert(!NewestFirstBySeqNo(f, prev_file)); + } else { + assert(!NewestFirst(f, prev_file)); + } + } + } + prev_file = f; +#endif bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, ikey, &saver, SaveValue, &tableIO, @@ -1113,6 +1106,13 @@ class VersionSet::Builder { MaybeAddFile(v, level, *base_iter); } } + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); + } + CheckConsistency(v); } From c28dd2a891cd1fe032c943798f5809bc087070f6 Mon Sep 17 00:00:00 2001 From: James Golick Date: Wed, 11 Dec 2013 11:18:00 -0800 Subject: [PATCH 20/40] oops - missed a spot --- util/env_posix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index e81c59dcc5..2be524e95d 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1297,7 +1297,7 @@ class PosixEnv : public Env { } bool SupportsFastAllocate(const std::string& path) { -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; if (statfs(path.c_str(), &s)){ return false; From a8029fdc752acfca9c509f28509919b25fb711f4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 2 Dec 2013 18:34:05 -0800 Subject: [PATCH 21/40] Introduce MergeContext to Lazily Initialize merge operand list Summary: In get operations, merge_operands is only used in few cases. 
Lazily initialize it can reduce average latency in some cases Test Plan: make all check Reviewers: haobo, kailiu, dhruba Reviewed By: haobo CC: igor, nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14415 Conflicts: db/db_impl.cc db/memtable.cc --- db/db_impl.cc | 21 +++++----- db/db_impl_readonly.cc | 8 ++-- db/memtable.cc | 32 +++++++-------- db/memtable.h | 3 +- db/memtablelist.cc | 5 +-- db/memtablelist.h | 2 +- db/merge_context.h | 69 ++++++++++++++++++++++++++++++++ db/version_set.cc | 34 +++++++++------- db/version_set.h | 7 ++-- include/rocksdb/merge_operator.h | 1 + 10 files changed, 129 insertions(+), 53 deletions(-) create mode 100644 db/merge_context.h diff --git a/db/db_impl.cc b/db/db_impl.cc index 53d2acdb09..49423d7766 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -27,6 +27,7 @@ #include "db/log_writer.h" #include "db/memtable.h" #include "db/memtablelist.h" +#include "db/merge_context.h" #include "db/merge_helper.h" #include "db/prefix_filter_iterator.h" #include "db/table_cache.h" @@ -2608,20 +2609,20 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Prepare to store a list of merge operations if merge occurs. - std::deque merge_operands; + MergeContext merge_context; // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { + } else if (imm.Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { - current->Get(options, lkey, value, &s, &merge_operands, &stats, + current->Get(options, lkey, value, &s, &merge_context, &stats, options_, value_found); have_stat_update = true; RecordTick(options_.statistics.get(), MEMTABLE_MISS); @@ -2676,8 +2677,8 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, bool have_stat_update = false; Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. - std::deque merge_operands; + // Contain a list of merge operations if merge occurs. + MergeContext merge_context; // Note: this always resizes the values array int numKeys = keys.size(); @@ -2692,17 +2693,17 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
for (int i=0; iGet(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { // Done - } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) { + } else if (imm.Get(lkey, value, &s, merge_context, options_)) { // Done } else { - current->Get(options, lkey, value, &s, &merge_operands, &stats, options_); + current->Get(options, lkey, value, &s, &merge_context, &stats, options_); have_stat_update = true; } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 27d5c31ede..dbb297e93a 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -23,6 +23,7 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" +#include "db/merge_context.h" #include "db/table_cache.h" #include "db/version_set.h" #include "db/write_batch_internal.h" @@ -30,6 +31,7 @@ #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" #include "port/port.h" #include "table/block.h" #include "table/merger.h" @@ -57,12 +59,12 @@ Status DBImplReadOnly::Get(const ReadOptions& options, MemTable* mem = GetMemTable(); Version* current = versions_->current(); SequenceNumber snapshot = versions_->LastSequence(); - std::deque merge_operands; + MergeContext merge_context; LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, &merge_operands, options_)) { + if (mem->Get(lkey, value, &s, merge_context, options_)) { } else { Version::GetStats stats; - current->Get(options, lkey, value, &s, &merge_operands, &stats, options_); + current->Get(options, lkey, value, &s, &merge_context, &stats, options_); } return s; } diff --git a/db/memtable.cc b/db/memtable.cc index f86af4e33c..d2a51a125d 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" +#include "db/merge_context.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -162,15 +163,12 @@ void 
MemTable::Add(SequenceNumber s, ValueType type, } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options) { + MergeContext& merge_context, const Options& options) { Slice memkey = key.memtable_key(); std::shared_ptr iter( table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); - // It is the caller's responsibility to allocate/delete operands list - assert(operands != nullptr); - bool merge_in_progress = s->IsMergeInProgress(); auto merge_operator = options.merge_operator.get(); auto logger = options.info_log; @@ -202,8 +200,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::OK(); if (merge_in_progress) { assert(merge_operator); - if (!merge_operator->FullMerge(key.user_key(), &v, *operands, - value, logger.get())) { + if (!merge_operator->FullMerge(key.user_key(), &v, + merge_context.GetOperands(), value, + logger.get())) { RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } @@ -219,8 +218,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (merge_in_progress) { assert(merge_operator); *s = Status::OK(); - if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands, - value, logger.get())) { + if (!merge_operator->FullMerge(key.user_key(), nullptr, + merge_context.GetOperands(), value, + logger.get())) { RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES); *s = Status::Corruption("Error: Could not perform merge."); } @@ -232,16 +232,14 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, case kTypeMerge: { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); merge_in_progress = true; - operands->push_front(v.ToString()); - while(operands->size() >= 2) { + merge_context.PushOperand(v); + while(merge_context.GetNumOperands() >= 2) { // Attempt to associative merge. 
(Returns true if successful) - if (merge_operator->PartialMerge(key.user_key(), - Slice((*operands)[0]), - Slice((*operands)[1]), - &merge_result, - logger.get())) { - operands->pop_front(); - swap(operands->front(), merge_result); + if (merge_operator->PartialMerge(key.user_key(), + merge_context.GetOperand(0), + merge_context.GetOperand(1), + &merge_result, logger.get())) { + merge_context.PushPartialMergeResult(merge_result); } else { // Stack them because user can't associative merge break; diff --git a/db/memtable.h b/db/memtable.h index 7a0d6b3436..79d5ba2d0e 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -22,6 +22,7 @@ namespace rocksdb { class Mutex; class MemTableIterator; +class MergeContext; class MemTable { public: @@ -94,7 +95,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options); + MergeContext& merge_context, const Options& options); // Update the value and return status ok, // if key exists in current memtable diff --git a/db/memtablelist.cc b/db/memtablelist.cc index 3d4d35fd8d..71e4e5a923 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -204,10 +204,9 @@ size_t MemTableList::ApproximateMemoryUsage() { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. 
bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, - const Options& options) { + MergeContext& merge_context, const Options& options) { for (auto &memtable : memlist_) { - if (memtable->Get(key, value, s, operands, options)) { + if (memtable->Get(key, value, s, merge_context, options)) { return true; } } diff --git a/db/memtablelist.h b/db/memtablelist.h index 17c6c3ae4d..ed353c8b87 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -78,7 +78,7 @@ class MemTableList { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - std::deque* operands, const Options& options); + MergeContext& merge_context, const Options& options); // Returns the list of underlying memtables. void GetMemTables(std::vector* list); diff --git a/db/merge_context.h b/db/merge_context.h new file mode 100644 index 0000000000..91d9f8a016 --- /dev/null +++ b/db/merge_context.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include +#include + +namespace rocksdb { + +const std::deque empty_operand_list; + +// The merge context for merging a user key. +// When doing a Get(), DB will create such a class and pass it when +// issuing Get() operation to memtables and version_set. The operands +// will be fetched from the context when issuing partial of full merge. +class MergeContext { +public: + // Clear all the operands + void Clear() { + if (operand_list) { + operand_list->clear(); + } + } + // Replace the first two operands of merge_result, which are expected be the + // merge results of them. 
+ void PushPartialMergeResult(std::string& merge_result) { + assert (operand_list); + operand_list->pop_front(); + swap(operand_list->front(), merge_result); + } + // Push a merge operand + void PushOperand(const Slice& operand_slice) { + Initialize(); + operand_list->push_front(operand_slice.ToString()); + } + // return total number of operands in the list + size_t GetNumOperands() const { + if (!operand_list) { + return 0; + } + return operand_list->size(); + } + // Get the operand at the index. + Slice GetOperand(int index) const { + assert (operand_list); + return (*operand_list)[index]; + } + // Return all the operands. + const std::deque& GetOperands() const { + if (!operand_list) { + return empty_operand_list; + } + return *operand_list; + } +private: + void Initialize() { + if (!operand_list) { + operand_list.reset(new std::deque()); + } + } + std::unique_ptr> operand_list; +}; + +} // namespace rocksdb + diff --git a/db/version_set.cc b/db/version_set.cc index 2ebb64adf0..741752936f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -16,6 +16,7 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" +#include "db/merge_context.h" #include "db/table_cache.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -287,7 +288,8 @@ struct Saver { bool* value_found; // Is value set correctly? Used by KeyMayExist std::string* value; const MergeOperator* merge_operator; - std::deque* merge_operands; // the merge operations encountered + // the merge operations encountered; + MergeContext* merge_context; Logger* logger; bool didIO; // did we do any disk io? 
Statistics* statistics; @@ -309,10 +311,10 @@ static void MarkKeyMayExist(void* arg) { static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ Saver* s = reinterpret_cast(arg); - std::deque* const ops = s->merge_operands; // shorter alias + MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later - assert(s != nullptr && ops != nullptr); + assert(s != nullptr && merge_contex != nullptr); ParsedInternalKey parsed_key; // TODO: didIO and Merge? @@ -331,7 +333,8 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ } else if (kMerge == s->state) { assert(s->merge_operator != nullptr); s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, *ops, + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); s->state = kCorrupt; @@ -346,8 +349,9 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ s->state = kDeleted; } else if (kMerge == s->state) { s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, nullptr, *ops, - s->value, s->logger)) { + if (!s->merge_operator->FullMerge(s->user_key, nullptr, + merge_contex->GetOperands(), + s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); s->state = kCorrupt; } @@ -359,16 +363,15 @@ static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ case kTypeMerge: assert(s->state == kNotFound || s->state == kMerge); s->state = kMerge; - ops->push_front(v.ToString()); - while (ops->size() >= 2) { + merge_contex->PushOperand(v); + while (merge_contex->GetNumOperands() >= 2) { // Attempt to merge operands together via user associateive merge if (s->merge_operator->PartialMerge(s->user_key, - Slice((*ops)[0]), - Slice((*ops)[1]), + merge_contex->GetOperand(0), + merge_contex->GetOperand(1), &merge_result, s->logger)) { - 
ops->pop_front(); - swap(ops->front(), merge_result); + merge_contex->PushPartialMergeResult(merge_result); } else { // Associative merge returns false ==> stack the operands break; @@ -417,7 +420,7 @@ void Version::Get(const ReadOptions& options, const LookupKey& k, std::string* value, Status* status, - std::deque* operands, + MergeContext* merge_context, GetStats* stats, const Options& db_options, bool* value_found) { @@ -436,7 +439,7 @@ void Version::Get(const ReadOptions& options, saver.value_found = value_found; saver.value = value; saver.merge_operator = merge_operator; - saver.merge_operands = operands; + saver.merge_context = merge_context; saver.logger = logger.get(); saver.didIO = false; saver.statistics = db_options.statistics.get(); @@ -557,7 +560,8 @@ void Version::Get(const ReadOptions& options, if (kMerge == saver.state) { // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; - if (merge_operator->FullMerge(user_key, nullptr, *saver.merge_operands, + if (merge_operator->FullMerge(user_key, nullptr, + saver.merge_context->GetOperands(), value, logger.get())) { *status = Status::OK(); } else { diff --git a/db/version_set.h b/db/version_set.h index 38415173c8..bf466a932f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -38,6 +38,7 @@ class MemTable; class TableCache; class Version; class VersionSet; +class MergeContext; // Return the smallest index i such that files[i]->largest >= key. // Return files.size() if there is no such file. @@ -76,9 +77,9 @@ class Version { int seek_file_level; }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, std::deque* operands, GetStats* stats, - const Options& db_option, - bool* value_found = nullptr); + Status* status, MergeContext* merge_context, + GetStats* stats, const Options& db_option, bool* value_found = + nullptr); // Adds "stats" into the current state. 
Returns true if a new // compaction may need to be triggered, false otherwise. diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index ddb3102e3f..bd4c36c077 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -6,6 +6,7 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ #define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#include #include #include #include "rocksdb/slice.h" From f5f5c645a892caa8770d4b61f33570db916b7caf Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 11 Dec 2013 13:51:20 -0800 Subject: [PATCH 22/40] Add readrandom with both memtable and sst regression test Summary: @MarkCallaghan's tests indicate that performance with 8k rows in memtable is much worse than empty memtable. I wanted to add a regression tests that measures this effect, so we could optimize it. However, current config shows 634461 QPS on my devbox. Mark, any idea why this is so much faster than your measurements? Test Plan: Ran the regression test. 
Reviewers: MarkCallaghan, dhruba, haobo Reviewed By: MarkCallaghan CC: leveldb, MarkCallaghan Differential Revision: https://reviews.facebook.net/D14511 --- build_tools/regression_build_test.sh | 116 +++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 17 deletions(-) diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index b0140ef48f..b0c130e3cf 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -26,15 +26,20 @@ function cleanup { } trap cleanup EXIT -git_br=$(basename $GIT_BRANCH) + +if [ -z $GIT_BRANCH ]; then + git_br=`git rev-parse --abbrev-ref HEAD` +else + git_br=$(basename $GIT_BRANCH) +fi + if [ $git_br == "master" ]; then git_br="" else git_br="."$git_br fi -make clean -OPT=-DNDEBUG make db_bench -j$(nproc) +make release # measure fillseq + fill up the DB for overwrite benchmark ./db_bench \ @@ -45,7 +50,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -60,9 +66,10 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --writes=$((NUM / 2)) \ + --writes=$((NUM / 10)) \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -80,7 +87,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=6 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ --histogram=1 \ @@ -96,9 +104,10 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --reads=$NUM \ + --reads=$((NUM / 5)) \ --cache_size=6442450944 \ - --cache_numshardbits=8 \ + --cache_numshardbits=4 \ + 
--table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ --statistics=1 \ @@ -106,18 +115,19 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_data_sync=1 \ --disable_wal=1 \ --sync=0 \ - --threads=32 > ${STAT_FILE}.readrandom + --threads=16 > ${STAT_FILE}.readrandom -# measure readrandom with 300MB block cache +# measure readrandom with 100MB block cache ./db_bench \ --benchmarks=readrandom \ --db=$DATA_DIR \ --use_existing_db=1 \ --bloom_bits=10 \ --num=$NUM \ - --reads=$NUM \ - --cache_size=314572800 \ - --cache_numshardbits=8 \ + --reads=$((NUM / 5)) \ + --cache_size=104857600 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ --statistics=1 \ @@ -125,7 +135,70 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_data_sync=1 \ --disable_wal=1 \ --sync=0 \ - --threads=32 > ${STAT_FILE}.readrandomsmallblockcache + --threads=16 > ${STAT_FILE}.readrandomsmallblockcache + +# measure readrandom with 8k data in memtable +./db_bench \ + --benchmarks=overwrite,readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --writes=512 \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_mem_sst + + +# fill up the db for readrandom benchmark with filluniquerandom (1GB total size) +./db_bench \ + --benchmarks=filluniquerandom \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --writes=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# 
measure readrandom after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --disable_auto_compactions=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom # measure memtable performance -- none of the data gets flushed to disk ./db_bench \ @@ -135,7 +208,8 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=8 \ + --cache_numshardbits=4 \ + --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -145,13 +219,19 @@ OPT=-DNDEBUG make db_bench -j$(nproc) --disable_wal=1 \ --sync=0 \ --value_size=10 \ - --threads=32 > ${STAT_FILE}.memtablefillreadrandom + --threads=16 > ${STAT_FILE}.memtablefillreadrandom # send data to ods function send_to_ods { key="$1" value="$2" + if [ -z $JENKINS_HOME ]; then + # running on devbox, just print out the values + echo $1 $2 + return + fi + if [ -z "$value" ];then echo >&2 "ERROR: Key $key doesn't have a value." 
return @@ -180,5 +260,7 @@ send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache +send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst +send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom From 249e736bc5f978a58dae669154fd1eb3438f964b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 12 Dec 2013 08:13:47 -0800 Subject: [PATCH 23/40] portable %lu printing --- utilities/backupable/backupable_db.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 7d605c9684..6291546d7d 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -247,7 +247,8 @@ void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) { for (auto backup : backups_) { if (backup.second.GetSequenceNumber() > sequence_number) { Log(options_.info_log, - "Deleting backup %u because sequence number (%lu) is newer than %lu", + "Deleting backup %u because sequence number (%" PRIu64 + ") is newer than %" PRIu64 "", backup.first, backup.second.GetSequenceNumber(), sequence_number); backup.second.Delete(); obsolete_backups_.push_back(backup.first); @@ -742,9 +743,9 @@ Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { uint32_t num_files = 0; int bytes_read = 0; - sscanf(data.data(), "%ld%n", ×tamp_, &bytes_read); + sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); data.remove_prefix(bytes_read + 1); // +1 for '\n' - sscanf(data.data(), "%lu%n", 
&sequence_number_, &bytes_read); + sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); data.remove_prefix(bytes_read + 1); // +1 for '\n' sscanf(data.data(), "%u%n", &num_files, &bytes_read); data.remove_prefix(bytes_read + 1); // +1 for '\n' From e9e6b00d297d19fb143abe7d98bf0ab3c20a3d64 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Mon, 9 Dec 2013 13:43:34 -0800 Subject: [PATCH 24/40] Add monitoring for universal compaction and add counters for compaction IO Summary: Adds these counters { WAL_FILE_SYNCED, "rocksdb.wal.synced" } number of writes that request a WAL sync { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, number of bytes written to the WAL { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, number of writes processed by the calling thread { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, number of writes not processed by the calling thread. Instead these were processed by the current holder of the write lock { WRITE_WITH_WAL, "rocksdb.write.wal" }, number of writes that request WAL logging { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, number of bytes read during compaction { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, number of bytes written during compaction Per-interval stats output was updated with WAL stats and correct stats for universal compaction including a correct value for write-amplification. 
It now looks like: Compactions Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall Stall-cnt -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 0 7 464 46.4 281 3411 3875 3411 0 3875 2.1 12.1 13.8 621 0 240 240 628 0.0 0 Uptime(secs): 310.8 total, 2.0 interval Writes cumulative: 9999999 total, 9999999 batches, 1.0 per batch, 1.22 ingest GB WAL cumulative: 9999999 WAL writes, 9999999 WAL syncs, 1.00 writes per sync, 1.22 GB written Compaction IO cumulative (GB): 1.22 new, 3.33 read, 3.78 write, 7.12 read+write Compaction IO cumulative (MB/sec): 4.0 new, 11.0 read, 12.5 write, 23.4 read+write Amplification cumulative: 4.1 write, 6.8 compaction Writes interval: 100000 total, 100000 batches, 1.0 per batch, 12.5 ingest MB WAL interval: 100000 WAL writes, 100000 WAL syncs, 1.00 writes per sync, 0.01 MB written Compaction IO interval (MB): 12.49 new, 14.98 read, 21.50 write, 36.48 read+write Compaction IO interval (MB/sec): 6.4 new, 7.6 read, 11.0 write, 18.6 read+write Amplification interval: 101.7 write, 102.9 compaction Stalls(secs): 142.924 level0_slowdown, 0.000 level0_numfiles, 0.805 memtable_compaction, 0.000 leveln_slowdown Stalls(count): 132461 level0_slowdown, 0 level0_numfiles, 3 memtable_compaction, 0 leveln_slowdown Task ID: #3329644, #3301695 Blame Rev: Test Plan: Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14583 --- db/db_bench.cc | 2 +- db/db_impl.cc | 126 ++++++++++++++++++++++++++++++----- db/db_impl.h | 20 ++++-- include/rocksdb/statistics.h | 22 +++++- 4 files changed, 146 insertions(+), 24 deletions(-) diff --git a/db/db_bench.cc 
b/db/db_bench.cc index 33c1ecfe12..158a5faa2d 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -149,7 +149,7 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" DEFINE_bool(histogram, false, "Print histogram of operation timings"); -DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, +DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); DEFINE_int32(max_write_buffer_number, diff --git a/db/db_impl.cc b/db/db_impl.cc index 49423d7766..c06d2f5bc1 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1041,6 +1041,7 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { stats.bytes_written = meta.file_size; stats.files_out_levelnp1 = 1; stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } @@ -1129,6 +1130,7 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } @@ -2454,14 +2456,22 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } stats.files_out_levelnp1 = num_output_files; - for (int i = 0; i < compact->compaction->num_input_files(0); i++) + for (int i = 0; i < compact->compaction->num_input_files(0); i++) { stats.bytes_readn += compact->compaction->input(0, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(0, i)->file_size); + } - for (int i = 0; i < compact->compaction->num_input_files(1); i++) + for (int i = 0; i < compact->compaction->num_input_files(1); i++) { stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(1, i)->file_size); + } for (int i = 0; i < 
num_output_files; i++) { stats.bytes_written += compact->outputs[i].file_size; + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + compact->outputs[i].file_size); } LogFlush(options_.info_log); @@ -2810,8 +2820,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { while (!w.done && &w != writers_.front()) { w.cv.Wait(); } + + if (!options.disableWAL) { + RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1); + } + if (w.done) { + RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); return w.status; + } else { + RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); } // May temporarily unlock and wait. @@ -2849,7 +2867,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!options.disableWAL) { StopWatchNano timer(env_); StartPerfTimer(&timer); - status = log_->AddRecord(WriteBatchInternal::Contents(updates)); + Slice log_entry = WriteBatchInternal::Contents(updates); + status = log_->AddRecord(log_entry); + RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); + RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); BumpPerfTime(&perf_context.wal_write_time, &timer); if (status.ok() && options.sync) { if (options_.use_fsync) { @@ -3225,6 +3246,13 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else if (in == "stats") { char buf[1000]; + + uint64_t wal_bytes = 0; + uint64_t wal_synced = 0; + uint64_t user_bytes_written = 0; + uint64_t write_other = 0; + uint64_t write_self = 0; + uint64_t write_with_wal = 0; uint64_t total_bytes_written = 0; uint64_t total_bytes_read = 0; uint64_t micros_up = env_->NowMicros() - started_at_; @@ -3237,6 +3265,16 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { uint64_t interval_bytes_new = 0; double interval_seconds_up = 0; + Statistics* s = options_.statistics.get(); + if (s) { + wal_bytes = s->getTickerCount(WAL_FILE_BYTES); + wal_synced = 
s->getTickerCount(WAL_FILE_SYNCED); + user_bytes_written = s->getTickerCount(BYTES_WRITTEN); + write_other = s->getTickerCount(WRITE_DONE_BY_OTHER); + write_self = s->getTickerCount(WRITE_DONE_BY_SELF); + write_with_wal = s->getTickerCount(WRITE_WITH_WAL); + } + // Pardon the long line but I think it is easier to read this way. snprintf(buf, sizeof(buf), " Compactions\n" @@ -3293,19 +3331,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } } - interval_bytes_new = stats_[0].bytes_written - last_stats_.bytes_new_; - interval_bytes_read = total_bytes_read - last_stats_.bytes_read_; - interval_bytes_written = total_bytes_written - last_stats_.bytes_written_; + interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; + interval_bytes_read = total_bytes_read - last_stats_.compaction_bytes_read_; + interval_bytes_written = + total_bytes_written - last_stats_.compaction_bytes_written_; interval_seconds_up = seconds_up - last_stats_.seconds_up_; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); value->append(buf); + snprintf(buf, sizeof(buf), + "Writes cumulative: %llu total, %llu batches, " + "%.1f per batch, %.2f ingest GB\n", + (unsigned long long) (write_other + write_self), + (unsigned long long) write_self, + (write_other + write_self) / (double) (write_self + 1), + user_bytes_written / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "WAL cumulative: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f GB written\n", + (unsigned long long) write_with_wal, + (unsigned long long ) wal_synced, + write_with_wal / (double) (wal_synced + 1), + wal_bytes / (1048576.0 * 1024)); + value->append(buf); + snprintf(buf, sizeof(buf), "Compaction IO cumulative (GB): " "%.2f new, %.2f read, %.2f write, %.2f read+write\n", - stats_[0].bytes_written / (1048576.0 * 1024), + user_bytes_written / (1048576.0 * 1024), total_bytes_read / (1048576.0 * 1024), 
total_bytes_written / (1048576.0 * 1024), (total_bytes_read + total_bytes_written) / (1048576.0 * 1024)); @@ -3314,7 +3371,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "Compaction IO cumulative (MB/sec): " "%.1f new, %.1f read, %.1f write, %.1f read+write\n", - stats_[0].bytes_written / 1048576.0 / seconds_up, + user_bytes_written / 1048576.0 / seconds_up, total_bytes_read / 1048576.0 / seconds_up, total_bytes_written / 1048576.0 / seconds_up, (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up); @@ -3323,9 +3380,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { // +1 to avoid divide by 0 and NaN snprintf(buf, sizeof(buf), "Amplification cumulative: %.1f write, %.1f compaction\n", - (double) total_bytes_written / (stats_[0].bytes_written+1), - (double) (total_bytes_written + total_bytes_read) - / (stats_[0].bytes_written+1)); + (double) (total_bytes_written + wal_bytes) + / (user_bytes_written + 1), + (double) (total_bytes_written + total_bytes_read + wal_bytes) + / (user_bytes_written + 1)); + value->append(buf); + + uint64_t interval_write_other = write_other - last_stats_.write_other_; + uint64_t interval_write_self = write_self - last_stats_.write_self_; + + snprintf(buf, sizeof(buf), + "Writes interval: %llu total, %llu batches, " + "%.1f per batch, %.1f ingest MB\n", + (unsigned long long) (interval_write_other + interval_write_self), + (unsigned long long) interval_write_self, + (double) (interval_write_other + interval_write_self) + / (interval_write_self + 1), + (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0); + value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - last_stats_.write_with_wal_; + + uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; + uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; + + snprintf(buf, sizeof(buf), + "WAL interval: %llu WAL writes, %llu WAL syncs, " + "%.2f 
writes per sync, %.2f MB written\n", + (unsigned long long) interval_write_with_wal, + (unsigned long long ) interval_wal_synced, + interval_write_with_wal / (double) (interval_wal_synced + 1), + interval_wal_bytes / (1048576.0 * 1024)); value->append(buf); snprintf(buf, sizeof(buf), @@ -3350,9 +3436,10 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { // +1 to avoid divide by 0 and NaN snprintf(buf, sizeof(buf), "Amplification interval: %.1f write, %.1f compaction\n", - (double) interval_bytes_written / (interval_bytes_new+1), - (double) (interval_bytes_written + interval_bytes_read) / - (interval_bytes_new+1)); + (double) (interval_bytes_written + wal_bytes) + / (interval_bytes_new + 1), + (double) (interval_bytes_written + interval_bytes_read + wal_bytes) + / (interval_bytes_new + 1)); value->append(buf); snprintf(buf, sizeof(buf), @@ -3373,10 +3460,15 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { (unsigned long) total_slowdown_count); value->append(buf); - last_stats_.bytes_read_ = total_bytes_read; - last_stats_.bytes_written_ = total_bytes_written; - last_stats_.bytes_new_ = stats_[0].bytes_written; + last_stats_.compaction_bytes_read_ = total_bytes_read; + last_stats_.compaction_bytes_written_ = total_bytes_written; + last_stats_.ingest_bytes_ = user_bytes_written; last_stats_.seconds_up_ = seconds_up; + last_stats_.wal_bytes_ = wal_bytes; + last_stats_.wal_synced_ = wal_synced; + last_stats_.write_with_wal_ = write_with_wal; + last_stats_.write_other_ = write_other; + last_stats_.write_self_ = write_self; return true; } else if (in == "sstables") { diff --git a/db/db_impl.h b/db/db_impl.h index 0591839403..39e1329798 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -441,15 +441,25 @@ class DBImpl : public DB { // Used to compute per-interval statistics struct StatsSnapshot { - uint64_t bytes_read_; - uint64_t bytes_written_; - uint64_t bytes_new_; + uint64_t compaction_bytes_read_; // Bytes read by compaction 
+ uint64_t compaction_bytes_written_; // Bytes written by compaction + uint64_t ingest_bytes_; // Bytes written by user + uint64_t wal_bytes_; // Bytes written to WAL + uint64_t wal_synced_; // Number of times WAL is synced + uint64_t write_with_wal_; // Number of writes that request WAL + // These count the number of writes processed by the calling thread or + // another thread. + uint64_t write_other_; + uint64_t write_self_; double seconds_up_; - StatsSnapshot() : bytes_read_(0), bytes_written_(0), - bytes_new_(0), seconds_up_(0) {} + StatsSnapshot() : compaction_bytes_read_(0), compaction_bytes_written_(0), + ingest_bytes_(0), wal_bytes_(0), wal_synced_(0), + write_with_wal_(0), write_other_(0), write_self_(0), + seconds_up_(0) {} }; + // Counters from the previous time per-interval stats were computed StatsSnapshot last_stats_; static const int KEEP_LOG_FILE_NUM = 1000; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 286a624c8e..011e510f5a 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -114,6 +114,19 @@ enum Tickers { BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. 
+ WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, + + WRITE_WITH_WAL, // Number of Write calls that request WAL + + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + TICKER_ENUM_MAX }; @@ -159,7 +172,14 @@ const std::vector> TickersNameMap = { { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, - { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" } + { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, + { WAL_FILE_SYNCED, "rocksdb.wal.synced" }, + { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, + { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, + { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, + { WRITE_WITH_WAL, "rocksdb.write.wal" }, + { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, + { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, }; /** From 417b453fa66d73afceb4af0444f62bcfb627c056 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 12 Dec 2013 14:57:18 -0800 Subject: [PATCH 25/40] [backupable db] Delete db_dir children when restoring backup Summary: I realized that manifest will get deleted by PurgeObsoleteFiles in DBImpl, but it is still cleaner to delete files before we restore the backup Test Plan: backupable_db_test Reviewers: dhruba Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14619 --- utilities/backupable/backupable_db.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 6291546d7d..61e009cd31 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -422,11 +422,18 @@ Status BackupEngine::RestoreDBFromBackup(BackupID backup_id, // delete log files that might have been already in wal_dir.
// This is important since they might get replayed to the restored DB, // which will then differ from the backuped DB - std::vector wal_dir_children; - db_env_->GetChildren(wal_dir, &wal_dir_children); // ignore errors - for (auto f : wal_dir_children) { + std::vector delete_children; + db_env_->GetChildren(wal_dir, &delete_children); // ignore errors + for (auto f : delete_children) { db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors } + // Also delete all the db_dir children. This is not so important + // because obsolete files will be deleted by DBImpl::PurgeObsoleteFiles() + delete_children.clear(); + db_env_->GetChildren(db_dir, &delete_children); // ignore errors + for (auto f : delete_children) { + db_env_->DeleteFile(db_dir + "/" + f); // ignore errors + } Status s; for (auto& file : backup.GetFiles()) { From 2a2506b62980ff74c54dad6f8073f84d6df31f6d Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Fri, 13 Dec 2013 23:58:18 -0800 Subject: [PATCH 26/40] C bindings: add a bunch of the newer options --- db/c.cc | 151 +++++++++++++++++++++++++++++++++++++++++++- include/rocksdb/c.h | 62 +++++++++++++++++- 2 files changed, 209 insertions(+), 4 deletions(-) diff --git a/db/c.cc b/db/c.cc index 021122301b..36ee2d486b 100644 --- a/db/c.cc +++ b/db/c.cc @@ -20,6 +20,8 @@ #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/write_batch.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" using rocksdb::Cache; using rocksdb::Comparator; @@ -134,6 +136,11 @@ struct rocksdb_env_t { bool is_default; }; +struct rocksdb_universal_compaction_options_t { + rocksdb::CompactionOptionsUniversal *rep; +}; + + static bool SaveError(char** errptr, const Status& s) { assert(errptr != NULL); if (s.ok()) { @@ -531,12 +538,12 @@ void rocksdb_options_set_compression_options( } void rocksdb_options_set_disable_data_sync( - rocksdb_options_t* opt, bool disable_data_sync) { + rocksdb_options_t* opt, int disable_data_sync) { 
opt->rep.disableDataSync = disable_data_sync; } void rocksdb_options_set_use_fsync( - rocksdb_options_t* opt, bool use_fsync) { + rocksdb_options_t* opt, int use_fsync) { opt->rep.use_fsync = use_fsync; } @@ -559,6 +566,95 @@ void rocksdb_options_set_WAL_size_limit_MB( opt->rep.WAL_size_limit_MB = limit; } +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { + opt->rep.max_write_buffer_number = n; +} + +void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { + opt->rep.min_write_buffer_number_to_merge = n; +} + +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { + opt->rep.max_background_compactions = n; +} + +void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { + opt->rep.max_background_flushes = n; +} + +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { + opt->rep.disable_auto_compactions = disable; +} + +void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) { + opt->rep.disable_seek_compaction = disable; +} + +void rocksdb_options_set_source_compaction_factor( + rocksdb_options_t* opt, int n) { + opt->rep.expanded_compaction_factor = n; +} + +void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { + opt->rep.PrepareForBulkLoad(); +} + +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { + static rocksdb::VectorRepFactory* factory = 0; + if (!factory) { + factory = new rocksdb::VectorRepFactory; + } + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { + opt->rep.compaction_style = static_cast(style); +} + +void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { + opt->rep.compaction_options_universal = *(uco->rep); +} + +/* +TODO: +merge_operator +compaction_filter +prefix_extractor 
+whole_key_filtering +max_bytes_for_level_multiplier_additional +delete_obsolete_files_period_micros +max_log_file_size +log_file_time_to_roll +keep_log_file_num +soft_rate_limit +hard_rate_limit +rate_limit_delay_max_milliseconds +max_manifest_file_size +no_block_cache +table_cache_numshardbits +table_cache_remove_scan_count_limit +arena_block_size +manifest_preallocation_size +purge_redundant_kvs_while_flush +allow_os_buffer +allow_mmap_reads +allow_mmap_writes +is_fd_close_on_exec +skip_log_error_on_recovery +stats_dump_period_sec +block_size_deviation +advise_random_on_open +access_hint_on_compaction_start +use_adaptive_mutex +bytes_per_sync +filter_deletes +max_sequential_skip_in_iterations +table_factory +table_properties_collectors +inplace_update_support +inplace_update_num_locks +*/ + rocksdb_comparator_t* rocksdb_comparator_create( void* state, void (*destructor)(void*), @@ -666,6 +762,11 @@ void rocksdb_writeoptions_set_sync( opt->rep.sync = v; } +void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { + opt->rep.disableWAL = disable; +} + + rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); @@ -683,9 +784,55 @@ rocksdb_env_t* rocksdb_create_default_env() { return result; } +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; } +rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t; + result->rep = new rocksdb::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +void 
rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = static_cast(style); +} + +void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t* uco) { + delete uco->rep; + delete uco; +} + } // end extern "C" diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index e093e9448d..a3b18084a8 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -70,6 +70,7 @@ typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t; /* DB operations */ @@ -208,14 +209,46 @@ extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); extern void rocksdb_options_set_compression_options( - rocksdb_options_t* opt, int w_bits, int level, int strategy); + rocksdb_options_t*, int, int, int); +extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int); +extern 
void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); +extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int); +extern void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int); +extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); +extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*); +extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); + enum { rocksdb_no_compression = 0, - rocksdb_snappy_compression = 1 + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 1, + rocksdb_bz2_compression = 1 }; extern void rocksdb_options_set_compression(rocksdb_options_t*, int); +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1 +}; +extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int); +extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*); /* Comparator */ extern rocksdb_comparator_t* rocksdb_comparator_create( @@ -267,6 +300,7 @@ extern rocksdb_writeoptions_t* 
rocksdb_writeoptions_create(); extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*); extern void rocksdb_writeoptions_set_sync( rocksdb_writeoptions_t*, unsigned char); +extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable); /* Cache */ @@ -276,8 +310,32 @@ extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); /* Env */ extern rocksdb_env_t* rocksdb_create_default_env(); +extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); extern void rocksdb_env_destroy(rocksdb_env_t*); +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ; +extern void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + #ifdef __cplusplus } /* end extern "C" */ #endif From 8c34189f0c59fbf0a32882219af3b33d4b21c278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20=C5=A0im=C3=A1nek?= Date: Sat, 14 Dec 2013 04:11:32 +0100 Subject: [PATCH 27/40] Remove .DS_Store files. 
--- utilities/.DS_Store | Bin 6148 -> 0 bytes utilities/merge_operators/.DS_Store | Bin 6148 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 utilities/.DS_Store delete mode 100644 utilities/merge_operators/.DS_Store diff --git a/utilities/.DS_Store b/utilities/.DS_Store deleted file mode 100644 index daeccc094b20ae294ebecc3d166c64a4b84cbe49..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c^3>-s>NED=`++W}iR#EtZd;lPU(m;X^LVXqA#iucT20^|Y0#ZNoaB2iHaNC7GEt$=?Y8r`uMj*0Q< zV2BZbxL`Vr>zE~o%@f34I3_Yfv!oJ}YBge5(wT2n*9*tQq{C|Xu)5i5Lb14==eH<_ z^+ZJ}AO)rhoac7s{r`sk!~8!bX(t7wz`s(!7Mty6%~z`4I(s?qwT*s9_nJ?-8`nW$ mh;~ejcFc{p?}cMx(3uZ9Q9lE&i%bgqwE|z3TNl3o diff --git a/utilities/merge_operators/.DS_Store b/utilities/merge_operators/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Tue, 17 Dec 2013 14:58:42 -0800 Subject: [PATCH 28/40] Get() Does Not Reserve space for to_delete memtables Summary: It seems to be a decision tradeoff in current codes: we make a malloc for every Get() to reduce one malloc for a flush inside mutex. It takes about 5% of CPU time in readrandom tests. We might consider the tradeoff to be the other way around. 
Test Plan: make all check Reviewers: dhruba, haobo, igor Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14697 --- db/db_impl.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c06d2f5bc1..6c57a986db 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2597,7 +2597,6 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_GET); SequenceNumber snapshot; std::vector to_delete; - to_delete.reserve(options_.max_write_buffer_number); mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; @@ -2665,7 +2664,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET); SequenceNumber snapshot; std::vector to_delete; - to_delete.reserve(options_.max_write_buffer_number); mutex_.Lock(); if (options.snapshot != nullptr) { From 14995a8ff3110bbcd19c34cd92a449ca3e435f5d Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Wed, 11 Dec 2013 15:40:22 -0800 Subject: [PATCH 29/40] Move level0 sorting logic from Version::SaveTo() to Version::Finalize() Summary: I realized that "D14409 Avoid sorting in Version::Get() by presorting them in VersionSet::Builder::SaveTo()" is not done in an optimized place. SaveTo() is usually inside mutex. Move it to Finalize(), which is called out of mutex. 
Test Plan: make all check Reviewers: dhruba, haobo, kailiu Reviewed By: dhruba CC: igor, leveldb Differential Revision: https://reviews.facebook.net/D14607 --- db/version_set.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 741752936f..933affd180 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1110,12 +1110,6 @@ class VersionSet::Builder { MaybeAddFile(v, level, *base_iter); } } - // Pre-sort level0 for Get() - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } CheckConsistency(v); } @@ -1681,6 +1675,12 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { void VersionSet::Finalize(Version* v, std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (options_->compaction_style == kCompactionStyleUniversal) { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); + } double max_score = 0; int max_score_level = 0; From e914b6490d1e53d851f59446468e6f596fa52f6d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 18 Dec 2013 13:37:06 -0800 Subject: [PATCH 30/40] Reorder tests Summary: db_test should be the first to execute because it finds the most bugs. Also, when third parties report issues, we don't want ldb error message, we prefer to have db_test error message. 
For example, see thread: https://github.com/facebook/rocksdb/issues/25 Test Plan: make check Reviewers: dhruba, haobo, kailiu Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14715 --- Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6200144c1e..0b113c1b59 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ + db_test \ table_properties_collector_test \ arena_test \ auto_roll_logger_test \ @@ -81,8 +82,7 @@ TESTS = \ version_set_test \ write_batch_test\ deletefile_test \ - table_test \ - db_test + table_test TOOLS = \ sst_dump \ @@ -147,8 +147,9 @@ coverage: # Delete intermediate files find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests +check: all $(PROGRAMS) $(TESTS) $(TOOLS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py ldb_tests: all $(PROGRAMS) $(TOOLS) python tools/ldb_test.py From ca92068b12c7d2c4ba9cfc6022dc7dfaf6ba0708 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Wed, 18 Dec 2013 16:50:48 -0800 Subject: [PATCH 31/40] Add 'readtocache' test Summary: For some tests I want to cache the database prior to running other tests on the same invocation of db_bench. The readtocache test ignores --threads and --reads so those can be used by other tests and it will still do a full read of --num rows with one thread. 
It might be invoked like: db_bench --benchmarks=readtocache,readrandom --reads 100 --num 10000 --threads 8 Task ID: # Blame Rev: Test Plan: run db_bench Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14739 --- db/db_bench.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/db_bench.cc b/db/db_bench.cc index 158a5faa2d..eb5d7cb421 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -48,6 +48,7 @@ DEFINE_string(benchmarks, "compact," "readrandom," "readseq," + "readtocache," "readreverse," "readwhilewriting," "readrandomwriterandom," @@ -75,6 +76,7 @@ DEFINE_string(benchmarks, "\tdeleteseq -- delete N keys in sequential order\n" "\tdeleterandom -- delete N keys in random order\n" "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" "\treadreverse -- read N times in reverse order\n" "\treadrandom -- read N times in random order\n" "\treadmissing -- read N missing keys in random order\n" @@ -1057,6 +1059,10 @@ class Benchmark { method = &Benchmark::WriteRandom; } else if (name == Slice("readseq")) { method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; } else if (name == Slice("readreverse")) { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { From 1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 20 Dec 2013 09:57:58 -0800 Subject: [PATCH 32/40] [RocksDB] Optimize locking for Get Summary: Instead of locking and saving a DB state, we can cache a DB state and update it only when it changes. This change reduces lock contention and speeds up read operations on the DB. 
Performance improvements are substantial, although there is some cost in no-read workloads. I ran the regression tests on my devserver and here are the numbers: overwrite 56345 -> 63001 fillseq 193730 -> 185296 readrandom 771301 -> 1219803 (58% improvement!) readrandom_smallblockcache 677609 -> 862850 readrandom_memtable_sst 710440 -> 1109223 readrandom_fillunique_random 221589 -> 247869 memtablefillrandom 105286 -> 92643 memtablereadrandom 763033 -> 1288862 Test Plan: make asan_check I am also running db_stress Reviewers: dhruba, haobo, sdong, kailiu Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14679 --- db/db_impl.cc | 208 +++++++++++++++++++++++++++++++++++++---------- db/db_impl.h | 74 +++++++++++++++-- db/version_set.h | 8 +- 3 files changed, 237 insertions(+), 53 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 6c57a986db..ece08db8b3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -241,6 +241,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) mem_(new MemTable(internal_comparator_, mem_rep_factory_, NumberLevels(), options_)), logfile_number_(0), + super_version_(nullptr), tmp_batch_(), bg_compaction_scheduled_(0), bg_flush_scheduled_(0), @@ -316,6 +317,13 @@ DBImpl::~DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } mutex_.Unlock(); if (db_lock_ != nullptr) { @@ -345,6 +353,13 @@ void DBImpl::TEST_Destroy_DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } // Prevent new compactions from occuring. 
bg_work_gate_closed_ = true; @@ -443,6 +458,49 @@ void DBImpl::MaybeDumpStats() { } } +// DBImpl::SuperVersion methods +DBImpl::SuperVersion::SuperVersion(const int num_memtables) { + to_delete.resize(num_memtables); +} + +DBImpl::SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool DBImpl::SuperVersion::Unref() { + assert(refs > 0); + // fetch_sub returns the previous value of ref + return refs.fetch_sub(1, std::memory_order_relaxed) == 1; +} + +void DBImpl::SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm.UnrefAll(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current) { + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm.RefAll(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + // Returns the list of live files in 'sst_live' and the list // of all files in the filesystem in 'all_files'. // no_full_scan = true -- never do the full scan using GetChildren() @@ -518,11 +576,6 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // It is not necessary to hold the mutex when invoking this method. 
void DBImpl::PurgeObsoleteFiles(DeletionState& state) { - // free pending memtables - for (auto m : state.memtables_to_free) { - delete m; - } - // check if there is anything to do if (!state.all_files.size() && !state.sst_delete_files.size() && @@ -1188,6 +1241,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, file_number, pending_outputs_, &deletion_state.memtables_to_free); if (s.ok()) { + InstallSuperVersion(deletion_state); if (madeProgress) { *madeProgress = 1; } @@ -1247,11 +1301,17 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) { void DBImpl::ReFitLevel(int level, int target_level) { assert(level < NumberLevels()); - MutexLock l(&mutex_); + SuperVersion* superversion_to_free = nullptr; + SuperVersion* new_superversion = + new SuperVersion(options_.max_write_buffer_number); + + mutex_.Lock(); // only allow one thread refitting if (refitting_level_) { + mutex_.Unlock(); Log(options_.info_log, "ReFitLevel: another thread is refitting"); + delete new_superversion; return; } refitting_level_ = true; @@ -1287,6 +1347,8 @@ void DBImpl::ReFitLevel(int level, int target_level) { edit.DebugString().data()); auto status = versions_->LogAndApply(&edit, &mutex_); + superversion_to_free = InstallSuperVersion(new_superversion); + new_superversion = nullptr; Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); @@ -1298,6 +1360,10 @@ void DBImpl::ReFitLevel(int level, int target_level) { refitting_level_ = false; bg_work_gate_closed_ = false; + + mutex_.Unlock(); + delete superversion_to_free; + delete new_superversion; } int DBImpl::NumberLevels() { @@ -1671,7 +1737,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1717,7 +1783,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { 
void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); MaybeDumpStats(); @@ -1770,7 +1836,7 @@ void DBImpl::BackgroundCallCompaction() { } Status DBImpl::BackgroundCompaction(bool* madeProgress, - DeletionState& deletion_state) { + DeletionState& deletion_state) { *madeProgress = false; mutex_.AssertHeld(); @@ -1823,6 +1889,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); + InstallSuperVersion(deletion_state); VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), @@ -2484,6 +2551,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { status = InstallCompactionResults(compact); + InstallSuperVersion(deletion_state); } VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, @@ -2588,6 +2656,44 @@ Status DBImpl::Get(const ReadOptions& options, return GetImpl(options, key, value); } +// DeletionState gets created and destructed outside of the lock -- we +// use this convinently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete one SuperVersion() outside of the lock -- superversion_to_free +// +// However, if InstallSuperVersion() gets called twice with the same, +// deletion_state, we can't reuse the SuperVersion() that got malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. We do similar thing +// for superversion_to_free +void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { + // if new_superversion == nullptr, it means somebody already used it + SuperVersion* new_superversion = + (deletion_state.new_superversion != nullptr) ? 
+ deletion_state.new_superversion : new SuperVersion(); + SuperVersion* old_superversion = InstallSuperVersion(new_superversion); + deletion_state.new_superversion = nullptr; + if (deletion_state.superversion_to_free != nullptr) { + // somebody already put it there + delete old_superversion; + } else { + deletion_state.superversion_to_free = old_superversion; + } +} + +DBImpl::SuperVersion* DBImpl::InstallSuperVersion( + SuperVersion* new_superversion) { + mutex_.AssertHeld(); + new_superversion->Init(mem_, imm_, versions_->current()); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + if (old_superversion != nullptr && old_superversion->Unref()) { + old_superversion->Cleanup(); + return old_superversion; // will let caller delete outside of mutex + } + return nullptr; +} + Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, @@ -2596,27 +2702,20 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_GET); SequenceNumber snapshot; - std::vector to_delete; - mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } - MemTable* mem = mem_; - MemTableList imm = imm_; - Version* current = versions_->current(); - mem->Ref(); - imm.RefAll(); - current->Ref(); - - // Unlock while reading from files and memtables + // This can be replaced by using atomics and spinlock instead of big mutex + mutex_.Lock(); + SuperVersion* get_version = super_version_->Ref(); mutex_.Unlock(); + bool have_stat_update = false; Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; @@ -2624,32 +2723,41 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, merge_context, options_)) { + if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (imm.Get(lkey, value, &s, merge_context, options_)) { + } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { - current->Get(options, lkey, value, &s, &merge_context, &stats, - options_, value_found); + get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, + options_, value_found); have_stat_update = true; RecordTick(options_.statistics.get(), MEMTABLE_MISS); } - mutex_.Lock(); - if (!options_.disable_seek_compaction && - have_stat_update && current->UpdateStats(stats)) { - MaybeScheduleFlushOrCompaction(); + bool delete_get_version = false; + if (!options_.disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (get_version->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + if (get_version->Unref()) { + get_version->Cleanup(); + delete_get_version = true; + } + mutex_.Unlock(); + } else { + if (get_version->Unref()) { + mutex_.Lock(); + get_version->Cleanup(); + mutex_.Unlock(); + delete_get_version = true; + } + } + if (delete_get_version) { + delete get_version; } - MemTable* m = mem->Unref(); - imm.UnrefAll(&to_delete); - current->Unref(); - mutex_.Unlock(); - - // free up all obsolete memtables outside the mutex - delete m; - for (MemTable* v: to_delete) delete v; // Note, tickers are atomic now - no lock protection needed any more. 
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); @@ -2813,7 +2921,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.done = false; StopWatch sw(env_, options_.statistics.get(), DB_WRITE); - MutexLock l(&mutex_); + mutex_.Lock(); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { w.cv.Wait(); @@ -2824,6 +2932,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } if (w.done) { + mutex_.Unlock(); RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); return w.status; } else { @@ -2831,7 +2940,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } // May temporarily unlock and wait. - Status status = MakeRoomForWrite(my_batch == nullptr); + SuperVersion* superversion_to_free = nullptr; + Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free); uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions @@ -2919,6 +3029,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } + mutex_.Unlock(); + delete superversion_to_free; return status; } @@ -3011,7 +3123,8 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(bool force) { +Status DBImpl::MakeRoomForWrite(bool force, + SuperVersion** superversion_to_free) { mutex_.AssertHeld(); assert(!writers_.empty()); bool allow_delay = !force; @@ -3020,6 +3133,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { uint64_t rate_limit_delay_millis = 0; Status s; double score; + *superversion_to_free = nullptr; while (true) { if (!bg_error_.ok()) { @@ -3146,6 +3260,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { // Do this without holding the dbmutex lock. 
assert(versions_->PrevLogNumber() == 0); uint64_t new_log_number = versions_->NewFileNumber(); + SuperVersion* new_superversion = nullptr; mutex_.Unlock(); { EnvOptions soptions(storage_options_); @@ -3162,6 +3277,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); memtmp = new MemTable( internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + new_superversion = new SuperVersion(options_.max_write_buffer_number); } } mutex_.Lock(); @@ -3186,6 +3302,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { mem_->SetLogNumber(logfile_number_); force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); + *superversion_to_free = InstallSuperVersion(new_superversion); } } return s; @@ -3541,7 +3658,7 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData metadata; int maxlevel = NumberLevels(); VersionEdit edit(maxlevel); - DeletionState deletion_state; + DeletionState deletion_state(0, true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata); @@ -3571,14 +3688,14 @@ Status DBImpl::DeleteFile(std::string name) { } edit.DeleteFile(level, number); status = versions_->LogAndApply(&edit, &mutex_); + if (status.ok()) { + InstallSuperVersion(deletion_state); + } FindObsoleteFiles(deletion_state, false); } // lock released here LogFlush(options_.info_log); - - if (status.ok()) { - // remove files outside the db-lock - PurgeObsoleteFiles(deletion_state); - } + // remove files outside the db-lock + PurgeObsoleteFiles(deletion_state); return status; } @@ -3678,6 +3795,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { s = impl->versions_->LogAndApply(&edit, &impl->mutex_); } if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); impl->mem_->SetLogNumber(impl->logfile_number_); impl->DeleteObsoleteFiles(); impl->MaybeScheduleFlushOrCompaction(); diff --git 
a/db/db_impl.h b/db/db_impl.h index 39e1329798..2447b31fa1 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -128,12 +128,38 @@ class DBImpl : public DB { default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; } - // needed for CleanupIteratorState + // holds references to memtable, all immutable memtables and version + struct SuperVersion { + MemTable* mem; + MemTableList imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm.UnrefAll() returns + // all memtables that we need to free through this vector. We then + // delete all those memtables outside of mutex, during destruction + std::vector to_delete; + // should be called outside the mutex + explicit SuperVersion(const int num_memtables = 0); + ~SuperVersion(); + SuperVersion* Ref(); + // Returns true if this was the last reference and caller should + // call Clenaup() and delete the object + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that needs to be deleted in to_delete vector. Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current); + }; + + // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return memtables_to_free.size() || - all_files.size() || + return all_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -155,15 +181,35 @@ class DBImpl : public DB { // a list of memtables to be free std::vector memtables_to_free; + SuperVersion* superversion_to_free; // if nullptr nothing to free + + SuperVersion* new_superversion; // if nullptr no new superversion + // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. 
uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0) { + explicit DeletionState(const int num_memtables = 0, + bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; memtables_to_free.reserve(num_memtables); + superversion_to_free = nullptr; + new_superversion = + create_superversion ? new SuperVersion(num_memtables) : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversion. if nullptr, this will be noop + delete superversion_to_free; + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; } }; @@ -240,7 +286,11 @@ class DBImpl : public DB { uint64_t* filenumber); uint64_t SlowdownAmount(int n, int top, int bottom); - Status MakeRoomForWrite(bool force /* compact even if there is room? */); + // MakeRoomForWrite will return superversion_to_free through an arugment, + // which the caller needs to delete. We do it because caller can delete + // the superversion outside of mutex + Status MakeRoomForWrite(bool force /* compact even if there is room? */, + SuperVersion** superversion_to_free); WriteBatch* BuildBatchGroup(Writer** last_writer); // Force current memtable contents to be flushed. @@ -324,6 +374,8 @@ class DBImpl : public DB { uint64_t logfile_number_; unique_ptr log_; + SuperVersion* super_version_; + std::string host_name_; // Queue of writers. 
@@ -491,6 +543,18 @@ class DBImpl : public DB { std::vector& snapshots, SequenceNumber* prev_snapshot); + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + // As argument takes a pointer to allocated SuperVersion + // Foreground threads call this function directly (they don't carry + // deletion state and have to handle their own creation and deletion + // of SuperVersion) + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function above. Background threads carry + // deletion_state which can have new_superversion already allocated. + void InstallSuperVersion(DeletionState& deletion_state); + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here Status GetImpl(const ReadOptions& options, diff --git a/db/version_set.h b/db/version_set.h index bf466a932f..75b529942f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -272,12 +272,14 @@ class VersionSet { int64_t NumLevelBytes(int level) const; // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } // Set the last sequence number to s. void SetLastSequence(uint64_t s) { assert(s >= last_sequence_); - last_sequence_ = s; + last_sequence_.store(s, std::memory_order_release); } // Mark the specified file number as used. 
@@ -476,7 +478,7 @@ class VersionSet { const InternalKeyComparator icmp_; uint64_t next_file_number_; uint64_t manifest_file_number_; - uint64_t last_sequence_; + std::atomic last_sequence_; uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted From b26dc9562801d935ceb1f4410fbb709851840c99 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 20 Dec 2013 10:01:12 -0800 Subject: [PATCH 33/40] Initialize sequence number in BatchResult - issue #39 --- include/rocksdb/transaction_log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index abf0725748..41a3250d8d 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -56,7 +56,7 @@ class LogFile { }; struct BatchResult { - SequenceNumber sequence = SequenceNumber(); + SequenceNumber sequence = 0; std::unique_ptr writeBatchPtr; }; From 71ddb117c840f285d19d43eccf252d7c614cefc9 Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Mon, 23 Dec 2013 12:19:18 -0800 Subject: [PATCH 34/40] Add a pointer to the engineering design discussion forum. Summary: Add a pointer to the engineering design discussion forum. Test Plan: Reviewers: CC: Task ID: # Blame Rev: --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index c55149d443..473e4145b5 100644 --- a/README +++ b/README @@ -79,4 +79,4 @@ include/rocksdb/statistics.h include/rocksdb/transaction_log.h An API to retrieve transaction logs from a database. - +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ From c01676e46d3be08c3c140361ef1f5884f47d3b3c Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 12 Dec 2013 15:32:56 -0800 Subject: [PATCH 35/40] Implement autovector Summary: A vector that leverages pre-allocated stack-based array to achieve better performance for array with small amount of items. 
Test Plan: Added tests for both correctness and performance Here is the performance benchmark between vector and autovector Please note that in the test "Creation and Insertion Test", the test case were designed with the motivation described below: * no element inserted: internal array of std::vector may not really get initialize. * one element inserted: internal array of std::vector must have initialized. * kSize elements inserted. This shows the most time we'll spend if we keep everything in stack. * 2 * kSize elements inserted. The internal vector of autovector must have been initialized. Note: kSize is the capacity of autovector ===================================================== Creation and Insertion Test ===================================================== created 100000 vectors: each was inserted with 0 elements total time elapsed: 128000 (ns) created 100000 autovectors: each was inserted with 0 elements total time elapsed: 3641000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 0 elements total time elapsed: 9896000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 1 elements total time elapsed: 11089000 (ns) created 100000 autovectors: each was inserted with 1 elements total time elapsed: 5008000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 1 elements total time elapsed: 24271000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 4 elements total time elapsed: 39369000 (ns) created 100000 autovectors: each was inserted with 4 elements total time elapsed: 10121000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 4 elements total time elapsed: 28473000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 8 elements total time elapsed: 75013000 (ns) created 100000 autovectors: each was inserted with 8 elements total time elapsed: 18237000 (ns) created 100000 VectorWithReserveSizes: each 
was inserted with 8 elements total time elapsed: 42464000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 16 elements total time elapsed: 102319000 (ns) created 100000 autovectors: each was inserted with 16 elements total time elapsed: 76724000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 16 elements total time elapsed: 68285000 (ns) ----------------------------------- ===================================================== Sequence Access Test ===================================================== performed 100000 sequence access against vector size: 4 total time elapsed: 198000 (ns) performed 100000 sequence access against autovector size: 4 total time elapsed: 306000 (ns) ----------------------------------- performed 100000 sequence access against vector size: 8 total time elapsed: 565000 (ns) performed 100000 sequence access against autovector size: 8 total time elapsed: 512000 (ns) ----------------------------------- performed 100000 sequence access against vector size: 16 total time elapsed: 1076000 (ns) performed 100000 sequence access against autovector size: 16 total time elapsed: 1070000 (ns) ----------------------------------- Reviewers: dhruba, haobo, sdong, chip Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14655 --- Makefile | 4 + util/autovector.h | 329 ++++++++++++++++++++++++++++++++++++++++ util/autovector_test.cc | 286 ++++++++++++++++++++++++++++++++++ 3 files changed, 619 insertions(+) create mode 100644 util/autovector.h create mode 100644 util/autovector_test.cc diff --git a/Makefile b/Makefile index 0b113c1b59..68bc489c35 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ + autovector_test \ db_test \ table_properties_collector_test \ arena_test \ @@ -226,6 +227,9 @@ signal_test: util/signal_test.o $(LIBOBJECTS) 
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 0000000000..2b9cb40e9a --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,329 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * reserve()/shrink_to_fit()/resize() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. 
In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect) + , index_(index) { + }; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() { } + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + 
self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { + return !(*this == other); + } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. + return vect_.capacity() == 0; + } + + size_type size() const { + return num_stack_items_ + vect_.size(); + } + + bool empty() const { + return size() == 0; + } + + // will not check boundry + const_reference operator[](size_type n) const { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + reference operator[](size_type n) { + return n < kSize ? 
values_[n] : vect_[n - kSize]; + } + + // will check boundry + const_reference at(size_type n) const { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference at(size_type n) { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + push_back(value_type(item)); + } + + template + void emplace_back(Args&&... args) { + push_back(value_type(args...)); + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + --num_stack_items_; + } + } + + void clear() { + num_stack_items_ = 0; + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { + assign(other); + } + + autovector& operator=(const autovector& other) { + return assign(other); + } + + // move operation are disallowed since it is very hard to make sure both + // autovectors are allocated from the same function stack. 
+ autovector& operator=(autovector&& other) = delete; + autovector(autovector&& other) = delete; + + // -- Iterator Operations + iterator begin() { + return iterator(this, 0); + } + + const_iterator begin() const { + return const_iterator(this, 0); + } + + iterator end() { + return iterator(this, this->size()); + } + + const_iterator end() const { + return const_iterator(this, this->size()); + } + + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { + return reverse_iterator(begin()); + } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +} // rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 0000000000..31ce4ed194 --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const size_t kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(i + 2 == pos - vec.begin()); + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + } +} + +vector GetTestKeys(size_t size) { + vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + to_string(index++); + } + return keys; +} + +template +void BenchmarkVectorCreationAndInsertion( + string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env = Env::Default(); + + int index = 0; + auto 
start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// o our use cases we used std::vector. +TEST(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. 
+ cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, string_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys + ); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, int_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess>( + "vector", kOps, elem_size + ); + BenchmarkSequenceAccess>( + "autovector", kOps, elem_size + ); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} From 079a21ba99cd83c7e94f631c0ba6e250e690bbf4 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 26 Dec 2013 15:12:30 -0800 Subject: 
[PATCH 36/40] Fix the unused variable warning message in mac os --- db/db_test.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 8cfdedd5e3..9615c89691 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2549,12 +2549,13 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter(int argv) { + assert(argv == 100); + } virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, bool* value_changed) const override { - assert(argv_ == 100); assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; @@ -2564,9 +2565,6 @@ class ChangeFilter : public CompactionFilter { virtual const char* Name() const override { return "ChangeFilter"; } - - private: - const int argv_; }; class KeepFilterFactory : public CompactionFilterFactory { From 113a08c9291a8a723458f4426312e9c5add61139 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 26 Dec 2013 15:47:07 -0800 Subject: [PATCH 37/40] Fix [-Werror=sign-compare] in autovector_test --- util/autovector_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 31ce4ed194..67fb67b054 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -147,7 +147,8 @@ TEST(AutoVectorTest, Iterators) { } pos = vec.begin(); - for (size_t i = 0; i < vec.size(); i += 2) { + typedef autovector::difference_type diff_type; + for (diff_type i = 0; i < vec.size(); i += 2) { // Cannot use ASSERT_EQ since that macro depends on iostream serialization ASSERT_TRUE(pos + 2 - 2 == pos); pos += 2; From b40c052bfa4e5ec1777f56cf83d572eb53e6d6d1 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 26 Dec 2013 15:56:20 -0800 Subject: [PATCH 38/40] Fix all the comparison issue in fb dev servers --- util/autovector.h | 2 +- util/autovector_test.cc | 15 
+++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/util/autovector.h b/util/autovector.h index 2b9cb40e9a..9998e29560 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 67fb67b054..6d709a374b 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -48,7 +48,7 @@ TEST(AutoVectorTest, PushBackAndPopBack) { } TEST(AutoVectorTest, EmplaceBack) { - typedef std::pair ValueType; + typedef std::pair ValueType; autovector vec; for (size_t i = 0; i < 1000 * kSize; ++i) { @@ -143,18 +143,19 @@ TEST(AutoVectorTest, Iterators) { // HACK: make sure -> works ASSERT_TRUE(!old->empty()); ASSERT_EQ(old_val, *old); - ASSERT_TRUE(old_val != *pos); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); } pos = vec.begin(); - typedef autovector::difference_type diff_type; - for (diff_type i = 0; i < vec.size(); i += 2) { + for (size_t i = 0; i < vec.size(); i += 2) { // Cannot use ASSERT_EQ since that macro depends on iostream serialization ASSERT_TRUE(pos + 2 - 2 == pos); pos += 2; - ASSERT_TRUE(i + 2 == pos - vec.begin()); ASSERT_TRUE(pos >= vec.begin()); ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); } } @@ -191,7 +192,7 @@ void BenchmarkVectorCreationAndInsertion( } template -void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { +size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { TVector v; for (const auto& item : GetTestKeys(elem_size)) { v.push_back(item); @@ -211,6 +212,8 @@ void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { cout << "performed " << ops << " sequence access against " << name << "\n\t" << "size: " << elem_size << "\n\t" << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; } // This 
test case only reports the performance between std::vector From 18df47b79aaee1bab0442a45caa9d73db8d6fa6f Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 26 Dec 2013 13:49:04 -0800 Subject: [PATCH 39/40] Avoid malloc in NotFound key status if no message is given. Summary: In some places we have NotFound status created with empty message, but it doesn't avoid a malloc. With this patch, the malloc is avoided for that case. The motivation of it is that I found in db_bench readrandom test when all keys are not existing, about 4% of the total running time is spent on malloc of Status, plus a similar amount of CPU spent on free of them, which is not necessary. Test Plan: make all check Reviewers: dhruba, haobo, igor Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14691 --- db/memtable.cc | 2 +- db/version_set.cc | 4 +- include/rocksdb/status.h | 31 ++++++++------ util/status.cc | 87 +++++++++++++++++++--------------------- 4 files changed, 64 insertions(+), 60 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index d2a51a125d..675a314ff5 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -225,7 +225,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::Corruption("Error: Could not perform merge."); } } else { - *s = Status::NotFound(Slice()); + *s = Status::NotFound(); } return true; } diff --git a/db/version_set.cc b/db/version_set.cc index 933affd180..46cdfaa61c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -545,7 +545,7 @@ void Version::Get(const ReadOptions& options, case kFound: return; case kDeleted: - *status = Status::NotFound(Slice()); // Use empty error message for speed + *status = Status::NotFound(); // Use empty error message for speed return; case kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); @@ -570,7 +570,7 @@ void Version::Get(const ReadOptions& options, user_key); } } else { - *status = Status::NotFound(Slice()); // Use an empty 
error message for speed + *status = Status::NotFound(); // Use an empty error message for speed } } diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b118e3db4b..e2304fdb67 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -25,7 +25,7 @@ namespace rocksdb { class Status { public: // Create a success status. - Status() : state_(nullptr) { } + Status() : code_(kOk), state_(nullptr) { } ~Status() { delete[] state_; } // Copy the specified status. @@ -39,6 +39,10 @@ class Status { static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); } + // Fast path for not found without malloc; + static Status NotFound() { + return Status(kNotFound); + } static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); } @@ -59,7 +63,7 @@ class Status { } // Returns true iff the status indicates success. - bool ok() const { return (state_ == nullptr); } + bool ok() const { return code() == kOk; } // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { return code() == kNotFound; } @@ -87,13 +91,6 @@ class Status { std::string ToString() const; private: - // OK status has a nullptr state_. Otherwise, state_ is a new[] array - // of the following form: - // state_[0..3] == length of message - // state_[4] == code - // state_[5..] == message - const char* state_; - enum Code { kOk = 0, kNotFound = 1, @@ -105,20 +102,30 @@ class Status { kIncomplete = 7 }; - Code code() const { - return (state_ == nullptr) ? kOk : static_cast(state_[4]); - } + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // of the following form: + // state_[0..3] == length of message + // state_[4..] 
== message + Code code_; + const char* state_; + Code code() const { + return code_; + } + explicit Status(Code code) : code_(code), state_(nullptr) { } Status(Code code, const Slice& msg, const Slice& msg2); static const char* CopyState(const char* s); }; inline Status::Status(const Status& s) { + code_ = s.code_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline void Status::operator=(const Status& s) { // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. + code_ = s.code_; if (state_ != s.state_) { delete[] state_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); diff --git a/util/status.cc b/util/status.cc index f7c40e9526..69060a7ccf 100644 --- a/util/status.cc +++ b/util/status.cc @@ -16,68 +16,65 @@ namespace rocksdb { const char* Status::CopyState(const char* state) { uint32_t size; memcpy(&size, state, sizeof(size)); - char* result = new char[size + 5]; - memcpy(result, state, size + 5); + char* result = new char[size + 4]; + memcpy(result, state, size + 4); return result; } -Status::Status(Code code, const Slice& msg, const Slice& msg2) { +Status::Status(Code code, const Slice& msg, const Slice& msg2) : + code_(code) { assert(code != kOk); const uint32_t len1 = msg.size(); const uint32_t len2 = msg2.size(); const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); - char* result = new char[size + 5]; + char* result = new char[size + 4]; memcpy(result, &size, sizeof(size)); - result[4] = static_cast(code); - memcpy(result + 5, msg.data(), len1); + memcpy(result + 4, msg.data(), len1); if (len2) { - result[5 + len1] = ':'; - result[6 + len1] = ' '; - memcpy(result + 7 + len1, msg2.data(), len2); + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); } state_ = result; } std::string Status::ToString() const { - if (state_ == nullptr) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (code()) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound: "; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - case kMergeInProgress: - type = "Merge In Progress: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(code())); - type = tmp; - break; - } - std::string result(type); + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge In Progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { uint32_t length; memcpy(&length, state_, sizeof(length)); - result.append(state_ + 5, length); - return result; + result.append(state_ + 4, length); } + return result; } } // namespace rocksdb From a094f3b3b5568f4af37c081c7cea4b77e747b2e3 Mon Sep 17 00:00:00 
2001 From: Siying Dong Date: Thu, 26 Dec 2013 16:25:45 -0800 Subject: [PATCH 40/40] TableCache.FindTable() to avoid the mem copy of file number Summary: I'm not sure what's the purpose of encoding file number to a new buffer for looking up the table cache. It seems to be unnecessary to me. With this patch, we point the lookup key to the address of the int64 of the file number. Test Plan: make all check Reviewers: dhruba, haobo, igor, kailiu Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14811 --- db/table_cache.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index e18c20c990..20eb68e4b8 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -29,6 +29,11 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } +static Slice GetSliceForFileNumber(uint64_t file_number) { + return Slice(reinterpret_cast(&file_number), + sizeof(file_number)); +} + TableCache::TableCache(const std::string& dbname, const Options* options, const EnvOptions& storage_options, @@ -50,9 +55,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, Cache::Handle** handle, bool* table_io, const bool no_io) { Status s; - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); + Slice key = GetSliceForFileNumber(file_number); *handle = cache_->Lookup(key); if (*handle == nullptr) { if (no_io) { // Dont do IO and return a not-found status @@ -165,9 +168,7 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options, } void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); + cache_->Erase(GetSliceForFileNumber(file_number)); } } // namespace rocksdb