diff --git a/Makefile b/Makefile index 826e1cd601..21cd4cfd26 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,8 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ + autovector_test \ + db_test \ table_properties_collector_test \ arena_test \ auto_roll_logger_test \ @@ -74,12 +76,12 @@ TESTS = \ skiplist_test \ stringappend_test \ ttl_test \ + backupable_db_test \ version_edit_test \ version_set_test \ write_batch_test\ deletefile_test \ - table_test \ - db_test + table_test TOOLS = \ sst_dump \ @@ -125,7 +127,7 @@ $(SHARED2): $(SHARED3) endif $(SHARED3): - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) endif # PLATFORM_SHARED_EXT @@ -145,8 +147,9 @@ coverage: # Delete intermediate files find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests +check: all $(PROGRAMS) $(TESTS) $(TOOLS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py ldb_tests: all $(PROGRAMS) $(TOOLS) python tools/ldb_test.py @@ -223,6 +226,9 @@ signal_test: util/signal_test.o $(LIBOBJECTS) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -277,6 +283,9 @@ perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/README b/README index c55149d443..473e4145b5 100644 --- a/README +++ b/README @@ -79,4 +79,4 @@ include/rocksdb/statistics.h include/rocksdb/transaction_log.h An API to retrieve transaction logs from a database. 
- +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 59e2e46195..96a1fb3319 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -189,6 +189,18 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" fi + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + # Test whether Snappy library is installed # http://code.google.com/p/snappy/ $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < rep; }; -struct leveldb_cache_t { shared_ptr rep; }; +struct rocksdb_t { DB* rep; }; +struct rocksdb_iterator_t { Iterator* rep; }; +struct rocksdb_writebatch_t { WriteBatch rep; }; +struct rocksdb_snapshot_t { const Snapshot* rep; }; +struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_writeoptions_t { WriteOptions rep; }; +struct rocksdb_options_t { Options rep; }; +struct rocksdb_seqfile_t { SequentialFile* rep; }; +struct rocksdb_randomfile_t { RandomAccessFile* rep; }; +struct rocksdb_writablefile_t { WritableFile* rep; }; +struct rocksdb_filelock_t { FileLock* rep; }; +struct rocksdb_logger_t { shared_ptr rep; }; +struct rocksdb_cache_t { shared_ptr rep; }; -struct leveldb_comparator_t : public Comparator { +struct rocksdb_comparator_t : public Comparator { void* state_; void (*destructor_)(void*); int (*compare_)( @@ -71,7 +73,7 @@ struct leveldb_comparator_t : public Comparator { const char* b, size_t blen); const char* (*name_)(void*); - virtual ~leveldb_comparator_t() { + virtual ~rocksdb_comparator_t() { (*destructor_)(state_); } @@ -88,7 +90,7 @@ struct leveldb_comparator_t : public Comparator { virtual void FindShortSuccessor(std::string* key) const { } }; -struct leveldb_filterpolicy_t : 
public FilterPolicy { +struct rocksdb_filterpolicy_t : public FilterPolicy { void* state_; void (*destructor_)(void*); const char* (*name_)(void*); @@ -102,7 +104,7 @@ struct leveldb_filterpolicy_t : public FilterPolicy { const char* key, size_t length, const char* filter, size_t filter_length); - virtual ~leveldb_filterpolicy_t() { + virtual ~rocksdb_filterpolicy_t() { (*destructor_)(state_); } @@ -129,11 +131,16 @@ struct leveldb_filterpolicy_t : public FilterPolicy { } }; -struct leveldb_env_t { +struct rocksdb_env_t { Env* rep; bool is_default; }; +struct rocksdb_universal_compaction_options_t { + rocksdb::CompactionOptionsUniversal *rep; +}; + + static bool SaveError(char** errptr, const Status& s) { assert(errptr != NULL); if (s.ok()) { @@ -154,27 +161,27 @@ static char* CopyString(const std::string& str) { return result; } -leveldb_t* leveldb_open( - const leveldb_options_t* options, +rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr) { DB* db; if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { return NULL; } - leveldb_t* result = new leveldb_t; + rocksdb_t* result = new rocksdb_t; result->rep = db; return result; } -void leveldb_close(leveldb_t* db) { +void rocksdb_close(rocksdb_t* db) { delete db->rep; delete db; } -void leveldb_put( - leveldb_t* db, - const leveldb_writeoptions_t* options, +void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, const char* val, size_t vallen, char** errptr) { @@ -182,26 +189,26 @@ void leveldb_put( db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); } -void leveldb_delete( - leveldb_t* db, - const leveldb_writeoptions_t* options, +void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, char** errptr) { SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); } -void leveldb_write( - leveldb_t* db, - const 
leveldb_writeoptions_t* options, - leveldb_writebatch_t* batch, +void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); } -char* leveldb_get( - leveldb_t* db, - const leveldb_readoptions_t* options, +char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr) { @@ -220,30 +227,30 @@ char* leveldb_get( return result; } -leveldb_iterator_t* leveldb_create_iterator( - leveldb_t* db, - const leveldb_readoptions_t* options) { - leveldb_iterator_t* result = new leveldb_iterator_t; +rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; result->rep = db->rep->NewIterator(options->rep); return result; } -const leveldb_snapshot_t* leveldb_create_snapshot( - leveldb_t* db) { - leveldb_snapshot_t* result = new leveldb_snapshot_t; +const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; result->rep = db->rep->GetSnapshot(); return result; } -void leveldb_release_snapshot( - leveldb_t* db, - const leveldb_snapshot_t* snapshot) { +void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { db->rep->ReleaseSnapshot(snapshot->rep); delete snapshot; } -char* leveldb_property_value( - leveldb_t* db, +char* rocksdb_property_value( + rocksdb_t* db, const char* propname) { std::string tmp; if (db->rep->GetProperty(Slice(propname), &tmp)) { @@ -254,8 +261,8 @@ char* leveldb_property_value( } } -void leveldb_approximate_sizes( - leveldb_t* db, +void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, const size_t* range_limit_key_len, @@ -269,8 +276,8 @@ void 
leveldb_approximate_sizes( delete[] ranges; } -void leveldb_compact_range( - leveldb_t* db, +void rocksdb_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, const char* limit_key, size_t limit_key_len) { Slice a, b; @@ -280,92 +287,92 @@ void leveldb_compact_range( (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL)); } -void leveldb_destroy_db( - const leveldb_options_t* options, +void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr) { SaveError(errptr, DestroyDB(name, options->rep)); } -void leveldb_repair_db( - const leveldb_options_t* options, +void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr) { SaveError(errptr, RepairDB(name, options->rep)); } -void leveldb_iter_destroy(leveldb_iterator_t* iter) { +void rocksdb_iter_destroy(rocksdb_iterator_t* iter) { delete iter->rep; delete iter; } -unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) { +unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) { return iter->rep->Valid(); } -void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) { +void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) { iter->rep->SeekToFirst(); } -void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) { +void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) { iter->rep->SeekToLast(); } -void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) { +void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) { iter->rep->Seek(Slice(k, klen)); } -void leveldb_iter_next(leveldb_iterator_t* iter) { +void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); } -void leveldb_iter_prev(leveldb_iterator_t* iter) { +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); } -const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) { +const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { Slice s = 
iter->rep->key(); *klen = s.size(); return s.data(); } -const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) { +const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) { Slice s = iter->rep->value(); *vlen = s.size(); return s.data(); } -void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { +void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) { SaveError(errptr, iter->rep->status()); } -leveldb_writebatch_t* leveldb_writebatch_create() { - return new leveldb_writebatch_t; +rocksdb_writebatch_t* rocksdb_writebatch_create() { + return new rocksdb_writebatch_t; } -void leveldb_writebatch_destroy(leveldb_writebatch_t* b) { +void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; } -void leveldb_writebatch_clear(leveldb_writebatch_t* b) { +void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); } -void leveldb_writebatch_put( - leveldb_writebatch_t* b, +void rocksdb_writebatch_put( + rocksdb_writebatch_t* b, const char* key, size_t klen, const char* val, size_t vlen) { b->rep.Put(Slice(key, klen), Slice(val, vlen)); } -void leveldb_writebatch_delete( - leveldb_writebatch_t* b, +void rocksdb_writebatch_delete( + rocksdb_writebatch_t* b, const char* key, size_t klen) { b->rep.Delete(Slice(key, klen)); } -void leveldb_writebatch_iterate( - leveldb_writebatch_t* b, +void rocksdb_writebatch_iterate( + rocksdb_writebatch_t* b, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)) { @@ -388,132 +395,132 @@ void leveldb_writebatch_iterate( b->rep.Iterate(&handler); } -leveldb_options_t* leveldb_options_create() { - return new leveldb_options_t; +rocksdb_options_t* rocksdb_options_create() { + return new rocksdb_options_t; } -void leveldb_options_destroy(leveldb_options_t* options) { +void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; } -void 
leveldb_options_set_comparator( - leveldb_options_t* opt, - leveldb_comparator_t* cmp) { +void rocksdb_options_set_comparator( + rocksdb_options_t* opt, + rocksdb_comparator_t* cmp) { opt->rep.comparator = cmp; } -void leveldb_options_set_filter_policy( - leveldb_options_t* opt, - leveldb_filterpolicy_t* policy) { +void rocksdb_options_set_filter_policy( + rocksdb_options_t* opt, + rocksdb_filterpolicy_t* policy) { opt->rep.filter_policy = policy; } -void leveldb_options_set_create_if_missing( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_create_if_missing( + rocksdb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; } -void leveldb_options_set_error_if_exists( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_error_if_exists( + rocksdb_options_t* opt, unsigned char v) { opt->rep.error_if_exists = v; } -void leveldb_options_set_paranoid_checks( - leveldb_options_t* opt, unsigned char v) { +void rocksdb_options_set_paranoid_checks( + rocksdb_options_t* opt, unsigned char v) { opt->rep.paranoid_checks = v; } -void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { +void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? 
env->rep : NULL); } -void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) { +void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { if (l) { opt->rep.info_log = l->rep; } } -void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) { +void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } -void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } -void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) { +void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) { if (c) { opt->rep.block_cache = c->rep; } } -void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) { +void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) { opt->rep.block_size = s; } -void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) { +void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) { opt->rep.block_restart_interval = n; } -void leveldb_options_set_target_file_size_base( - leveldb_options_t* opt, uint64_t n) { +void rocksdb_options_set_target_file_size_base( + rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; } -void leveldb_options_set_target_file_size_multiplier( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t* opt, int n) { opt->rep.target_file_size_multiplier = n; } -void leveldb_options_set_max_bytes_for_level_base( - leveldb_options_t* opt, uint64_t n) { +void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t* opt, uint64_t n) { opt->rep.max_bytes_for_level_base = n; } -void leveldb_options_set_max_bytes_for_level_multiplier( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_bytes_for_level_multiplier( + 
rocksdb_options_t* opt, int n) { opt->rep.max_bytes_for_level_multiplier = n; } -void leveldb_options_set_expanded_compaction_factor( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_expanded_compaction_factor( + rocksdb_options_t* opt, int n) { opt->rep.expanded_compaction_factor = n; } -void leveldb_options_set_max_grandparent_overlap_factor( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_grandparent_overlap_factor( + rocksdb_options_t* opt, int n) { opt->rep.max_grandparent_overlap_factor = n; } -void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) { +void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } -void leveldb_options_set_level0_file_num_compaction_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_file_num_compaction_trigger = n; } -void leveldb_options_set_level0_slowdown_writes_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_slowdown_writes_trigger = n; } -void leveldb_options_set_level0_stop_writes_trigger( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t* opt, int n) { opt->rep.level0_stop_writes_trigger = n; } -void leveldb_options_set_max_mem_compaction_level( - leveldb_options_t* opt, int n) { +void rocksdb_options_set_max_mem_compaction_level( + rocksdb_options_t* opt, int n) { opt->rep.max_mem_compaction_level = n; } -void leveldb_options_set_compression(leveldb_options_t* opt, int t) { +void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { opt->rep.compression = static_cast(t); } -void leveldb_options_set_compression_per_level(leveldb_options_t* opt, +void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, int* level_values, size_t num_levels) { 
opt->rep.compression_per_level.resize(num_levels); @@ -523,43 +530,132 @@ void leveldb_options_set_compression_per_level(leveldb_options_t* opt, } } -void leveldb_options_set_compression_options( - leveldb_options_t* opt, int w_bits, int level, int strategy) { +void rocksdb_options_set_compression_options( + rocksdb_options_t* opt, int w_bits, int level, int strategy) { opt->rep.compression_opts.window_bits = w_bits; opt->rep.compression_opts.level = level; opt->rep.compression_opts.strategy = strategy; } -void leveldb_options_set_disable_data_sync( - leveldb_options_t* opt, bool disable_data_sync) { +void rocksdb_options_set_disable_data_sync( + rocksdb_options_t* opt, int disable_data_sync) { opt->rep.disableDataSync = disable_data_sync; } -void leveldb_options_set_use_fsync( - leveldb_options_t* opt, bool use_fsync) { +void rocksdb_options_set_use_fsync( + rocksdb_options_t* opt, int use_fsync) { opt->rep.use_fsync = use_fsync; } -void leveldb_options_set_db_stats_log_interval( - leveldb_options_t* opt, int db_stats_log_interval) { +void rocksdb_options_set_db_stats_log_interval( + rocksdb_options_t* opt, int db_stats_log_interval) { opt->rep.db_stats_log_interval = db_stats_log_interval; } -void leveldb_options_set_db_log_dir( - leveldb_options_t* opt, const char* db_log_dir) { +void rocksdb_options_set_db_log_dir( + rocksdb_options_t* opt, const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; } -void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) { +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { opt->rep.WAL_ttl_seconds = ttl; } -void leveldb_options_set_WAL_size_limit_MB( - leveldb_options_t* opt, uint64_t limit) { +void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t* opt, uint64_t limit) { opt->rep.WAL_size_limit_MB = limit; } -leveldb_comparator_t* leveldb_comparator_create( +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { + 
opt->rep.max_write_buffer_number = n; +} + +void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { + opt->rep.min_write_buffer_number_to_merge = n; +} + +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { + opt->rep.max_background_compactions = n; +} + +void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { + opt->rep.max_background_flushes = n; +} + +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { + opt->rep.disable_auto_compactions = disable; +} + +void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) { + opt->rep.disable_seek_compaction = disable; +} + +void rocksdb_options_set_source_compaction_factor( + rocksdb_options_t* opt, int n) { + opt->rep.expanded_compaction_factor = n; +} + +void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { + opt->rep.PrepareForBulkLoad(); +} + +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { + static rocksdb::VectorRepFactory* factory = 0; + if (!factory) { + factory = new rocksdb::VectorRepFactory; + } + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { + opt->rep.compaction_style = static_cast(style); +} + +void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { + opt->rep.compaction_options_universal = *(uco->rep); +} + +/* +TODO: +merge_operator +compaction_filter +prefix_extractor +whole_key_filtering +max_bytes_for_level_multiplier_additional +delete_obsolete_files_period_micros +max_log_file_size +log_file_time_to_roll +keep_log_file_num +soft_rate_limit +hard_rate_limit +rate_limit_delay_max_milliseconds +max_manifest_file_size +no_block_cache +table_cache_numshardbits +table_cache_remove_scan_count_limit +arena_block_size +manifest_preallocation_size 
+purge_redundant_kvs_while_flush +allow_os_buffer +allow_mmap_reads +allow_mmap_writes +is_fd_close_on_exec +skip_log_error_on_recovery +stats_dump_period_sec +block_size_deviation +advise_random_on_open +access_hint_on_compaction_start +use_adaptive_mutex +bytes_per_sync +filter_deletes +max_sequential_skip_in_iterations +table_factory +table_properties_collectors +inplace_update_support +inplace_update_num_locks +*/ + +rocksdb_comparator_t* rocksdb_comparator_create( void* state, void (*destructor)(void*), int (*compare)( @@ -567,7 +663,7 @@ leveldb_comparator_t* leveldb_comparator_create( const char* a, size_t alen, const char* b, size_t blen), const char* (*name)(void*)) { - leveldb_comparator_t* result = new leveldb_comparator_t; + rocksdb_comparator_t* result = new rocksdb_comparator_t; result->state_ = state; result->destructor_ = destructor; result->compare_ = compare; @@ -575,11 +671,11 @@ leveldb_comparator_t* leveldb_comparator_create( return result; } -void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; } -leveldb_filterpolicy_t* leveldb_filterpolicy_create( +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( void* state, void (*destructor)(void*), char* (*create_filter)( @@ -592,7 +688,7 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create( const char* key, size_t length, const char* filter, size_t filter_length), const char* (*name)(void*)) { - leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t; + rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t; result->state_ = state; result->destructor_ = destructor; result->create_ = create_filter; @@ -601,15 +697,15 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create( return result; } -void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) { +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { delete filter; } -leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int 
bits_per_key) { - // Make a leveldb_filterpolicy_t, but override all of its methods so +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so // they delegate to a NewBloomFilterPolicy() instead of user // supplied C functions. - struct Wrapper : public leveldb_filterpolicy_t { + struct Wrapper : public rocksdb_filterpolicy_t { const FilterPolicy* rep_; ~Wrapper() { delete rep_; } const char* Name() const { return rep_->Name(); } @@ -628,64 +724,115 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) { return wrapper; } -leveldb_readoptions_t* leveldb_readoptions_create() { - return new leveldb_readoptions_t; +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; } -void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) { +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; } -void leveldb_readoptions_set_verify_checksums( - leveldb_readoptions_t* opt, +void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.verify_checksums = v; } -void leveldb_readoptions_set_fill_cache( - leveldb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.fill_cache = v; } -void leveldb_readoptions_set_snapshot( - leveldb_readoptions_t* opt, - const leveldb_snapshot_t* snap) { +void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { opt->rep.snapshot = (snap ? 
snap->rep : NULL); } -leveldb_writeoptions_t* leveldb_writeoptions_create() { - return new leveldb_writeoptions_t; +rocksdb_writeoptions_t* rocksdb_writeoptions_create() { + return new rocksdb_writeoptions_t; } -void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) { +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; } -void leveldb_writeoptions_set_sync( - leveldb_writeoptions_t* opt, unsigned char v) { +void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.sync = v; } -leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) { - leveldb_cache_t* c = new leveldb_cache_t; +void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { + opt->rep.disableWAL = disable; +} + + +rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } -void leveldb_cache_destroy(leveldb_cache_t* cache) { +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } -leveldb_env_t* leveldb_create_default_env() { - leveldb_env_t* result = new leveldb_env_t; +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; result->rep = Env::Default(); result->is_default = true; return result; } -void leveldb_env_destroy(leveldb_env_t* env) { +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + +void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; } +rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t; + result->rep = new rocksdb::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +void 
rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = static_cast(style); +} + +void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t* uco) { + delete uco->rep; + delete uco; +} + } // end extern "C" diff --git a/db/c_test.c b/db/c_test.c index abbe1ddd5f..8c5e8e5348 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -62,30 +62,30 @@ static void Free(char** ptr) { } static void CheckGet( - leveldb_t* db, - const leveldb_readoptions_t* options, + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, const char* expected) { char* err = NULL; size_t val_len; char* val; - val = leveldb_get(db, options, key, strlen(key), &val_len, &err); + val = rocksdb_get(db, options, key, strlen(key), &val_len, &err); CheckNoError(err); CheckEqual(expected, val, val_len); Free(&val); } -static void CheckIter(leveldb_iterator_t* iter, +static void CheckIter(rocksdb_iterator_t* iter, const char* key, const char* val) { size_t len; const char* str; - str = leveldb_iter_key(iter, &len); + str = rocksdb_iter_key(iter, &len); CheckEqual(key, str, len); - str = leveldb_iter_value(iter, &len); + str = rocksdb_iter_value(iter, &len); CheckEqual(val, str, len); } -// Callback from 
leveldb_writebatch_iterate() +// Callback from rocksdb_writebatch_iterate() static void CheckPut(void* ptr, const char* k, size_t klen, const char* v, size_t vlen) { @@ -104,7 +104,7 @@ static void CheckPut(void* ptr, (*state)++; } -// Callback from leveldb_writebatch_iterate() +// Callback from rocksdb_writebatch_iterate() static void CheckDel(void* ptr, const char* k, size_t klen) { int* state = (int*) ptr; CheckCondition(*state == 2); @@ -155,117 +155,117 @@ unsigned char FilterKeyMatch( } int main(int argc, char** argv) { - leveldb_t* db; - leveldb_comparator_t* cmp; - leveldb_cache_t* cache; - leveldb_env_t* env; - leveldb_options_t* options; - leveldb_readoptions_t* roptions; - leveldb_writeoptions_t* woptions; + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_readoptions_t* roptions; + rocksdb_writeoptions_t* woptions; char* err = NULL; int run = -1; snprintf(dbname, sizeof(dbname), - "%s/leveldb_c_test-%d", + "%s/rocksdb_c_test-%d", GetTempDir(), ((int) geteuid())); StartPhase("create_objects"); - cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); - env = leveldb_create_default_env(); - cache = leveldb_cache_create_lru(100000); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + env = rocksdb_create_default_env(); + cache = rocksdb_cache_create_lru(100000); - options = leveldb_options_create(); - leveldb_options_set_comparator(options, cmp); - leveldb_options_set_error_if_exists(options, 1); - leveldb_options_set_cache(options, cache); - leveldb_options_set_env(options, env); - leveldb_options_set_info_log(options, NULL); - leveldb_options_set_write_buffer_size(options, 100000); - leveldb_options_set_paranoid_checks(options, 1); - leveldb_options_set_max_open_files(options, 10); - leveldb_options_set_block_size(options, 1024); - leveldb_options_set_block_restart_interval(options, 8); - leveldb_options_set_compression(options, 
leveldb_no_compression); - leveldb_options_set_compression_options(options, -14, -1, 0); - int compression_levels[] = {leveldb_no_compression, leveldb_no_compression, - leveldb_no_compression, leveldb_no_compression}; - leveldb_options_set_compression_per_level(options, compression_levels, 4); + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_cache(options, cache); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + rocksdb_options_set_block_size(options, 1024); + rocksdb_options_set_block_restart_interval(options, 8); + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); - roptions = leveldb_readoptions_create(); - leveldb_readoptions_set_verify_checksums(roptions, 1); - leveldb_readoptions_set_fill_cache(roptions, 0); + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 0); - woptions = leveldb_writeoptions_create(); - leveldb_writeoptions_set_sync(woptions, 1); + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); StartPhase("destroy"); - leveldb_destroy_db(options, dbname, &err); + rocksdb_destroy_db(options, dbname, &err); Free(&err); StartPhase("open_error"); - db = leveldb_open(options, dbname, &err); + db = rocksdb_open(options, dbname, &err); CheckCondition(err != NULL); Free(&err); StartPhase("open"); - 
leveldb_options_set_create_if_missing(options, 1); - db = leveldb_open(options, dbname, &err); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); CheckGet(db, roptions, "foo", NULL); StartPhase("put"); - leveldb_put(db, woptions, "foo", 3, "hello", 5, &err); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); StartPhase("compactall"); - leveldb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_compact_range(db, NULL, 0, NULL, 0); CheckGet(db, roptions, "foo", "hello"); StartPhase("compactrange"); - leveldb_compact_range(db, "a", 1, "z", 1); + rocksdb_compact_range(db, "a", 1, "z", 1); CheckGet(db, roptions, "foo", "hello"); StartPhase("writebatch"); { - leveldb_writebatch_t* wb = leveldb_writebatch_create(); - leveldb_writebatch_put(wb, "foo", 3, "a", 1); - leveldb_writebatch_clear(wb); - leveldb_writebatch_put(wb, "bar", 3, "b", 1); - leveldb_writebatch_put(wb, "box", 3, "c", 1); - leveldb_writebatch_delete(wb, "bar", 3); - leveldb_write(db, woptions, wb, &err); + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "box", "c"); int pos = 0; - leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); CheckCondition(pos == 3); - leveldb_writebatch_destroy(wb); + rocksdb_writebatch_destroy(wb); } StartPhase("iter"); { - leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_first(iter); - CheckCondition(leveldb_iter_valid(iter)); + 
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); CheckIter(iter, "box", "c"); - leveldb_iter_next(iter); + rocksdb_iter_next(iter); CheckIter(iter, "foo", "hello"); - leveldb_iter_prev(iter); + rocksdb_iter_prev(iter); CheckIter(iter, "box", "c"); - leveldb_iter_prev(iter); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_last(iter); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); CheckIter(iter, "foo", "hello"); - leveldb_iter_seek(iter, "b", 1); + rocksdb_iter_seek(iter, "b", 1); CheckIter(iter, "box", "c"); - leveldb_iter_get_error(iter, &err); + rocksdb_iter_get_error(iter, &err); CheckNoError(err); - leveldb_iter_destroy(iter); + rocksdb_iter_destroy(iter); } StartPhase("approximate_sizes"); @@ -279,39 +279,39 @@ int main(int argc, char** argv) { size_t start_len[2] = { 1, 21 }; const char* limit[2] = { "k00000000000000010000", "z" }; size_t limit_len[2] = { 21, 1 }; - leveldb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_set_sync(woptions, 0); for (i = 0; i < n; i++) { snprintf(keybuf, sizeof(keybuf), "k%020d", i); snprintf(valbuf, sizeof(valbuf), "v%020d", i); - leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), &err); CheckNoError(err); } - leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } StartPhase("property"); { - char* prop = leveldb_property_value(db, "nosuchprop"); + char* prop = rocksdb_property_value(db, "nosuchprop"); CheckCondition(prop == NULL); - prop = leveldb_property_value(db, "rocksdb.stats"); + prop = rocksdb_property_value(db, "rocksdb.stats"); 
CheckCondition(prop != NULL); Free(&prop); } StartPhase("snapshot"); { - const leveldb_snapshot_t* snap; - snap = leveldb_create_snapshot(db); - leveldb_delete(db, woptions, "foo", 3, &err); + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); CheckNoError(err); - leveldb_readoptions_set_snapshot(roptions, snap); + rocksdb_readoptions_set_snapshot(roptions, snap); CheckGet(db, roptions, "foo", "hello"); - leveldb_readoptions_set_snapshot(roptions, NULL); + rocksdb_readoptions_set_snapshot(roptions, NULL); CheckGet(db, roptions, "foo", NULL); - leveldb_release_snapshot(db, snap); + rocksdb_release_snapshot(db, snap); } StartPhase("repair"); @@ -320,44 +320,44 @@ int main(int argc, char** argv) { // files (https://reviews.facebook.net/D6123) would leave // around deleted files and the repair process will find // those files and put them back into the database. - leveldb_compact_range(db, NULL, 0, NULL, 0); - leveldb_close(db); - leveldb_options_set_create_if_missing(options, 0); - leveldb_options_set_error_if_exists(options, 0); - leveldb_repair_db(options, dbname, &err); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_close(db); + rocksdb_options_set_create_if_missing(options, 0); + rocksdb_options_set_error_if_exists(options, 0); + rocksdb_repair_db(options, dbname, &err); CheckNoError(err); - db = leveldb_open(options, dbname, &err); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); CheckGet(db, roptions, "foo", NULL); CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "box", "c"); - leveldb_options_set_create_if_missing(options, 1); - leveldb_options_set_error_if_exists(options, 1); + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_error_if_exists(options, 1); } StartPhase("filter"); for (run = 0; run < 2; run++) { // First run uses custom filter, second run uses bloom filter CheckNoError(err); - leveldb_filterpolicy_t* policy; + 
rocksdb_filterpolicy_t* policy; if (run == 0) { - policy = leveldb_filterpolicy_create( + policy = rocksdb_filterpolicy_create( NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName); } else { - policy = leveldb_filterpolicy_create_bloom(10); + policy = rocksdb_filterpolicy_create_bloom(10); } // Create new database - leveldb_close(db); - leveldb_destroy_db(options, dbname, &err); - leveldb_options_set_filter_policy(options, policy); - db = leveldb_open(options, dbname, &err); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_filter_policy(options, policy); + db = rocksdb_open(options, dbname, &err); CheckNoError(err); - leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); CheckNoError(err); - leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); CheckNoError(err); - leveldb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_compact_range(db, NULL, 0, NULL, 0); fake_filter_result = 1; CheckGet(db, roptions, "foo", "foovalue"); @@ -372,18 +372,18 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "foo", "foovalue"); CheckGet(db, roptions, "bar", "barvalue"); } - leveldb_options_set_filter_policy(options, NULL); - leveldb_filterpolicy_destroy(policy); + rocksdb_options_set_filter_policy(options, NULL); + rocksdb_filterpolicy_destroy(policy); } StartPhase("cleanup"); - leveldb_close(db); - leveldb_options_destroy(options); - leveldb_readoptions_destroy(roptions); - leveldb_writeoptions_destroy(woptions); - leveldb_cache_destroy(cache); - leveldb_comparator_destroy(cmp); - leveldb_env_destroy(env); + rocksdb_close(db); + rocksdb_options_destroy(options); + rocksdb_readoptions_destroy(roptions); + rocksdb_writeoptions_destroy(woptions); + rocksdb_cache_destroy(cache); + rocksdb_comparator_destroy(cmp); + rocksdb_env_destroy(env); fprintf(stderr, "PASS\n"); return 0; diff --git a/db/db_bench.cc 
b/db/db_bench.cc index 16b6643c2a..de57326a6e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -49,6 +49,7 @@ DEFINE_string(benchmarks, "compact," "readrandom," "readseq," + "readtocache," "readreverse," "readwhilewriting," "readrandomwriterandom," @@ -76,6 +77,7 @@ DEFINE_string(benchmarks, "\tdeleteseq -- delete N keys in sequential order\n" "\tdeleterandom -- delete N keys in random order\n" "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" "\treadreverse -- read N times in reverse order\n" "\treadrandom -- read N times in random order\n" "\treadmissing -- read N missing keys in random order\n" @@ -150,7 +152,7 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" DEFINE_bool(histogram, false, "Print histogram of operation timings"); -DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, +DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); DEFINE_int32(max_write_buffer_number, @@ -1062,6 +1064,10 @@ class Benchmark { method = &Benchmark::WriteRandom; } else if (name == Slice("readseq")) { method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; } else if (name == Slice("readreverse")) { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { diff --git a/db/db_impl.cc b/db/db_impl.cc index 361a8c37a4..d3effae864 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -241,6 +241,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) mem_(new MemTable(internal_comparator_, mem_rep_factory_, NumberLevels(), options_)), logfile_number_(0), + super_version_(nullptr), tmp_batch_(), bg_compaction_scheduled_(0), bg_flush_scheduled_(0), @@ -316,6 +317,13 @@ DBImpl::~DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool 
is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } mutex_.Unlock(); if (db_lock_ != nullptr) { @@ -345,6 +353,13 @@ void DBImpl::TEST_Destroy_DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } // Prevent new compactions from occuring. bg_work_gate_closed_ = true; @@ -443,6 +458,49 @@ void DBImpl::MaybeDumpStats() { } } +// DBImpl::SuperVersion methods +DBImpl::SuperVersion::SuperVersion(const int num_memtables) { + to_delete.resize(num_memtables); +} + +DBImpl::SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool DBImpl::SuperVersion::Unref() { + assert(refs > 0); + // fetch_sub returns the previous value of ref + return refs.fetch_sub(1, std::memory_order_relaxed) == 1; +} + +void DBImpl::SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm.UnrefAll(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current) { + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm.RefAll(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + // Returns the list of live files in 'sst_live' and the list // of all files in the filesystem in 'all_files'. 
// no_full_scan = true -- never do the full scan using GetChildren() @@ -518,11 +576,6 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // It is not necessary to hold the mutex when invoking this method. void DBImpl::PurgeObsoleteFiles(DeletionState& state) { - // free pending memtables - for (auto m : state.memtables_to_free) { - delete m; - } - // check if there is anything to do if (!state.all_files.size() && !state.sst_delete_files.size() && @@ -1041,6 +1094,7 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { stats.bytes_written = meta.file_size; stats.files_out_levelnp1 = 1; stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } @@ -1129,6 +1183,7 @@ Status DBImpl::WriteLevel0Table(std::vector &mems, VersionEdit* edit, stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; stats_[level].Add(stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } @@ -1186,6 +1241,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, file_number, pending_outputs_, &deletion_state.memtables_to_free); if (s.ok()) { + InstallSuperVersion(deletion_state); if (madeProgress) { *madeProgress = 1; } @@ -1245,11 +1301,17 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) { void DBImpl::ReFitLevel(int level, int target_level) { assert(level < NumberLevels()); - MutexLock l(&mutex_); + SuperVersion* superversion_to_free = nullptr; + SuperVersion* new_superversion = + new SuperVersion(options_.max_write_buffer_number); + + mutex_.Lock(); // only allow one thread refitting if (refitting_level_) { + mutex_.Unlock(); Log(options_.info_log, "ReFitLevel: another thread is refitting"); + delete new_superversion; return; } refitting_level_ = true; @@ -1285,6 +1347,8 @@ void DBImpl::ReFitLevel(int level, int target_level) { edit.DebugString().data()); auto status = versions_->LogAndApply(&edit, 
&mutex_); + superversion_to_free = InstallSuperVersion(new_superversion); + new_superversion = nullptr; Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); @@ -1296,6 +1360,10 @@ void DBImpl::ReFitLevel(int level, int target_level) { refitting_level_ = false; bg_work_gate_closed_ = false; + + mutex_.Unlock(); + delete superversion_to_free; + delete new_superversion; } int DBImpl::NumberLevels() { @@ -1311,8 +1379,7 @@ int DBImpl::Level0StopWriteTrigger() { } Status DBImpl::Flush(const FlushOptions& options) { - Status status = FlushMemTable(options); - return status; + return FlushMemTable(options); } SequenceNumber DBImpl::GetLatestSequenceNumber() const { @@ -1669,7 +1736,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1715,7 +1782,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); MaybeDumpStats(); @@ -1768,7 +1835,7 @@ void DBImpl::BackgroundCallCompaction() { } Status DBImpl::BackgroundCompaction(bool* madeProgress, - DeletionState& deletion_state) { + DeletionState& deletion_state) { *madeProgress = false; mutex_.AssertHeld(); @@ -1821,6 +1888,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); + InstallSuperVersion(deletion_state); VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), @@ -2454,14 +2522,22 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } 
stats.files_out_levelnp1 = num_output_files; - for (int i = 0; i < compact->compaction->num_input_files(0); i++) + for (int i = 0; i < compact->compaction->num_input_files(0); i++) { stats.bytes_readn += compact->compaction->input(0, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(0, i)->file_size); + } - for (int i = 0; i < compact->compaction->num_input_files(1); i++) + for (int i = 0; i < compact->compaction->num_input_files(1); i++) { stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(1, i)->file_size); + } for (int i = 0; i < num_output_files; i++) { stats.bytes_written += compact->outputs[i].file_size; + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + compact->outputs[i].file_size); } LogFlush(options_.info_log); @@ -2474,6 +2550,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { status = InstallCompactionResults(compact); + InstallSuperVersion(deletion_state); } VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, @@ -2581,6 +2658,44 @@ Status DBImpl::Get(const ReadOptions& options, return GetImpl(options, key, value); } +// DeletionState gets created and destructed outside of the lock -- we +// use this conveniently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete one SuperVersion() outside of the lock -- superversion_to_free +// +// However, if InstallSuperVersion() gets called twice with the same +// deletion_state, we can't reuse the SuperVersion() that got malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. 
We do similar thing +// for superversion_to_free +void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { + // if new_superversion == nullptr, it means somebody already used it + SuperVersion* new_superversion = + (deletion_state.new_superversion != nullptr) ? + deletion_state.new_superversion : new SuperVersion(); + SuperVersion* old_superversion = InstallSuperVersion(new_superversion); + deletion_state.new_superversion = nullptr; + if (deletion_state.superversion_to_free != nullptr) { + // somebody already put it there + delete old_superversion; + } else { + deletion_state.superversion_to_free = old_superversion; + } +} + +DBImpl::SuperVersion* DBImpl::InstallSuperVersion( + SuperVersion* new_superversion) { + mutex_.AssertHeld(); + new_superversion->Init(mem_, imm_, versions_->current()); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + if (old_superversion != nullptr && old_superversion->Unref()) { + old_superversion->Cleanup(); + return old_superversion; // will let caller delete outside of mutex + } + return nullptr; +} + Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, @@ -2591,28 +2706,21 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatchNano snapshot_timer(env_, false); StartPerfTimer(&snapshot_timer); SequenceNumber snapshot; - std::vector to_delete; - to_delete.reserve(options_.max_write_buffer_number); - mutex_.Lock(); + if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } - MemTable* mem = mem_; - MemTableList imm = imm_; - Version* current = versions_->current(); - mem->Ref(); - imm.RefAll(); - current->Ref(); - - // Unlock while reading from files and memtables + // This can be replaced by using atomics and spinlock instead of big mutex + mutex_.Lock(); + SuperVersion* get_version = super_version_->Ref(); mutex_.Unlock(); + bool have_stat_update = false; 
Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; @@ -2621,18 +2729,18 @@ Status DBImpl::GetImpl(const ReadOptions& options, // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); - if (mem->Get(lkey, value, &s, merge_context, options_)) { + if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (imm.Get(lkey, value, &s, merge_context, options_)) { + } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { StopWatchNano from_files_timer(env_, false); StartPerfTimer(&from_files_timer); - current->Get(options, lkey, value, &s, &merge_context, &stats, - options_, value_found); + get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, + options_, value_found); have_stat_update = true; BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); RecordTick(options_.statistics.get(), MEMTABLE_MISS); @@ -2640,22 +2748,30 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatchNano post_process_timer(env_, false); StartPerfTimer(&post_process_timer); - mutex_.Lock(); - if (!options_.disable_seek_compaction && - have_stat_update && current->UpdateStats(stats)) { - MaybeScheduleFlushOrCompaction(); + bool delete_get_version = false; + if (!options_.disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (get_version->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + if (get_version->Unref()) { + get_version->Cleanup(); + delete_get_version = true; + } + mutex_.Unlock(); + } else { + if (get_version->Unref()) { + mutex_.Lock(); + get_version->Cleanup(); + mutex_.Unlock(); + delete_get_version = true; + } + } + if (delete_get_version) { + delete 
get_version; } - MemTable* m = mem->Unref(); - imm.UnrefAll(&to_delete); - current->Unref(); - mutex_.Unlock(); - // free up all obsolete memtables outside the mutex - delete m; - for (MemTable* v: to_delete) delete v; - - LogFlush(options_.info_log); // Note, tickers are atomic now - no lock protection needed any more. RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); @@ -2673,7 +2789,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, SequenceNumber snapshot; std::vector to_delete; - to_delete.reserve(options_.max_write_buffer_number); mutex_.Lock(); if (options.snapshot != nullptr) { @@ -2748,8 +2863,6 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, delete m; for (MemTable* v: to_delete) delete v; - LogFlush(options_.info_log); - RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead); @@ -2831,17 +2944,27 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.done = false; StopWatch sw(env_, options_.statistics.get(), DB_WRITE); - MutexLock l(&mutex_); + mutex_.Lock(); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { w.cv.Wait(); } + + if (!options.disableWAL) { + RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1); + } + if (w.done) { + mutex_.Unlock(); + RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); return w.status; + } else { + RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); } // May temporarily unlock and wait. 
- Status status = MakeRoomForWrite(my_batch == nullptr); + SuperVersion* superversion_to_free = nullptr; + Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free); uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions @@ -2877,7 +3000,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!options.disableWAL) { StopWatchNano timer(env_); StartPerfTimer(&timer); - status = log_->AddRecord(WriteBatchInternal::Contents(updates)); + Slice log_entry = WriteBatchInternal::Contents(updates); + status = log_->AddRecord(log_entry); + RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); + RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); @@ -2906,7 +3032,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SEQUENCE_NUMBER, last_sequence); } StartPerfTimer(&pre_post_process_timer); - LogFlush(options_.info_log); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); @@ -2933,6 +3058,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } + mutex_.Unlock(); + delete superversion_to_free; BumpPerfTime(&perf_context.write_pre_and_post_process_time, &pre_post_process_timer); return status; @@ -3027,7 +3154,8 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(bool force) { +Status DBImpl::MakeRoomForWrite(bool force, + SuperVersion** superversion_to_free) { mutex_.AssertHeld(); assert(!writers_.empty()); bool allow_delay = !force; @@ -3036,6 +3164,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { uint64_t 
rate_limit_delay_millis = 0; Status s; double score; + *superversion_to_free = nullptr; while (true) { if (!bg_error_.ok()) { @@ -3162,6 +3291,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { // Do this without holding the dbmutex lock. assert(versions_->PrevLogNumber() == 0); uint64_t new_log_number = versions_->NewFileNumber(); + SuperVersion* new_superversion = nullptr; mutex_.Unlock(); { EnvOptions soptions(storage_options_); @@ -3178,6 +3308,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); memtmp = new MemTable( internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + new_superversion = new SuperVersion(options_.max_write_buffer_number); } } mutex_.Lock(); @@ -3202,11 +3333,16 @@ Status DBImpl::MakeRoomForWrite(bool force) { mem_->SetLogNumber(logfile_number_); force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); + *superversion_to_free = InstallSuperVersion(new_superversion); } } return s; } +const std::string& DBImpl::GetName() const { + return dbname_; +} + Env* DBImpl::GetEnv() const { return env_; } @@ -3256,6 +3392,13 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else if (in == "stats") { char buf[1000]; + + uint64_t wal_bytes = 0; + uint64_t wal_synced = 0; + uint64_t user_bytes_written = 0; + uint64_t write_other = 0; + uint64_t write_self = 0; + uint64_t write_with_wal = 0; uint64_t total_bytes_written = 0; uint64_t total_bytes_read = 0; uint64_t micros_up = env_->NowMicros() - started_at_; @@ -3268,6 +3411,16 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { uint64_t interval_bytes_new = 0; double interval_seconds_up = 0; + Statistics* s = options_.statistics.get(); + if (s) { + wal_bytes = s->getTickerCount(WAL_FILE_BYTES); + wal_synced = s->getTickerCount(WAL_FILE_SYNCED); + user_bytes_written = s->getTickerCount(BYTES_WRITTEN); + write_other = 
s->getTickerCount(WRITE_DONE_BY_OTHER); + write_self = s->getTickerCount(WRITE_DONE_BY_SELF); + write_with_wal = s->getTickerCount(WRITE_WITH_WAL); + } + // Pardon the long line but I think it is easier to read this way. snprintf(buf, sizeof(buf), " Compactions\n" @@ -3324,19 +3477,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } } - interval_bytes_new = stats_[0].bytes_written - last_stats_.bytes_new_; - interval_bytes_read = total_bytes_read - last_stats_.bytes_read_; - interval_bytes_written = total_bytes_written - last_stats_.bytes_written_; + interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; + interval_bytes_read = total_bytes_read - last_stats_.compaction_bytes_read_; + interval_bytes_written = + total_bytes_written - last_stats_.compaction_bytes_written_; interval_seconds_up = seconds_up - last_stats_.seconds_up_; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); value->append(buf); + snprintf(buf, sizeof(buf), + "Writes cumulative: %llu total, %llu batches, " + "%.1f per batch, %.2f ingest GB\n", + (unsigned long long) (write_other + write_self), + (unsigned long long) write_self, + (write_other + write_self) / (double) (write_self + 1), + user_bytes_written / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "WAL cumulative: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f GB written\n", + (unsigned long long) write_with_wal, + (unsigned long long ) wal_synced, + write_with_wal / (double) (wal_synced + 1), + wal_bytes / (1048576.0 * 1024)); + value->append(buf); + snprintf(buf, sizeof(buf), "Compaction IO cumulative (GB): " "%.2f new, %.2f read, %.2f write, %.2f read+write\n", - stats_[0].bytes_written / (1048576.0 * 1024), + user_bytes_written / (1048576.0 * 1024), total_bytes_read / (1048576.0 * 1024), total_bytes_written / (1048576.0 * 1024), (total_bytes_read + total_bytes_written) / (1048576.0 * 1024)); @@ 
-3345,7 +3517,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "Compaction IO cumulative (MB/sec): " "%.1f new, %.1f read, %.1f write, %.1f read+write\n", - stats_[0].bytes_written / 1048576.0 / seconds_up, + user_bytes_written / 1048576.0 / seconds_up, total_bytes_read / 1048576.0 / seconds_up, total_bytes_written / 1048576.0 / seconds_up, (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up); @@ -3354,9 +3526,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { // +1 to avoid divide by 0 and NaN snprintf(buf, sizeof(buf), "Amplification cumulative: %.1f write, %.1f compaction\n", - (double) total_bytes_written / (stats_[0].bytes_written+1), - (double) (total_bytes_written + total_bytes_read) - / (stats_[0].bytes_written+1)); + (double) (total_bytes_written + wal_bytes) + / (user_bytes_written + 1), + (double) (total_bytes_written + total_bytes_read + wal_bytes) + / (user_bytes_written + 1)); + value->append(buf); + + uint64_t interval_write_other = write_other - last_stats_.write_other_; + uint64_t interval_write_self = write_self - last_stats_.write_self_; + + snprintf(buf, sizeof(buf), + "Writes interval: %llu total, %llu batches, " + "%.1f per batch, %.1f ingest MB\n", + (unsigned long long) (interval_write_other + interval_write_self), + (unsigned long long) interval_write_self, + (double) (interval_write_other + interval_write_self) + / (interval_write_self + 1), + (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0); + value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - last_stats_.write_with_wal_; + + uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; + uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; + + snprintf(buf, sizeof(buf), + "WAL interval: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f MB written\n", + (unsigned long long) interval_write_with_wal, + (unsigned long long ) 
interval_wal_synced, + interval_write_with_wal / (double) (interval_wal_synced + 1), + interval_wal_bytes / (1048576.0 * 1024)); value->append(buf); snprintf(buf, sizeof(buf), @@ -3381,9 +3582,10 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { // +1 to avoid divide by 0 and NaN snprintf(buf, sizeof(buf), "Amplification interval: %.1f write, %.1f compaction\n", - (double) interval_bytes_written / (interval_bytes_new+1), - (double) (interval_bytes_written + interval_bytes_read) / - (interval_bytes_new+1)); + (double) (interval_bytes_written + wal_bytes) + / (interval_bytes_new + 1), + (double) (interval_bytes_written + interval_bytes_read + wal_bytes) + / (interval_bytes_new + 1)); value->append(buf); snprintf(buf, sizeof(buf), @@ -3404,10 +3606,15 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { (unsigned long) total_slowdown_count); value->append(buf); - last_stats_.bytes_read_ = total_bytes_read; - last_stats_.bytes_written_ = total_bytes_written; - last_stats_.bytes_new_ = stats_[0].bytes_written; + last_stats_.compaction_bytes_read_ = total_bytes_read; + last_stats_.compaction_bytes_written_ = total_bytes_written; + last_stats_.ingest_bytes_ = user_bytes_written; last_stats_.seconds_up_ = seconds_up; + last_stats_.wal_bytes_ = wal_bytes; + last_stats_.wal_synced_ = wal_synced; + last_stats_.write_with_wal_ = write_with_wal; + last_stats_.write_other_ = write_other; + last_stats_.write_self_ = write_self; return true; } else if (in == "sstables") { @@ -3482,7 +3689,7 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData metadata; int maxlevel = NumberLevels(); VersionEdit edit(maxlevel); - DeletionState deletion_state; + DeletionState deletion_state(0, true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata); @@ -3512,14 +3719,14 @@ Status DBImpl::DeleteFile(std::string name) { } edit.DeleteFile(level, number); status = versions_->LogAndApply(&edit, &mutex_); + if 
(status.ok()) { + InstallSuperVersion(deletion_state); + } FindObsoleteFiles(deletion_state, false); } // lock released here LogFlush(options_.info_log); - - if (status.ok()) { - // remove files outside the db-lock - PurgeObsoleteFiles(deletion_state); - } + // remove files outside the db-lock + PurgeObsoleteFiles(deletion_state); return status; } @@ -3619,6 +3826,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { s = impl->versions_->LogAndApply(&edit, &impl->mutex_); } if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); impl->mem_->SetLogNumber(impl->logfile_number_); impl->DeleteObsoleteFiles(); impl->MaybeScheduleFlushOrCompaction(); diff --git a/db/db_impl.h b/db/db_impl.h index 8fd141a210..5f2148c8b6 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -67,6 +67,7 @@ class DBImpl : public DB { virtual int NumberLevels(); virtual int MaxMemCompactionLevel(); virtual int Level0StopWriteTrigger(); + virtual const std::string& GetName() const; virtual Env* GetEnv() const; virtual const Options& GetOptions() const; virtual Status Flush(const FlushOptions& options); @@ -127,12 +128,38 @@ class DBImpl : public DB { default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; } - // needed for CleanupIteratorState + // holds references to memtable, all immutable memtables and version + struct SuperVersion { + MemTable* mem; + MemTableList imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm.UnrefAll() returns + // all memtables that we need to free through this vector. 
We then + // delete all those memtables outside of mutex, during destruction + std::vector to_delete; + // should be called outside the mutex + explicit SuperVersion(const int num_memtables = 0); + ~SuperVersion(); + SuperVersion* Ref(); + // Returns true if this was the last reference and caller should + // call Clenaup() and delete the object + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that needs to be deleted in to_delete vector. Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current); + }; + + // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return memtables_to_free.size() || - all_files.size() || + return all_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -154,15 +181,35 @@ class DBImpl : public DB { // a list of memtables to be free std::vector memtables_to_free; + SuperVersion* superversion_to_free; // if nullptr nothing to free + + SuperVersion* new_superversion; // if nullptr no new superversion + // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0) { + explicit DeletionState(const int num_memtables = 0, + bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; memtables_to_free.reserve(num_memtables); + superversion_to_free = nullptr; + new_superversion = + create_superversion ? new SuperVersion(num_memtables) : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversion. 
if nullptr, this will be noop + delete superversion_to_free; + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; } }; @@ -239,7 +286,11 @@ class DBImpl : public DB { uint64_t* filenumber); uint64_t SlowdownAmount(int n, int top, int bottom); - Status MakeRoomForWrite(bool force /* compact even if there is room? */); + // MakeRoomForWrite will return superversion_to_free through an arugment, + // which the caller needs to delete. We do it because caller can delete + // the superversion outside of mutex + Status MakeRoomForWrite(bool force /* compact even if there is room? */, + SuperVersion** superversion_to_free); WriteBatch* BuildBatchGroup(Writer** last_writer); // Force current memtable contents to be flushed. @@ -323,6 +374,8 @@ class DBImpl : public DB { uint64_t logfile_number_; unique_ptr log_; + SuperVersion* super_version_; + std::string host_name_; // Queue of writers. @@ -440,15 +493,25 @@ class DBImpl : public DB { // Used to compute per-interval statistics struct StatsSnapshot { - uint64_t bytes_read_; - uint64_t bytes_written_; - uint64_t bytes_new_; + uint64_t compaction_bytes_read_; // Bytes read by compaction + uint64_t compaction_bytes_written_; // Bytes written by compaction + uint64_t ingest_bytes_; // Bytes written by user + uint64_t wal_bytes_; // Bytes written to WAL + uint64_t wal_synced_; // Number of times WAL is synced + uint64_t write_with_wal_; // Number of writes that request WAL + // These count the number of writes processed by the calling thread or + // another thread. 
+ uint64_t write_other_; + uint64_t write_self_; double seconds_up_; - StatsSnapshot() : bytes_read_(0), bytes_written_(0), - bytes_new_(0), seconds_up_(0) {} + StatsSnapshot() : compaction_bytes_read_(0), compaction_bytes_written_(0), + ingest_bytes_(0), wal_bytes_(0), wal_synced_(0), + write_with_wal_(0), write_other_(0), write_self_(0), + seconds_up_(0) {} }; + // Counters from the previous time per-interval stats were computed StatsSnapshot last_stats_; static const int KEEP_LOG_FILE_NUM = 1000; @@ -480,6 +543,18 @@ class DBImpl : public DB { std::vector& snapshots, SequenceNumber* prev_snapshot); + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + // As argument takes a pointer to allocated SuperVersion + // Foreground threads call this function directly (they don't carry + // deletion state and have to handle their own creation and deletion + // of SuperVersion) + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function above. Background threads carry + // deletion_state which can have new_superversion already allocated. 
+ void InstallSuperVersion(DeletionState& deletion_state); + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here Status GetImpl(const ReadOptions& options, diff --git a/db/db_test.cc b/db/db_test.cc index 10babbac60..ff053c2170 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -701,23 +701,25 @@ static std::string Key(int i) { return std::string(buf); } -TEST(DBTest, Empty) { +/* +TEST(DBTest, GetFromImmutableLayer) { do { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); -TEST(DBTest, ReadWrite) { - do { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); } +*/ // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. @@ -731,7 +733,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // Create a new talbe. - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); // index/filter blocks added to block cache right after table creation. 
ASSERT_EQ(1, @@ -776,157 +778,6 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); } -TEST(DBTest, LevelLimitReopen) { - Options options = CurrentOptions(); - Reopen(&options); - - const std::string value(1024 * 1024, ' '); - int i = 0; - while (NumTableFilesAtLevel(2) == 0) { - ASSERT_OK(Put(Key(i++), value)); - } - - options.num_levels = 1; - options.max_bytes_for_level_multiplier_additional.resize(1, 1); - Status s = TryReopen(&options); - ASSERT_EQ(s.IsCorruption(), true); - ASSERT_EQ(s.ToString(), - "Corruption: VersionEdit: db already has " - "more levels than options.num_levels"); - - options.num_levels = 10; - options.max_bytes_for_level_multiplier_additional.resize(10, 1); - ASSERT_OK(TryReopen(&options)); -} - -TEST(DBTest, Preallocation) { - const std::string src = dbname_ + "/alloc_test"; - unique_ptr srcfile; - const EnvOptions soptions; - ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); - srcfile->SetPreallocationBlockSize(1024 * 1024); - - // No writes should mean no preallocation - size_t block_size, last_allocated_block; - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 0UL); - - // Small write should preallocate one block - srcfile->Append("test"); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 1UL); - - // Write an entire preallocation block, make sure we increased by two. - std::string buf(block_size, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 2UL); - - // Write five more blocks at once, ensure we're where we need to be. 
- buf = std::string(block_size * 5, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 7UL); -} - -TEST(DBTest, PutDeleteGet) { - do { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} - - -TEST(DBTest, GetFromImmutableLayer) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls - Put("k1", std::string(100000, 'x')); // Fill memtable - Put("k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls - } while (ChangeOptions()); -} - -TEST(DBTest, GetFromVersions) { - do { - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v1", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetSnapshot) { - do { - // Try with both a short key and a long key - for (int i = 0; i < 2; i++) { - std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); - ASSERT_OK(Put(key, "v1")); - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_OK(Put(key, "v2")); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - db_->ReleaseSnapshot(s1); - } - } while (ChangeOptions()); -} - -TEST(DBTest, GetLevel0Ordering) { - do { - // Check that we process level-0 files in correct order. 
The code - // below generates two level-0 files where the earlier one comes - // before the later one in the level-0 file list since the earlier - // one has a smaller "smallest" key. - ASSERT_OK(Put("bar", "b")); - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Put("foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetOrderedByLevels) { - do { - ASSERT_OK(Put("foo", "v1")); - Compact("a", "z"); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetPicksCorrectFile) { - do { - // Arrange to have multiple files in a non-level-0 level. - ASSERT_OK(Put("a", "va")); - Compact("a", "b"); - ASSERT_OK(Put("x", "vx")); - Compact("x", "y"); - ASSERT_OK(Put("f", "vf")); - Compact("f", "g"); - ASSERT_EQ("va", Get("a")); - ASSERT_EQ("vf", Get("f")); - ASSERT_EQ("vx", Get("x")); - } while (ChangeOptions()); -} - TEST(DBTest, GetEncountersEmptyLevel) { do { // Arrange for the following to happen: @@ -4510,6 +4361,10 @@ class ModelDB: public DB { return -1; } + virtual const std::string& GetName() const { + return name_; + } + virtual Env* GetEnv() const { return nullptr; } @@ -4587,6 +4442,7 @@ class ModelDB: public DB { }; const Options options_; KVMap map_; + std::string name_ = ""; }; static std::string RandomKey(Random* rnd, int minimum = 0) { diff --git a/db/memtable.cc b/db/memtable.cc index 9b5df942db..55549a142f 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -279,7 +279,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::Corruption("Error: Could not perform merge."); } } else { - *s = Status::NotFound(Slice()); + *s = Status::NotFound(); } found_final_value = true; break; diff --git a/db/table_cache.cc b/db/table_cache.cc index e18c20c990..736bf60f5c 100644 --- 
a/db/table_cache.cc +++ b/db/table_cache.cc @@ -50,9 +50,8 @@ Status TableCache::FindTable(const EnvOptions& toptions, Cache::Handle** handle, bool* table_io, const bool no_io) { Status s; - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); + Slice key(reinterpret_cast(&file_number), sizeof(file_number)); + *handle = cache_->Lookup(key); if (*handle == nullptr) { if (no_io) { // Dont do IO and return a not-found status @@ -165,9 +164,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options, } void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); + Slice key(reinterpret_cast(&file_number), sizeof(file_number)); + cache_->Erase(key); } } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 74a33b6fb4..ee6c36c0f5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -545,7 +545,7 @@ void Version::Get(const ReadOptions& options, case kFound: return; case kDeleted: - *status = Status::NotFound(Slice()); // Use empty error message for speed + *status = Status::NotFound(); // Use empty error message for speed return; case kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); @@ -570,7 +570,7 @@ void Version::Get(const ReadOptions& options, user_key); } } else { - *status = Status::NotFound(Slice()); // Use an empty error message for speed + *status = Status::NotFound(); // Use an empty error message for speed } } @@ -1112,12 +1112,6 @@ class VersionSet::Builder { MaybeAddFile(v, level, *base_iter); } } - // Pre-sort level0 for Get() - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } CheckConsistency(v); } @@ -1683,6 +1677,12 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { void 
VersionSet::Finalize(Version* v, std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (options_->compaction_style == kCompactionStyleUniversal) { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); + } double max_score = 0; int max_score_level = 0; diff --git a/db/version_set.h b/db/version_set.h index aab4b82bc8..579ec3346d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -274,12 +274,14 @@ class VersionSet { int64_t NumLevelBytes(int level) const; // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } // Set the last sequence number to s. void SetLastSequence(uint64_t s) { assert(s >= last_sequence_); - last_sequence_ = s; + last_sequence_.store(s, std::memory_order_release); } // Mark the specified file number as used. @@ -478,7 +480,7 @@ class VersionSet { const InternalKeyComparator icmp_; uint64_t next_file_number_; uint64_t manifest_file_number_; - uint64_t last_sequence_; + std::atomic last_sequence_; uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 787bcf4315..a3b18084a8 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -54,171 +54,204 @@ extern "C" { /* Exported types */ -typedef struct leveldb_t leveldb_t; -typedef struct leveldb_cache_t leveldb_cache_t; -typedef struct leveldb_comparator_t leveldb_comparator_t; -typedef struct leveldb_env_t leveldb_env_t; -typedef struct leveldb_filelock_t leveldb_filelock_t; -typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t; -typedef struct leveldb_iterator_t leveldb_iterator_t; -typedef struct leveldb_logger_t leveldb_logger_t; -typedef struct leveldb_options_t leveldb_options_t; -typedef struct leveldb_randomfile_t 
leveldb_randomfile_t; -typedef struct leveldb_readoptions_t leveldb_readoptions_t; -typedef struct leveldb_seqfile_t leveldb_seqfile_t; -typedef struct leveldb_snapshot_t leveldb_snapshot_t; -typedef struct leveldb_writablefile_t leveldb_writablefile_t; -typedef struct leveldb_writebatch_t leveldb_writebatch_t; -typedef struct leveldb_writeoptions_t leveldb_writeoptions_t; +typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_env_t rocksdb_env_t; +typedef struct rocksdb_filelock_t rocksdb_filelock_t; +typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t; +typedef struct rocksdb_iterator_t rocksdb_iterator_t; +typedef struct rocksdb_logger_t rocksdb_logger_t; +typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t; /* DB operations */ -extern leveldb_t* leveldb_open( - const leveldb_options_t* options, +extern rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); -extern void leveldb_close(leveldb_t* db); +extern void rocksdb_close(rocksdb_t* db); -extern void leveldb_put( - leveldb_t* db, - const leveldb_writeoptions_t* options, +extern void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, const char* val, size_t vallen, char** errptr); -extern void leveldb_delete( - leveldb_t* db, - const leveldb_writeoptions_t* options, +extern void rocksdb_delete( + 
rocksdb_t* db, + const rocksdb_writeoptions_t* options, const char* key, size_t keylen, char** errptr); -extern void leveldb_write( - leveldb_t* db, - const leveldb_writeoptions_t* options, - leveldb_writebatch_t* batch, +extern void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); /* Returns NULL if not found. A malloc()ed array otherwise. Stores the length of the array in *vallen. */ -extern char* leveldb_get( - leveldb_t* db, - const leveldb_readoptions_t* options, +extern char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr); -extern leveldb_iterator_t* leveldb_create_iterator( - leveldb_t* db, - const leveldb_readoptions_t* options); +extern rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options); -extern const leveldb_snapshot_t* leveldb_create_snapshot( - leveldb_t* db); +extern const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); -extern void leveldb_release_snapshot( - leveldb_t* db, - const leveldb_snapshot_t* snapshot); +extern void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot); /* Returns NULL if property name is unknown. Else returns a pointer to a malloc()-ed null-terminated value. 
*/ -extern char* leveldb_property_value( - leveldb_t* db, +extern char* rocksdb_property_value( + rocksdb_t* db, const char* propname); -extern void leveldb_approximate_sizes( - leveldb_t* db, +extern void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, const size_t* range_limit_key_len, uint64_t* sizes); -extern void leveldb_compact_range( - leveldb_t* db, +extern void rocksdb_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, const char* limit_key, size_t limit_key_len); /* Management operations */ -extern void leveldb_destroy_db( - const leveldb_options_t* options, +extern void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); -extern void leveldb_repair_db( - const leveldb_options_t* options, +extern void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); /* Iterator */ -extern void leveldb_iter_destroy(leveldb_iterator_t*); -extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*); -extern void leveldb_iter_seek_to_first(leveldb_iterator_t*); -extern void leveldb_iter_seek_to_last(leveldb_iterator_t*); -extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen); -extern void leveldb_iter_next(leveldb_iterator_t*); -extern void leveldb_iter_prev(leveldb_iterator_t*); -extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen); -extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen); -extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr); +extern void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, 
size_t klen); +extern void rocksdb_iter_next(rocksdb_iterator_t*); +extern void rocksdb_iter_prev(rocksdb_iterator_t*); +extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen); +extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen); +extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr); /* Write batch */ -extern leveldb_writebatch_t* leveldb_writebatch_create(); -extern void leveldb_writebatch_destroy(leveldb_writebatch_t*); -extern void leveldb_writebatch_clear(leveldb_writebatch_t*); -extern void leveldb_writebatch_put( - leveldb_writebatch_t*, +extern rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_put( + rocksdb_writebatch_t*, const char* key, size_t klen, const char* val, size_t vlen); -extern void leveldb_writebatch_delete( - leveldb_writebatch_t*, +extern void rocksdb_writebatch_delete( + rocksdb_writebatch_t*, const char* key, size_t klen); -extern void leveldb_writebatch_iterate( - leveldb_writebatch_t*, +extern void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)); /* Options */ -extern leveldb_options_t* leveldb_options_create(); -extern void leveldb_options_destroy(leveldb_options_t*); -extern void leveldb_options_set_comparator( - leveldb_options_t*, - leveldb_comparator_t*); -extern void leveldb_options_set_compression_per_level( - leveldb_options_t* opt, +extern rocksdb_options_t* rocksdb_options_create(); +extern void rocksdb_options_destroy(rocksdb_options_t*); +extern void rocksdb_options_set_comparator( + rocksdb_options_t*, + rocksdb_comparator_t*); +extern void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, int* level_values, size_t num_levels); -extern 
void leveldb_options_set_filter_policy( - leveldb_options_t*, - leveldb_filterpolicy_t*); -extern void leveldb_options_set_create_if_missing( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_error_if_exists( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_paranoid_checks( - leveldb_options_t*, unsigned char); -extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*); -extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*); -extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t); -extern void leveldb_options_set_max_open_files(leveldb_options_t*, int); -extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*); -extern void leveldb_options_set_block_size(leveldb_options_t*, size_t); -extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int); -extern void leveldb_options_set_compression_options( - leveldb_options_t* opt, int w_bits, int level, int strategy); +extern void rocksdb_options_set_filter_policy( + rocksdb_options_t*, + rocksdb_filterpolicy_t*); +extern void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); +extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); +extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); +extern void rocksdb_options_set_compression_options( + rocksdb_options_t*, 
int, int, int); +extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); +extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int); +extern void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int); +extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); +extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*); +extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); + enum { - leveldb_no_compression = 0, - leveldb_snappy_compression = 1 + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 1, + rocksdb_bz2_compression = 1 }; -extern void leveldb_options_set_compression(leveldb_options_t*, int); +extern void rocksdb_options_set_compression(rocksdb_options_t*, int); +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1 +}; +extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int); +extern void 
rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*); /* Comparator */ -extern leveldb_comparator_t* leveldb_comparator_create( +extern rocksdb_comparator_t* rocksdb_comparator_create( void* state, void (*destructor)(void*), int (*compare)( @@ -226,11 +259,11 @@ extern leveldb_comparator_t* leveldb_comparator_create( const char* a, size_t alen, const char* b, size_t blen), const char* (*name)(void*)); -extern void leveldb_comparator_destroy(leveldb_comparator_t*); +extern void rocksdb_comparator_destroy(rocksdb_comparator_t*); /* Filter policy */ -extern leveldb_filterpolicy_t* leveldb_filterpolicy_create( +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( void* state, void (*destructor)(void*), char* (*create_filter)( @@ -243,40 +276,65 @@ extern leveldb_filterpolicy_t* leveldb_filterpolicy_create( const char* key, size_t length, const char* filter, size_t filter_length), const char* (*name)(void*)); -extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*); +extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*); -extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom( +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom( int bits_per_key); /* Read options */ -extern leveldb_readoptions_t* leveldb_readoptions_create(); -extern void leveldb_readoptions_destroy(leveldb_readoptions_t*); -extern void leveldb_readoptions_set_verify_checksums( - leveldb_readoptions_t*, +extern rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*); +extern void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); -extern void leveldb_readoptions_set_fill_cache( - leveldb_readoptions_t*, unsigned char); -extern void leveldb_readoptions_set_snapshot( - leveldb_readoptions_t*, - const leveldb_snapshot_t*); +extern void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned 
char); +extern void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, + const rocksdb_snapshot_t*); /* Write options */ -extern leveldb_writeoptions_t* leveldb_writeoptions_create(); -extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*); -extern void leveldb_writeoptions_set_sync( - leveldb_writeoptions_t*, unsigned char); +extern rocksdb_writeoptions_t* rocksdb_writeoptions_create(); +extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*); +extern void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable); /* Cache */ -extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity); -extern void leveldb_cache_destroy(leveldb_cache_t* cache); +extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity); +extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); /* Env */ -extern leveldb_env_t* leveldb_create_default_env(); -extern void leveldb_env_destroy(leveldb_env_t*); +extern rocksdb_env_t* rocksdb_create_default_env(); +extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_destroy(rocksdb_env_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ; +extern void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern void 
rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); #ifdef __cplusplus } /* end extern "C" */ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 7396f84454..c4c5aa87fe 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -228,6 +228,10 @@ class DB { // Number of files in level-0 that would stop writes. virtual int Level0StopWriteTrigger() = 0; + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + // Get Env object from the DB virtual Env* GetEnv() const = 0; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index bd5d485def..4ae93af1bb 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -111,52 +111,72 @@ enum Tickers { BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. 
+ WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, + + WRITE_WITH_WAL, // Number of Write calls that request WAL + + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + TICKER_ENUM_MAX }; // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap const std::vector> TickersNameMap = { - { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, - { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, - { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, - { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, - { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, - { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, - { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, - { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, - { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, - { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, - { MEMTABLE_HIT, "rocksdb.memtable.hit" }, - { MEMTABLE_MISS, "rocksdb.memtable.miss" }, - { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, - { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, - { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, - { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, - { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, - { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, - { BYTES_WRITTEN, "rocksdb.bytes.written" }, - { BYTES_READ, "rocksdb.bytes.read" }, - { NO_FILE_CLOSES, "rocksdb.no.file.closes" }, - { NO_FILE_OPENS, "rocksdb.no.file.opens" }, - { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, - { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, - { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, - { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, - { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, - { NO_ITERATORS, "rocksdb.num.iterators" }, - { 
NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, - { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, - { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, - { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, - { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, - { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, - { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, - { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, - { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, - { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, - { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, + { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" }, + { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" }, + { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" }, + { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" }, + { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" }, + { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" }, + { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" }, + { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" }, + { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" }, + { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" }, + { MEMTABLE_HIT, "rocksdb.memtable.hit" }, + { MEMTABLE_MISS, "rocksdb.memtable.miss" }, + { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" }, + { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" }, + { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" }, + { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" }, + { NUMBER_KEYS_READ, "rocksdb.number.keys.read" }, + { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" }, + { BYTES_WRITTEN, "rocksdb.bytes.written" }, + { BYTES_READ, "rocksdb.bytes.read" }, + { NO_FILE_CLOSES, "rocksdb.no.file.closes" 
}, + { NO_FILE_OPENS, "rocksdb.no.file.opens" }, + { NO_FILE_ERRORS, "rocksdb.no.file.errors" }, + { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" }, + { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" }, + { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" }, + { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" }, + { NO_ITERATORS, "rocksdb.num.iterators" }, + { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, + { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" }, + { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" }, + { SEQUENCE_NUMBER, "rocksdb.sequence.number" }, + { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" }, + { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }, + { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }, + { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" }, + { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" }, + { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }, + { WAL_FILE_SYNCED, "rocksdb.wal.synced" }, + { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, + { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, + { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, + { WRITE_WITH_WAL, "rocksdb.write.wal" }, + { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, + { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, }; /** diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b118e3db4b..e2304fdb67 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -25,7 +25,7 @@ namespace rocksdb { class Status { public: // Create a success status. - Status() : state_(nullptr) { } + Status() : code_(kOk), state_(nullptr) { } ~Status() { delete[] state_; } // Copy the specified status. 
@@ -39,6 +39,10 @@ class Status { static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); } + // Fast path for not found without malloc; + static Status NotFound() { + return Status(kNotFound); + } static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); } @@ -59,7 +63,7 @@ class Status { } // Returns true iff the status indicates success. - bool ok() const { return (state_ == nullptr); } + bool ok() const { return code() == kOk; } // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { return code() == kNotFound; } @@ -87,13 +91,6 @@ class Status { std::string ToString() const; private: - // OK status has a nullptr state_. Otherwise, state_ is a new[] array - // of the following form: - // state_[0..3] == length of message - // state_[4] == code - // state_[5..] == message - const char* state_; - enum Code { kOk = 0, kNotFound = 1, @@ -105,20 +102,30 @@ class Status { kIncomplete = 7 }; - Code code() const { - return (state_ == nullptr) ? kOk : static_cast(state_[4]); - } + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // of the following form: + // state_[0..3] == length of message + // state_[4..] == message + Code code_; + const char* state_; + Code code() const { + return code_; + } + explicit Status(Code code) : code_(code), state_(nullptr) { } Status(Code code, const Slice& msg, const Slice& msg2); static const char* CopyState(const char* s); }; inline Status::Status(const Status& s) { + code_ = s.code_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline void Status::operator=(const Status& s) { // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. + code_ = s.code_; if (state_ != s.state_) { delete[] state_; state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index abf0725748..41a3250d8d 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -56,7 +56,7 @@ class LogFile { }; struct BatchResult { - SequenceNumber sequence = SequenceNumber(); + SequenceNumber sequence = 0; std::unique_ptr writeBatchPtr; }; diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h new file mode 100644 index 0000000000..335e028576 --- /dev/null +++ b/include/utilities/backupable_db.h @@ -0,0 +1,133 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#include +#include +#include + +namespace rocksdb { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // Backup info and error messages will be written to info_log + // if non-nullptr. 
+ // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + explicit BackupableDBOptions(const std::string& _backup_dir, + Env* _backup_env = nullptr, + Logger* _info_log = nullptr, + bool _sync = true, + bool _destroy_old_data = false) : + backup_dir(_backup_dir), + backup_env(_backup_env), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data) { } +}; + +class BackupEngine; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + BackupInfo() {} + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} +}; + +// Stack your DB with BackupableDB to be able to backup the DB +class BackupableDB : public StackableDB { + public: + // BackupableDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // BackupableDB owns the pointer `DB* db` now. 
You should not delete it or + // use it after the invocation of BackupableDB + BackupableDB(DB* db, const BackupableDBOptions& options); + virtual ~BackupableDB(); + + // Captures the state of the database in the latest backup + // NOT a thread safe call + Status CreateNewBackup(bool flush_before_backup = false); + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +// Use this class to access information about backups and restore from them +class RestoreBackupableDB { + public: + RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options); + ~RestoreBackupableDB(); + + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + + // restore from backup with backup_id + // IMPORTANT -- if you restore from some backup that is not the latest, + // and you start creating new backups from the new DB, all the backups + // that were newer than the backup you restored from will be deleted + // + // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. + // If you try creating a new backup now, old backups 4 and 5 will be deleted + // and new backup with ID 4 will be created. 
+ Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir); + + // restore from the latest backup + Status RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +} // rocksdb namespace diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index e74bf353b4..2d86a611b7 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -103,6 +103,10 @@ class StackableDB : public DB { return db_->Level0StopWriteTrigger(); } + virtual const std::string& GetName() const override { + return db_->GetName(); + } + virtual Env* GetEnv() const override { return db_->GetEnv(); } diff --git a/port/stack_trace.cc b/port/stack_trace.cc index a98f26eacf..aa01fd0cf3 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -31,12 +31,7 @@ static const char* GetExecutableName() } } -static void StackTraceHandler(int sig) { - // reset to default handler - signal(sig, SIG_DFL); - - fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); - +void PrintStack(int first_frames_to_skip) { const int kMaxFrames = 100; void *frames[kMaxFrames]; @@ -45,11 +40,8 @@ static void StackTraceHandler(int sig) { auto executable = GetExecutableName(); - const int kSkip = 2; // skip the top two signal handler related frames - - for (int i = kSkip; i < num_frames; ++i) - { - fprintf(stderr, "#%-2d %p ", i - kSkip, frames[i]); + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); if (symbols) { fprintf(stderr, "%s ", symbols[i]); } @@ -57,22 +49,29 @@ static void StackTraceHandler(int sig) { // out source to addr2line, for the address translation const int kLineMax 
= 256; char cmd[kLineMax]; - sprintf(cmd,"addr2line %p -e %s 2>&1", frames[i] , executable); + sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable); auto f = popen(cmd, "r"); if (f) { char line[kLineMax]; while (fgets(line, sizeof(line), f)) { - fprintf(stderr, "%s", line); + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); } pclose(f); - } else { - fprintf(stderr, "\n"); } } else { - fprintf(stderr, "\n"); + fprintf(stderr, " %p", frames[i]); } + fprintf(stderr, "\n"); } +} +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); // re-signal to default handler (so we still get core dump if needed...) raise(sig); } @@ -96,6 +95,7 @@ void InstallStackTraceHandler() { namespace rocksdb { void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} } diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 0000000000..9998e29560 --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,329 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. 
+// +// Currently we don't support: +// * reserve()/shrink_to_fit()/resize() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect) + , index_(index) { + }; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() { } + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + 
auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { + return !(*this == other); + } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
+ return vect_.capacity() == 0; + } + + size_type size() const { + return num_stack_items_ + vect_.size(); + } + + bool empty() const { + return size() == 0; + } + + // will not check boundary + const_reference operator[](size_type n) const { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + reference operator[](size_type n) { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + // will check boundary + const_reference at(size_type n) const { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference at(size_type n) { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + push_back(value_type(item)); + } + + template + void emplace_back(Args&&... args) { + push_back(value_type(args...)); + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + --num_stack_items_; + } + } + + void clear() { + num_stack_items_ = 0; + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { + assign(other); + } + + autovector& operator=(const autovector& other) { + return assign(other); + } + + // move operations are disallowed since it is very hard to make sure both + // autovectors are allocated from the same function stack. 
+ autovector& operator=(autovector&& other) = delete; + autovector(autovector&& other) = delete; + + // -- Iterator Operations + iterator begin() { + return iterator(this, 0); + } + + const_iterator begin() const { + return const_iterator(this, 0); + } + + iterator end() { + return iterator(this, this->size()); + } + + const_iterator end() const { + return const_iterator(this, this->size()); + } + + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { + return reverse_iterator(begin()); + } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +} // rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 0000000000..6d709a374b --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const size_t kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +vector GetTestKeys(size_t size) { + vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + to_string(index++); + } + return keys; +} + +template +void BenchmarkVectorCreationAndInsertion( + string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env 
= Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// of our use cases we used std::vector. +TEST(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialized. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. 
+ cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, string_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys + ); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, int_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess>( + "vector", kOps, elem_size + ); + BenchmarkSequenceAccess>( + "autovector", kOps, elem_size + ); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/cache.cc b/util/cache.cc index 34a12d3453..ddd808b415 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -10,7 +10,7 @@ 
#include #include #include -#include +#include #include "rocksdb/cache.h" #include "port/port.h" @@ -111,8 +111,8 @@ class HandleTable { } void Resize() { - uint32_t new_length = 4; - while (new_length < elems_) { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { new_length *= 2; } LRUHandle** new_list = new LRUHandle*[new_length]; @@ -264,18 +264,20 @@ Cache::Handle* LRUCache::Insert( LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); - std::list last_reference_list; + std::vector last_reference_list; + last_reference_list.reserve(1); + + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); { MutexLock l(&mutex_); - e->value = value; - e->deleter = deleter; - e->charge = charge; - e->key_length = key.size(); - e->hash = hash; - e->refs = 2; // One from LRUCache, one for the returned handle - memcpy(e->key_data, key.data(), key.size()); LRU_Append(e); LRUHandle* old = table_.Insert(e); diff --git a/util/coding.cc b/util/coding.cc index 2d70647fb7..ce67fa4866 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -217,6 +217,17 @@ Slice GetLengthPrefixedSlice(const char* data) { return Slice(p, len); } +Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); + return ret; +} + void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, uint32_t bits, uint64_t value) { assert((offset + bits + 7)/8 <= dstlen); diff --git a/util/coding.h b/util/coding.h index 3fd892f791..4477dc799e 100644 --- a/util/coding.h +++ b/util/coding.h @@ -40,6 +40,8 @@ extern bool GetVarint64(Slice* input, uint64_t* value); extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); extern Slice GetLengthPrefixedSlice(const char* data); +extern Slice GetSliceUntil(Slice* slice, char delimiter); + // Pointer-based variants of GetVarint... These either store a value // in *v and return a pointer just past the parsed value, or return // nullptr on error. These routines only look at bytes in the range diff --git a/util/env_posix.cc b/util/env_posix.cc index 1ed8d69600..3db0fd62e8 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -395,7 +395,7 @@ class PosixMmapFile : public WritableFile { } Status MapNewRegion() { -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT assert(base_ == nullptr); TEST_KILL_RANDOM(rocksdb_kill_odds); @@ -581,7 +581,7 @@ class PosixMmapFile : public WritableFile { #endif } -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { @@ -758,7 +758,7 @@ class PosixWritableFile : public WritableFile { #endif } -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { @@ -862,7 +862,7 @@ class PosixRandomRWFile : public RandomRWFile { return Status::OK(); } -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { return Status::OK(); @@ -1303,7 +1303,7 @@ class PosixEnv : public Env { } bool SupportsFastAllocate(const std::string& path) 
{ -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; if (statfs(path.c_str(), &s)){ return false; diff --git a/util/posix_logger.h b/util/posix_logger.h index 0a09bd1ebc..8f7463c98b 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -36,15 +36,19 @@ class PosixLogger : public Logger { const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; + bool flush_pending_; public: PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) : file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)), - last_flush_micros_(0), env_(env) { } + last_flush_micros_(0), env_(env), flush_pending_(false) { } virtual ~PosixLogger() { fclose(file_); } virtual void Flush() { - fflush(file_); + if (flush_pending_) { + flush_pending_ = false; + fflush(file_); + } last_flush_micros_ = env_->NowMicros(); } virtual void Logv(const char* format, va_list ap) { @@ -107,7 +111,7 @@ class PosixLogger : public Logger { assert(p <= limit); const size_t write_size = p - base; -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT // If this write would cross a boundary of kDebugLogChunkSize // space, pre-allocate more space to avoid overly large // allocations from filesystem allocsize options. @@ -124,6 +128,7 @@ class PosixLogger : public Logger { #endif size_t sz = fwrite(base, 1, write_size, file_); + flush_pending_ = true; assert(sz == write_size); if (sz > 0) { log_size_ += write_size; @@ -131,6 +136,7 @@ class PosixLogger : public Logger { uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; fflush(file_); last_flush_micros_ = now_micros; } diff --git a/util/stack_trace.h b/util/stack_trace.h index 888304462e..3b06e1df06 100644 --- a/util/stack_trace.h +++ b/util/stack_trace.h @@ -11,4 +11,7 @@ namespace rocksdb { // Currently supports linux only. No-op otherwise. 
void InstallStackTraceHandler(); +// Prints stack, skips skip_first_frames frames +void PrintStack(int first_frames_to_skip = 0); + } // namespace rocksdb diff --git a/util/status.cc b/util/status.cc index f7c40e9526..69060a7ccf 100644 --- a/util/status.cc +++ b/util/status.cc @@ -16,68 +16,65 @@ namespace rocksdb { const char* Status::CopyState(const char* state) { uint32_t size; memcpy(&size, state, sizeof(size)); - char* result = new char[size + 5]; - memcpy(result, state, size + 5); + char* result = new char[size + 4]; + memcpy(result, state, size + 4); return result; } -Status::Status(Code code, const Slice& msg, const Slice& msg2) { +Status::Status(Code code, const Slice& msg, const Slice& msg2) : + code_(code) { assert(code != kOk); const uint32_t len1 = msg.size(); const uint32_t len2 = msg2.size(); const uint32_t size = len1 + (len2 ? (2 + len2) : 0); - char* result = new char[size + 5]; + char* result = new char[size + 4]; memcpy(result, &size, sizeof(size)); - result[4] = static_cast(code); - memcpy(result + 5, msg.data(), len1); + memcpy(result + 4, msg.data(), len1); if (len2) { - result[5 + len1] = ':'; - result[6 + len1] = ' '; - memcpy(result + 7 + len1, msg2.data(), len2); + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); } state_ = result; } std::string Status::ToString() const { - if (state_ == nullptr) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (code()) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound: "; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - case kMergeInProgress: - type = "Merge In Progress: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(code())); - type = tmp; - break; - } - std::string result(type); 
+ char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge In Progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { uint32_t length; memcpy(&length, state_, sizeof(length)); - result.append(state_ + 5, length); - return result; + result.append(state_ + 4, length); } + return result; } } // namespace rocksdb diff --git a/util/testharness.h b/util/testharness.h index 936ee8b6c4..f15917816e 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -15,6 +15,7 @@ #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" +#include "util/stack_trace.h" namespace rocksdb { namespace test { @@ -58,6 +59,7 @@ class Tester { ~Tester() { if (!ok_) { fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + PrintStack(2); exit(1); } } diff --git a/utilities/.DS_Store b/utilities/.DS_Store deleted file mode 100644 index daeccc094b..0000000000 Binary files a/utilities/.DS_Store and /dev/null differ diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc new file mode 100644 index 0000000000..61e009cd31 --- /dev/null +++ b/utilities/backupable/backupable_db.cc @@ -0,0 +1,874 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "utilities/backupable_db.h" +#include "db/filename.h" +#include "util/coding.h" +#include "rocksdb/transaction_log.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// -------- BackupEngine class --------- +class BackupEngine { + public: + BackupEngine(Env* db_env, const BackupableDBOptions& options); + ~BackupEngine(); + Status CreateNewBackup(DB* db, bool flush_before_backup = false); + Status PurgeOldBackups(uint32_t num_backups_to_keep); + Status DeleteBackup(BackupID backup_id); + + void GetBackupInfo(std::vector* backup_info); + Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir, + const std::string &wal_dir); + Status RestoreDBFromLatestBackup(const std::string &db_dir, + const std::string &wal_dir) { + return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir); + } + + void DeleteBackupsNewerThan(uint64_t sequence_number); + + private: + class BackupMeta { + public: + BackupMeta(const std::string& meta_filename, + std::unordered_map* file_refs, Env* env) + : timestamp_(0), size_(0), meta_filename_(meta_filename), + file_refs_(file_refs), env_(env) {} + + ~BackupMeta() {} + + void RecordTimestamp() { + env_->GetCurrentTime(×tamp_); + } + int64_t GetTimestamp() const { + return timestamp_; + } + uint64_t GetSize() const { + return size_; + } + void SetSequenceNumber(uint64_t sequence_number) { + sequence_number_ = sequence_number; + } + uint64_t GetSequenceNumber() { + return sequence_number_; + } + + void AddFile(const std::string& filename, uint64_t size); + void Delete(); + + bool Empty() { + return files_.empty(); + } + + const std::vector& GetFiles() { + return files_; + } + + Status LoadFromFile(const std::string& backup_dir); + Status StoreToFile(bool sync); + + private: + int64_t timestamp_; 
+ // sequence number is only approximate, should not be used + // by clients + uint64_t sequence_number_; + uint64_t size_; + std::string const meta_filename_; + // files with relative paths (without "/" prefix!!) + std::vector files_; + std::unordered_map* file_refs_; + Env* env_; + + static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB + }; // BackupMeta + + inline std::string GetAbsolutePath( + const std::string &relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateDirRel() const { + return "private"; + } + inline std::string GetPrivateFileRel(BackupID backup_id, + const std::string &file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return "shared/" + file; + } + inline std::string GetLatestBackupFile(bool tmp = false) const { + return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? 
".tmp" : "")); + } + inline std::string GetBackupMetaDir() const { + return GetAbsolutePath("meta"); + } + inline std::string GetBackupMetaFile(BackupID backup_id) const { + return GetBackupMetaDir() + "/" + std::to_string(backup_id); + } + + Status GetLatestBackupFileContents(uint32_t* latest_backup); + Status PutLatestBackupFileContents(uint32_t latest_backup); + // if size_limit == 0, there is no size limit, copy everything + Status CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + uint64_t* size = nullptr, + uint64_t size_limit = 0); + // if size_limit == 0, there is no size limit, copy everything + Status BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, // starts with "/" + uint64_t size_limit = 0); + // Will delete all the files we don't need anymore + // If full_scan == true, it will do the full scan of files/ directory + // and delete all the files that are not referenced from backuped_file_refs_ + void GarbageCollection(bool full_scan); + + // backup state data + BackupID latest_backup_id_; + std::map backups_; + std::unordered_map backuped_file_refs_; + std::vector obsolete_backups_; + + // options data + BackupableDBOptions options_; + Env* db_env_; + Env* backup_env_; + + static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB +}; + +BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) + : options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? 
options.backup_env : db_env_) { + + // create all the dirs we need + backup_env_->CreateDirIfMissing(GetAbsolutePath()); + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); + backup_env_->CreateDirIfMissing(GetBackupMetaDir()); + + std::vector backup_meta_files; + backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files); + // create backups_ structure + for (auto& file : backup_meta_files) { + BackupID backup_id = 0; + sscanf(file.c_str(), "%u", &backup_id); + if (backup_id == 0 || file != std::to_string(backup_id)) { + // invalid file name, delete that + backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + backups_.insert(std::make_pair( + backup_id, BackupMeta(GetBackupMetaFile(backup_id), + &backuped_file_refs_, backup_env_))); + } + + if (options_.destroy_old_data) { // Destory old data + for (auto& backup : backups_) { + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + backups_.clear(); + // start from beginning + latest_backup_id_ = 0; + // GarbageCollection() will do the actual deletion + } else { // Load data from storage + // load the backups if any + for (auto& backup : backups_) { + Status s = backup.second.LoadFromFile(options_.backup_dir); + if (!s.ok()) { + Log(options_.info_log, "Backup %u corrupted - deleting -- %s", + backup.first, s.ToString().c_str()); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + // delete obsolete backups from the structure + for (auto ob : obsolete_backups_) { + backups_.erase(ob); + } + + Status s = GetLatestBackupFileContents(&latest_backup_id_); + // If latest backup file is corrupted or non-existent + // set latest backup as the biggest backup we have + // or 0 if we have no backups + if (!s.ok() || + backups_.find(latest_backup_id_) == backups_.end()) { + auto itr = backups_.end(); + 
latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first; + } + } + + // delete any backups that claim to be later than latest + for (auto itr = backups_.upper_bound(latest_backup_id_); + itr != backups_.end();) { + itr->second.Delete(); + obsolete_backups_.push_back(itr->first); + itr = backups_.erase(itr); + } + + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(true); + Log(options_.info_log, + "Initialized BackupEngine, the latest backup is %u.", + latest_backup_id_); +} + +BackupEngine::~BackupEngine() { + LogFlush(options_.info_log); +} + +void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) { + for (auto backup : backups_) { + if (backup.second.GetSequenceNumber() > sequence_number) { + Log(options_.info_log, + "Deleting backup %u because sequence number (%" PRIu64 + ") is newer than %" PRIu64 "", + backup.first, backup.second.GetSequenceNumber(), sequence_number); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + for (auto ob : obsolete_backups_) { + backups_.erase(backups_.find(ob)); + } + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 
0 : (--itr)->first; + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(false); +} + +Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { + Status s; + std::vector live_files; + VectorLogPtr live_wal_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = db->GetLatestSequenceNumber(); + + s = db->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup); + } + // if we didn't flush before backup, we need to also get WAL files + if (s.ok() && !flush_before_backup) { + // returns file names prefixed with "/" + s = db->GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + db->EnableFileDeletions(); + return s; + } + + BackupID new_backup_id = latest_backup_id_ + 1; + assert(backups_.find(new_backup_id) == backups_.end()); + auto ret = backups_.insert(std::make_pair( + new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), + &backuped_file_refs_, backup_env_))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup.RecordTimestamp(); + new_backup.SetSequenceNumber(sequence_number); + + Log(options_.info_log, "Started the backup process -- creating backup %u", + new_backup_id); + + // create private dir + s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id))); + + // copy live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + assert(false); + return Status::Corruption("Can't parse file name. 
This is very bad"); + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || + type == kDescriptorFile || + type == kCurrentFile); + + // rules: + // * if it's kTableFile, than it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + s = BackupFile(new_backup_id, + &new_backup, + type == kTableFile, /* shared */ + db->GetName(), /* src_dir */ + live_files[i], /* src_fname */ + (type == kDescriptorFile) ? manifest_file_size : 0); + } + + // copy WAL files + for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) { + if (live_wal_files[i]->Type() == kAliveLogFile) { + // we only care about live log files + // copy the file into backup_dir/files// + s = BackupFile(new_backup_id, + &new_backup, + false, /* not shared */ + db->GetOptions().wal_dir, + live_wal_files[i]->PathName()); + } + } + + // we copied all the files, enable file deletions + db->EnableFileDeletions(); + + if (s.ok()) { + // persist the backup metadata on the disk + s = new_backup.StoreToFile(options_.sync); + } + if (s.ok()) { + // install the newly created backup meta! (atomic) + s = PutLatestBackupFileContents(new_backup_id); + } + if (!s.ok()) { + // clean all the files we might have created + Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + backups_.erase(new_backup_id); + GarbageCollection(true); + return s; + } + + // here we know that we succeeded and installed the new backup + // in the LATEST_BACKUP file + latest_backup_id_ = new_backup_id; + Log(options_.info_log, "Backup DONE. 
All is good"); + return s; +} + +Status BackupEngine::PurgeOldBackups(uint32_t num_backups_to_keep) { + Log(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + while (num_backups_to_keep < backups_.size()) { + Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); + backups_.begin()->second.Delete(); + obsolete_backups_.push_back(backups_.begin()->first); + backups_.erase(backups_.begin()); + } + GarbageCollection(false); + return Status::OK(); +} + +Status BackupEngine::DeleteBackup(BackupID backup_id) { + Log(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup == backups_.end()) { + return Status::NotFound("Backup not found"); + } + backup->second.Delete(); + obsolete_backups_.push_back(backup_id); + backups_.erase(backup); + GarbageCollection(false); + return Status::OK(); +} + +void BackupEngine::GetBackupInfo(std::vector* backup_info) { + backup_info->reserve(backups_.size()); + for (auto& backup : backups_) { + if (!backup.second.Empty()) { + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + } + } +} + +Status BackupEngine::RestoreDBFromBackup(BackupID backup_id, + const std::string &db_dir, + const std::string &wal_dir) { + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup.Empty()) { + return Status::NotFound("Backup not found"); + } + + Log(options_.info_log, "Restoring backup id %u\n", backup_id); + + // just in case. Ignore errors + db_env_->CreateDirIfMissing(db_dir); + db_env_->CreateDirIfMissing(wal_dir); + + // delete log files that might have been already in wal_dir. 
+ // This is important since they might get replayed to the restored DB, + // which will then differ from the backuped DB + std::vector delete_children; + db_env_->GetChildren(wal_dir, &delete_children); // ignore errors + for (auto f : delete_children) { + db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors + } + // Also delete all the db_dir children. This is not so important + // because obsolete files will be deleted by DBImpl::PurgeObsoleteFiles() + delete_children.clear(); + db_env_->GetChildren(db_dir, &delete_children); // ignore errors + for (auto f : delete_children) { + db_env_->DeleteFile(db_dir + "/" + f); // ignore errors + } + + Status s; + for (auto& file : backup.GetFiles()) { + std::string dst; + // 1. extract the filename + size_t slash = file.find_last_of('/'); + // file will either be shared/ or private// + assert(slash != std::string::npos); + dst = file.substr(slash + 1); + + // 2. find the filetype + uint64_t number; + FileType type; + bool ok = ParseFileName(dst, &number, &type); + if (!ok) { + return Status::Corruption("Backup corrupted"); + } + // 3. Construct the final path + // kLogFile lives in wal_dir and all the rest live in db_dir + dst = ((type == kLogFile) ? wal_dir : db_dir) + + "/" + dst; + + Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); + s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false); + if (!s.ok()) { + break; + } + } + + Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str()); + return s; +} + +// latest backup id is an ASCII representation of latest backup id +Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) { + Status s; + unique_ptr file; + s = backup_env_->NewSequentialFile(GetLatestBackupFile(), + &file, + EnvOptions()); + if (!s.ok()) { + return s; + } + + char buf[11]; + Slice data; + s = file->Read(10, &data, buf); + if (!s.ok() || data.size() == 0) { + return s.ok() ? 
Status::Corruption("Latest backup file corrupted") : s; + } + buf[data.size()] = 0; + + *latest_backup = 0; + sscanf(data.data(), "%u", latest_backup); + if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) { + s = Status::Corruption("Latest backup file corrupted"); + } + return Status::OK(); +} + +// this operation HAS to be atomic +// writing 4 bytes to the file is atomic alright, but we should *never* +// do something like 1. delete file, 2. write new file +// We write to a tmp file and then atomically rename +Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) { + Status s; + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = backup_env_->NewWritableFile(GetLatestBackupFile(true), + &file, + env_options); + if (!s.ok()) { + backup_env_->DeleteFile(GetLatestBackupFile(true)); + return s; + } + + char file_contents[10]; + int len = sprintf(file_contents, "%u\n", latest_backup); + s = file->Append(Slice(file_contents, len)); + if (s.ok() && options_.sync) { + file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + if (s.ok()) { + // atomically replace real file with new tmp + s = backup_env_->RenameFile(GetLatestBackupFile(true), + GetLatestBackupFile(false)); + } + return s; +} + +Status BackupEngine::CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + uint64_t* size, + uint64_t size_limit) { + Status s; + unique_ptr dst_file; + unique_ptr src_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + if (size != nullptr) { + *size = 0; + } + + // Check if size limit is set. 
if not, set it to very big number + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + s = src_env->NewSequentialFile(src, &src_file, env_options); + if (s.ok()) { + s = dst_env->NewWritableFile(dst, &dst_file, env_options); + } + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[copy_file_buffer_size_]); + Slice data; + + do { + size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? + copy_file_buffer_size_ : size_limit; + s = src_file->Read(buffer_to_read, &data, buf.get()); + size_limit -= data.size(); + if (size != nullptr) { + *size += data.size(); + } + if (s.ok()) { + s = dst_file->Append(data); + } + } while (s.ok() && data.size() > 0 && size_limit > 0); + + if (s.ok() && sync) { + s = dst_file->Sync(); + } + + return s; +} + +// src_fname will always start with "/" +Status BackupEngine::BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, + uint64_t size_limit) { + + assert(src_fname.size() > 0 && src_fname[0] == '/'); + std::string dst_relative = src_fname.substr(1); + if (shared) { + dst_relative = GetSharedFileRel(dst_relative); + } else { + dst_relative = GetPrivateFileRel(backup_id, dst_relative); + } + std::string dst_path = GetAbsolutePath(dst_relative); + Status s; + uint64_t size; + + // if it's shared, we also need to check if it exists -- if it does, + // no need to copy it again + if (shared && backup_env_->FileExists(dst_path)) { + backup_env_->GetFileSize(dst_path, &size); // Ignore error + Log(options_.info_log, "%s already present", src_fname.c_str()); + } else { + Log(options_.info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(src_dir + src_fname, + dst_path, + db_env_, + backup_env_, + options_.sync, + &size, + size_limit); + } + if (s.ok()) { + backup->AddFile(dst_relative, size); + } + return s; +} + +void BackupEngine::GarbageCollection(bool full_scan) { + Log(options_.info_log, "Starting garbage collection"); + 
std::vector to_delete; + for (auto& itr : backuped_file_refs_) { + if (itr.second == 0) { + Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_refs_.erase(td); + } + if (!full_scan) { + // take care of private dirs -- if full_scan == true, then full_scan will + // take care of them + for (auto backup_id : obsolete_backups_) { + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); + } + } + obsolete_backups_.clear(); + + if (full_scan) { + Log(options_.info_log, "Starting full scan garbage collection"); + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_refs_.find(rel_fname) == backuped_file_refs_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); + } + } + } + + // delete obsolete private files + std::vector private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + sscanf(child.c_str(), "%u", &backup_id); + if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) { + // it's either not a number or it's still alive. 
continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + } + } +} + +// ------- BackupMeta class -------- + +void BackupEngine::BackupMeta::AddFile(const std::string& filename, + uint64_t size) { + size_ += size; + files_.push_back(filename); + auto itr = file_refs_->find(filename); + if (itr == file_refs_->end()) { + file_refs_->insert(std::make_pair(filename, 1)); + } else { + ++itr->second; // increase refcount if already present + } +} + +void BackupEngine::BackupMeta::Delete() { + for (auto& file : files_) { + auto itr = file_refs_->find(file); + assert(itr != file_refs_->end()); + --(itr->second); // decrease refcount + } + files_.clear(); + // delete meta file + env_->DeleteFile(meta_filename_); + timestamp_ = 0; +} + +// each backup meta file is of the format: +// +// +// +// +// +// ... +// TODO: maybe add checksum? +Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) { + assert(Empty()); + Status s; + unique_ptr backup_meta_file; + s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions()); + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); + Slice data; + s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get()); + + if (!s.ok() || data.size() == max_backup_meta_file_size_) { + return s.ok() ? 
Status::IOError("File size too big") : s; + } + buf[data.size()] = 0; + + uint32_t num_files = 0; + int bytes_read = 0; + sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%u%n", &num_files, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + + std::vector> files; + + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { + std::string filename = GetSliceUntil(&data, '\n').ToString(); + uint64_t size; + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + files.push_back(std::make_pair(filename, size)); + } + + if (s.ok()) { + for (auto file : files) { + AddFile(file.first, file.second); + } + } + + return s; +} + +Status BackupEngine::BackupMeta::StoreToFile(bool sync) { + Status s; + unique_ptr backup_meta_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file, + env_options); + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[max_backup_meta_file_size_]); + int len = 0, buf_size = max_backup_meta_file_size_; + len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n", + sequence_number_); + len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); + for (size_t i = 0; i < files_.size(); ++i) { + len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str()); + } + + s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); + if (s.ok() && sync) { + s = backup_meta_file->Sync(); + } + if (s.ok()) { + s = backup_meta_file->Close(); + } + if (s.ok()) { + s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_); + } + return s; +} + +// --- BackupableDB methods -------- + +BackupableDB::BackupableDB(DB* db, const 
BackupableDBOptions& options) + : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) { + backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); +} + +BackupableDB::~BackupableDB() { + delete backup_engine_; +} + +Status BackupableDB::CreateNewBackup(bool flush_before_backup) { + return backup_engine_->CreateNewBackup(this, flush_before_backup); +} + +void BackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status BackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +// --- RestoreBackupableDB methods ------ + +RestoreBackupableDB::RestoreBackupableDB(Env* db_env, + const BackupableDBOptions& options) + : backup_engine_(new BackupEngine(db_env, options)) {} + +RestoreBackupableDB::~RestoreBackupableDB() { + delete backup_engine_; +} + +void +RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status RestoreBackupableDB::RestoreDBFromBackup(BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir); +} + +Status +RestoreBackupableDB::RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir) { + return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir); +} + +Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc new file mode 100644 index 0000000000..af4af0d02e --- /dev/null +++ 
b/utilities/backupable/backupable_db_test.cc @@ -0,0 +1,668 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "utilities/utility_db.h" +#include "utilities/backupable_db.h" +#include "util/testharness.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/auto_roll_logger.h" + +#include +#include + +namespace rocksdb { + +namespace { + +using std::unique_ptr; + +class DummyDB : public StackableDB { + public: + /* implicit */ + DummyDB(const Options& options, const std::string& dbname) + : StackableDB(nullptr), options_(options), dbname_(dbname), + deletions_enabled_(true), sequence_number_(0) {} + + virtual SequenceNumber GetLatestSequenceNumber() const { + return ++sequence_number_; + } + + virtual const std::string& GetName() const override { + return dbname_; + } + + virtual Env* GetEnv() const override { + return options_.env; + } + + virtual const Options& GetOptions() const override { + return options_; + } + + virtual Status EnableFileDeletions() override { + ASSERT_TRUE(!deletions_enabled_); + deletions_enabled_ = true; + return Status::OK(); + } + + virtual Status DisableFileDeletions() override { + ASSERT_TRUE(deletions_enabled_); + deletions_enabled_ = false; + return Status::OK(); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + ASSERT_TRUE(!deletions_enabled_); + vec = live_files_; + *mfs = 100; + return Status::OK(); + } + + class 
DummyLogFile : public LogFile { + public: + /* implicit */ + DummyLogFile(const std::string& path, bool alive = true) + : path_(path), alive_(alive) {} + + virtual std::string PathName() const override { + return path_; + } + + virtual uint64_t LogNumber() const { + // what business do you have calling this method? + ASSERT_TRUE(false); + return 0; + } + + virtual WalFileType Type() const override { + return alive_ ? kAliveLogFile : kArchivedLogFile; + } + + virtual SequenceNumber StartSequence() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + virtual uint64_t SizeFileBytes() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + private: + std::string path_; + bool alive_; + }; // DummyLogFile + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + ASSERT_TRUE(!deletions_enabled_); + files.resize(wal_files_.size()); + for (size_t i = 0; i < files.size(); ++i) { + files[i].reset( + new DummyLogFile(wal_files_[i].first, wal_files_[i].second)); + } + return Status::OK(); + } + + std::vector live_files_; + // pair + std::vector> wal_files_; + private: + Options options_; + std::string dbname_; + bool deletions_enabled_; + mutable SequenceNumber sequence_number_; +}; // DummyDB + +class TestEnv : public EnvWrapper { + public: + explicit TestEnv(Env* t) : EnvWrapper(t) {} + + class DummySequentialFile : public SequentialFile { + public: + DummySequentialFile() : SequentialFile(), rnd_(5) {} + virtual Status Read(size_t n, Slice* result, char* scratch) { + size_t read_size = (n > size_left) ? size_left : n; + for (size_t i = 0; i < read_size; ++i) { + scratch[i] = rnd_.Next() & 255; + } + *result = Slice(scratch, read_size); + size_left -= read_size; + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + size_left = (n > size_left) ? 
size_left - n : 0; + return Status::OK(); + } + private: + size_t size_left = 200; + Random rnd_; + }; + + Status NewSequentialFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + opened_files_.push_back(f); + if (dummy_sequential_file_) { + r->reset(new TestEnv::DummySequentialFile()); + return Status::OK(); + } else { + return EnvWrapper::NewSequentialFile(f, r, options); + } + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + if (limit_written_files_ <= 0) { + return Status::IOError("Sorry, can't do this"); + } + limit_written_files_--; + return EnvWrapper::NewWritableFile(f, r, options); + } + + void AssertOpenedFiles(std::vector& should_have_opened) { + sort(should_have_opened.begin(), should_have_opened.end()); + sort(opened_files_.begin(), opened_files_.end()); + ASSERT_TRUE(opened_files_ == should_have_opened); + } + + void ClearOpenedFiles() { + opened_files_.clear(); + } + + void SetLimitWrittenFiles(uint64_t limit) { + limit_written_files_ = limit; + } + + void SetDummySequentialFile(bool dummy_sequential_file) { + dummy_sequential_file_ = dummy_sequential_file; + } + + private: + bool dummy_sequential_file_ = false; + std::vector opened_files_; + uint64_t limit_written_files_ = 1000000; +}; // TestEnv + +class FileManager : public EnvWrapper { + public: + explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} + + Status DeleteRandomFileInDir(const std::string dir) { + std::vector children; + GetChildren(dir, &children); + if (children.size() <= 2) { // . and .. + return Status::NotFound(""); + } + while (true) { + int i = rnd_.Next() % children.size(); + if (children[i] != "." 
&& children[i] != "..") { + return DeleteFile(dir + "/" + children[i]); + } + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + uint64_t size; + Status s = GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = NewRandomRWFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { + std::string tmp; + // write one random byte to a random position + s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + } + return s; + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + private: + Random rnd_; +}; // FileManager + +// utility functions +static void FillDB(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + } +} + +static void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +static void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +class BackupableDBTest { + public: + BackupableDBTest() { + // set 
up files + dbname_ = test::TmpDir() + "/backupable_db"; + backupdir_ = test::TmpDir() + "/backupable_db_backup"; + + // set up envs + env_ = Env::Default(); + test_db_env_.reset(new TestEnv(env_)); + test_backup_env_.reset(new TestEnv(env_)); + file_manager_.reset(new FileManager(env_)); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 17; // 128KB + options_.env = test_db_env_.get(); + options_.wal_dir = dbname_; + // set up backup db options + CreateLoggerFromOptions(dbname_, backupdir_, env_, + Options(), &logger_); + backupable_options_.reset(new BackupableDBOptions( + backupdir_, test_backup_env_.get(), logger_.get(), true)); + + // delete old files in db + DestroyDB(dbname_, Options()); + } + + DB* OpenDB() { + DB* db; + ASSERT_OK(DB::Open(options_, dbname_, &db)); + return db; + } + + void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) { + // reset all the defaults + test_backup_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetDummySequentialFile(dummy); + + DB* db; + if (dummy) { + dummy_db_ = new DummyDB(options_, dbname_); + db = dummy_db_; + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + backupable_options_->destroy_old_data = destroy_old_data; + db_.reset(new BackupableDB(db, *backupable_options_)); + } + + void CloseBackupableDB() { + db_.reset(nullptr); + } + + void OpenRestoreDB() { + backupable_options_->destroy_old_data = false; + restore_db_.reset( + new RestoreBackupableDB(test_db_env_.get(), *backupable_options_)); + } + + void CloseRestoreDB() { + restore_db_.reset(nullptr); + } + + // restores backup backup_id and asserts the existence of + // [start_exist, end_exist> and not-existence of + // [end_exist, end> + // + // if backup_id == 0, it means restore from latest + // if end == 0, don't check AssertEmpty + void AssertBackupConsistency(BackupID backup_id, uint32_t 
start_exist, + uint32_t end_exist, uint32_t end = 0) { + bool opened_restore = false; + if (restore_db_.get() == nullptr) { + opened_restore = true; + OpenRestoreDB(); + } + if (backup_id > 0) { + ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_)); + } else { + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); + } + DB* db = OpenDB(); + AssertExists(db, start_exist, end_exist); + if (end != 0) { + AssertEmpty(db, end_exist, end); + } + delete db; + if (opened_restore) { + CloseRestoreDB(); + } + } + + // files + std::string dbname_; + std::string backupdir_; + + // envs + Env* env_; + unique_ptr test_db_env_; + unique_ptr test_backup_env_; + unique_ptr file_manager_; + + // all the dbs! + DummyDB* dummy_db_; // BackupableDB owns dummy_db_ + unique_ptr db_; + unique_ptr restore_db_; + + // options + Options options_; + unique_ptr backupable_options_; + std::shared_ptr logger_; +}; // BackupableDBTest + +void AppendPath(const std::string& path, std::vector& v) { + for (auto& f : v) { + f = path + f; + } +} + +// this will make sure that backup does not copy the same file twice +TEST(BackupableDBTest, NoDoubleCopy) { + OpenBackupableDB(true, true); + + // should write 5 DB files + LATEST_BACKUP + one meta file + test_backup_env_->SetLimitWrittenFiles(7); + test_db_env_->ClearOpenedFiles(); + test_db_env_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + std::vector should_have_openened = dummy_db_->live_files_; + should_have_openened.push_back("/00011.log"); + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + // should write 4 new DB files + LATEST_BACKUP + one meta file + // should not write/copy 00010.sst, since it's already there! 
+ test_backup_env_->SetLimitWrittenFiles(6); + test_db_env_->ClearOpenedFiles(); + dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + // should not open 00010.sst - it's already there + should_have_openened = { "/00015.sst", "/CURRENT", + "/MANIFEST-01", "/00011.log" }; + AppendPath(dbname_, should_have_openened); + test_db_env_->AssertOpenedFiles(should_have_openened); + + ASSERT_OK(db_->DeleteBackup(1)); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(false, + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size; + test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); + ASSERT_EQ(100UL, size); + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); + ASSERT_EQ(200UL, size); + + CloseBackupableDB(); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine +// 3. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +TEST(BackupableDBTest, CorruptionsTest) { + const int keys_iteration = 5000; + Random rnd(6); + Status s; + + OpenBackupableDB(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + + // ---------- case 1. 
- fail a write ----------- + // try creating backup 6, but fail a write + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + test_backup_env_->SetLimitWrittenFiles(2); + // should fail + s = db_->CreateNewBackup(!!(rnd.Next() % 2)); + ASSERT_TRUE(!s.ok()); + test_backup_env_->SetLimitWrittenFiles(1000000); + // latest backup should have all the keys + CloseBackupableDB(); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + + // ---------- case 2. - corrupt/delete latest backup ----------- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2)); + AssertBackupConsistency(0, 0, keys_iteration * 5); + ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP")); + AssertBackupConsistency(0, 0, keys_iteration * 5); + // create backup 6, point LATEST_BACKUP to 5 + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + ASSERT_OK(db_->CreateNewBackup(false)); + CloseBackupableDB(); + ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5")); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + // assert that all 6 data is gone! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false); + + // --------- case 3. 
corrupted backup meta or missing backuped file ---- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3)); + // since 5 meta is now corrupted, latest backup should be 4 + AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + CloseRestoreDB(); + ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4")); + // 4 is corrupted, 3 is the latest backup now + AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + CloseRestoreDB(); + ASSERT_TRUE(!s.ok()); + + // new backup should be 4! + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 3, keys_iteration * 4); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + CloseBackupableDB(); + AssertBackupConsistency(4, 0, keys_iteration * 4, keys_iteration * 5); +} + +// open DB, write, close DB, backup, restore, repeat +TEST(BackupableDBTest, OfflineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + // first iter -- flush before backup + // second iter -- don't flush before backup + for (int iter = 0; iter < 2; ++iter) { + // delete old data + DestroyDB(dbname_, Options()); + bool destroy_data = true; + + // every iteration -- + // 1. insert new data in the DB + // 2. backup the DB + // 3. destroy the db + // 4. 
restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenBackupableDB(destroy_data); + destroy_data = false; + FillDB(db_.get(), keys_iteration * i, fill_up_to); + ASSERT_OK(db_->CreateNewBackup(iter == 0)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; + + // ---- restore the DB ---- + OpenRestoreDB(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(restore_db_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseRestoreDB(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST(BackupableDBTest, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); + // delete old data + DestroyDB(dbname_, Options()); + + OpenBackupableDB(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! 
+ OpenRestoreDB(); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + FillDB(db_.get(), keys_iteration * i, fill_up_to); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + // close and destroy + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; + + // ---- restore every backup and verify all the data is there ---- + OpenRestoreDB(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->PurgeOldBackups(2)); + + std::vector backup_info; + restore_db_->GetBackupInfo(&backup_info); + ASSERT_EQ(2UL, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseRestoreDB(); +} + +TEST(BackupableDBTest, DeleteNewerBackups) { + // create backups 1, 2, 3, 4, 5 + OpenBackupableDB(true); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), 100 * i, 100 * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + // backup 3 is fine + AssertBackupConsistency(3, 0, 300, 500); + // this should delete backups 4 and 5 + OpenBackupableDB(); + CloseBackupableDB(); + // backups 4 and 5 don't exist + OpenRestoreDB(); + Status s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + 
ASSERT_TRUE(s.IsNotFound()); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(s.IsNotFound()); + CloseRestoreDB(); +} + +} // anon namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/merge_operators/.DS_Store b/utilities/merge_operators/.DS_Store deleted file mode 100644 index 5008ddfcf5..0000000000 Binary files a/utilities/merge_operators/.DS_Store and /dev/null differ diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 216dbe84e7..81af64622e 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -41,9 +41,7 @@ std::shared_ptr OpenTtlDb(char delim_char) { Options options; options.create_if_missing = true; options.merge_operator.reset(new StringAppendTESTOperator(delim_char)); - Status s; - db = new DBWithTTL(123456, options, kDbName, s, false); - ASSERT_OK(s); + ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 123456)); return std::shared_ptr(db); } @@ -53,6 +51,7 @@ class StringLists { public: //Constructor: specifies the rocksdb db + /* implicit */ StringLists(std::shared_ptr db) : db_(db), merge_option_(), @@ -75,7 +74,7 @@ class StringLists { // Returns the list of strings associated with key (or "" if does not exist) bool Get(const std::string& key, std::string* const result){ - assert(result != NULL); // we should have a place to store the result + assert(result != nullptr); // we should have a place to store the result auto s = db_->Get(get_option_, key, result); if (s.ok()) { diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index ee4a948b9d..5b704930b2 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -10,40 +10,27 @@ namespace rocksdb { -// Open the db inside DBWithTTL because options needs pointer to its ttl 
-DBWithTTL::DBWithTTL(const int32_t ttl, - const Options& options, - const std::string& dbname, - Status& st, - bool read_only) - : StackableDB(nullptr) { - Options options_to_open = options; - - if (options.compaction_filter) { - ttl_comp_filter_.reset( - new TtlCompactionFilter(ttl, options.compaction_filter)); - options_to_open.compaction_filter = ttl_comp_filter_.get(); +void DBWithTTL::SanitizeOptions(int32_t ttl, Options* options) { + if (options->compaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, options->compaction_filter); } else { - options_to_open.compaction_filter_factory = - std::shared_ptr( - new TtlCompactionFilterFactory( - ttl, options.compaction_filter_factory)); + options->compaction_filter_factory = + std::shared_ptr(new TtlCompactionFilterFactory( + ttl, options->compaction_filter_factory)); } - if (options.merge_operator) { - options_to_open.merge_operator.reset( - new TtlMergeOperator(options.merge_operator)); - } - - if (read_only) { - st = DB::OpenForReadOnly(options_to_open, dbname, &db_); - } else { - st = DB::Open(options_to_open, dbname, &db_); + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator)); } } +// Open the db inside DBWithTTL because options needs pointer to its ttl +DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {} + DBWithTTL::~DBWithTTL() { - delete db_; + delete GetOptions().compaction_filter; } Status UtilityDB::OpenTtlDB( @@ -53,9 +40,19 @@ Status UtilityDB::OpenTtlDB( int32_t ttl, bool read_only) { Status st; - *dbptr = new DBWithTTL(ttl, options, dbname, st, read_only); - if (!st.ok()) { - delete *dbptr; + Options options_to_open = options; + DBWithTTL::SanitizeOptions(ttl, &options_to_open); + DB* db; + + if (read_only) { + st = DB::OpenForReadOnly(options_to_open, dbname, &db); + } else { + st = DB::Open(options_to_open, dbname, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTL(db); + } else { + delete db; } return st; } @@ 
-122,10 +119,8 @@ Status DBWithTTL::StripTS(std::string* str) { return st; } -Status DBWithTTL::Put( - const WriteOptions& opt, - const Slice& key, - const Slice& val) { +Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, + const Slice& val) { WriteBatch batch; batch.Put(key, val); return Write(opt, &batch); @@ -166,10 +161,6 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options, return ret; } -Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) { - return db_->Delete(wopts, key); -} - Status DBWithTTL::Merge(const WriteOptions& opt, const Slice& key, const Slice& value) { @@ -221,86 +212,6 @@ Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { return new TtlIterator(db_->NewIterator(opts)); } -const Snapshot* DBWithTTL::GetSnapshot() { - return db_->GetSnapshot(); -} - -void DBWithTTL::ReleaseSnapshot(const Snapshot* snapshot) { - db_->ReleaseSnapshot(snapshot); -} - -bool DBWithTTL::GetProperty(const Slice& property, std::string* value) { - return db_->GetProperty(property, value); -} - -void DBWithTTL::GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - db_->GetApproximateSizes(r, n, sizes); -} - -void DBWithTTL::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { - db_->CompactRange(begin, end, reduce_level, target_level); -} - -int DBWithTTL::NumberLevels() { - return db_->NumberLevels(); -} - -int DBWithTTL::MaxMemCompactionLevel() { - return db_->MaxMemCompactionLevel(); -} - -int DBWithTTL::Level0StopWriteTrigger() { - return db_->Level0StopWriteTrigger(); -} - -Env* DBWithTTL::GetEnv() const { - return db_->GetEnv(); -} - -const Options& DBWithTTL::GetOptions() const { - return db_->GetOptions(); -} - -Status DBWithTTL::Flush(const FlushOptions& fopts) { - return db_->Flush(fopts); -} - -Status DBWithTTL::DisableFileDeletions() { - return db_->DisableFileDeletions(); -} - -Status DBWithTTL::EnableFileDeletions() { - return db_->EnableFileDeletions(); -} - 
-Status DBWithTTL::GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable) { - return db_->GetLiveFiles(vec, mfs, flush_memtable); -} - -SequenceNumber DBWithTTL::GetLatestSequenceNumber() const { - return db_->GetLatestSequenceNumber(); -} - -Status DBWithTTL::GetSortedWalFiles(VectorLogPtr& files) { - return db_->GetSortedWalFiles(files); -} - -Status DBWithTTL::DeleteFile(std::string name) { - return db_->DeleteFile(name); -} - -Status DBWithTTL::GetDbIdentity(std::string& identity) { - return db_->GetDbIdentity(identity); -} - -Status DBWithTTL::GetUpdatesSince( - SequenceNumber seq_number, - unique_ptr* iter) { - return db_->GetUpdatesSince(seq_number, iter); -} - void DBWithTTL::TEST_Destroy_DBWithTtl() { ((DBImpl*) db_)->TEST_Destroy_DBImpl(); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index c5270764e0..2fdc664e21 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -14,82 +14,33 @@ namespace rocksdb { class DBWithTTL : public StackableDB { public: - DBWithTTL(const int32_t ttl, - const Options& options, - const std::string& dbname, - Status& st, - bool read_only); + static void SanitizeOptions(int32_t ttl, Options* options); + + explicit DBWithTTL(DB* db); virtual ~DBWithTTL(); - virtual Status Put(const WriteOptions& o, - const Slice& key, - const Slice& val); + virtual Status Put(const WriteOptions& o, const Slice& key, + const Slice& val) override; - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override; - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values); + virtual std::vector MultiGet( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override; virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) override; - virtual Status 
Delete(const WriteOptions& wopts, const Slice& key); + virtual Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) override; - virtual Status Merge(const WriteOptions& options, - const Slice& key, - const Slice& value); + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; - - virtual Status Write(const WriteOptions& opts, WriteBatch* updates); - - virtual Iterator* NewIterator(const ReadOptions& opts); - - virtual const Snapshot* GetSnapshot(); - - virtual void ReleaseSnapshot(const Snapshot* snapshot); - - virtual bool GetProperty(const Slice& property, std::string* value); - - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes); - - virtual void CompactRange(const Slice* begin, const Slice* end, - bool reduce_level = false, int target_level = -1); - - virtual int NumberLevels(); - - virtual int MaxMemCompactionLevel(); - - virtual int Level0StopWriteTrigger(); - - virtual Env* GetEnv() const; - - virtual const Options& GetOptions() const; - - virtual Status Flush(const FlushOptions& fopts); - - virtual Status DisableFileDeletions(); - - virtual Status EnableFileDeletions(); - - virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable = true); - - virtual Status GetSortedWalFiles(VectorLogPtr& files); - - virtual Status DeleteFile(std::string name); - - virtual Status GetDbIdentity(std::string& identity); - - virtual SequenceNumber GetLatestSequenceNumber() const; - - virtual Status GetUpdatesSince(SequenceNumber seq_number, - unique_ptr* iter); + virtual Iterator* NewIterator(const ReadOptions& opts) override; // Simulate a db crash, no elegant closing of database. 
void TEST_Destroy_DBWithTtl(); @@ -113,10 +64,6 @@ class DBWithTTL : public StackableDB { static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 - - private: - DB* db_; - unique_ptr ttl_comp_filter_; }; class TtlIterator : public Iterator {