diff --git a/.clang-format b/.clang-format index a1e9a48e40..7c279811ac 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... diff --git a/Makefile b/Makefile index 1412489c2a..b770ca7b07 100644 --- a/Makefile +++ b/Makefile @@ -126,19 +126,22 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. release: $(MAKE) clean OPT="-DNDEBUG -O2" $(MAKE) -j32 + OPT="-DNDEBUG -O2" $(MAKE) all -j32 + OPT="-DNDEBUG -O2" $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean @@ -195,6 +198,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -416,6 +422,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. 
ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871dc..8e83ae497e 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da5..e8c9f090b9 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 0000000000..758135c9f8 --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +# If the clang-format-diff.py command is not specified, we assume we are able +# to access it directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You don't have clang-format-diff.py available on your computer!" + echo "You can download it by running: " + echo " curl https://fburl.com/clang-format-diff" + exit 128 fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try one of the following ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) The following work is not complete, since we still need to +# figure out how to add the files modified by the pre-commit hook to git's commit index. +# +# Check if this script has already been added to the pre-commit hook. +# Will suggest the user add this script to the pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if !
ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +# Check the format of the recently changed lines. +diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertions/deletions from clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detected lines that don't follow the format rules:\r" +# Add color to the diff. Lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure we can read user input under any mode. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index b0c130e3cf..d38b67c3ce 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -50,7 +50,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -68,7 +68,7 @@ make release --num=$NUM \ --writes=$((NUM / 10)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -87,7 +87,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -106,7 +106,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -126,7 +126,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=104857600 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -147,7 +147,7 @@ make release --reads=$((NUM / 5)) \ --writes=512 \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -169,7 +169,7 @@ make release --num=$((NUM / 4)) \ --writes=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -179,6 +179,25 @@ make release --sync=0 \ --threads=1 > /dev/null + +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > 
/dev/null + # measure readrandom after load with filluniquerandom with 6GB block cache ./db_bench \ --benchmarks=readrandom \ @@ -188,7 +207,7 @@ make release --num=$((NUM / 4)) \ --reads=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -200,6 +219,28 @@ make release --sync=0 \ --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom +# measure readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readwhilewriting + # measure memtable performance -- none of the data gets flushed to disk ./db_bench \ --benchmarks=fillrandom,readrandom, \ @@ -208,7 +249,7 @@ make release --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -264,3 +305,4 @@ send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_m send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting diff --git a/db/builder.cc b/db/builder.cc index 53d2f89853..930fc180ae 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -42,7 +42,7 @@ Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression) { + const CompressionType compression) { Status s; meta->file_size = 0; meta->smallest_seqno = meta->largest_seqno = 0; @@ -65,7 +65,7 @@ Status BuildTable(const std::string& dbname, } TableBuilder* builder = GetTableBuilder(options, file.get(), - options.compression); + compression); // the first key is the smallest key Slice key = iter->key(); diff --git a/db/builder.h b/db/builder.h index 8c525bd050..2600dc24b8 100644 --- a/db/builder.h +++ b/db/builder.h @@ -43,6 +43,6 @@ extern Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression); + const CompressionType compression); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 36ee2d486b..68f3613367 100644 --- a/db/c.cc +++ b/db/c.cc @@ -788,6 +788,10 @@ void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 0000000000..703e7aeaeb --- /dev/null +++ b/db/compaction.cc 
@@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector<size_t>(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_ == out_level_, the purpose is to force the compaction filter to + // be applied to that level, and thus this cannot be a trivial move. + return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector<FileMetaData*>& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector<FileMetaData*> v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occurring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. + if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector<FileMetaData*>& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 0000000000..4cc0197da8 --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? 
+ bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // level to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs + + // State used to check for number of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector<FileMetaData*> grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector<size_t> level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether the compaction is producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/db_bench.cc b/db/db_bench.cc index de57326a6e..6636278370 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -95,6 +95,8 @@ DEFINE_string(benchmarks, "\tmergerandom -- same as updaterandom/appendrandom using merge" " operator. " "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" "\tseekrandom -- N random seeks\n" "\tcrc32c -- repeated crc32c of 4K of data\n" "\tacquireload -- load N*1000 times\n" @@ -113,6 +115,11 @@ DEFINE_int64(numdistinct, 1000, "read/write on fewer keys so that gets are more likely to find the" " key and puts are more likely to update the same key"); +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + DEFINE_int64(reads, -1, "Number of read operations to do. " "If negative, do FLAGS_num reads."); @@ -298,6 +305,11 @@ DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" "default value 90 means 90% operations out of all reads and writes" " operations are reads. In other words, 9 gets for every 1 put."); +DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" + " as percentage) for the ReadRandomMergeRandom workload. 
The" + " default value 70 means 70% out of all read and merge operations" + " are merges. In other words, 7 merges for every 3 gets."); + DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletes (used in RandomWithVerify only). RandomWithVerify " "calculates writepercent as (100 - FLAGS_readwritepercent - " @@ -449,6 +461,9 @@ DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync, DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop" " the delete if key not present"); +DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" + " operations on a key in the memtable"); + static bool ValidatePrefixSize(const char* flagname, int32_t value) { if (value < 0 || value>=2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", @@ -788,6 +803,7 @@ class Benchmark { long long reads_; long long writes_; long long readwrites_; + long long merge_keys_; int heap_counter_; char keyFormat_[100]; // will contain the format of key. e.g "%016d" void PrintHeader() { @@ -963,6 +979,7 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), + merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), heap_counter_(0) { std::vector files; FLAGS_env->GetChildren(FLAGS_db, &files); @@ -990,8 +1007,8 @@ class Benchmark { } unique_ptr GenerateKeyFromInt(long long v, const char* suffix = "") { - unique_ptr keyInStr(new char[kMaxKeySize]); - snprintf(keyInStr.get(), kMaxKeySize, keyFormat_, v, suffix); + unique_ptr keyInStr(new char[kMaxKeySize + 1]); + snprintf(keyInStr.get(), kMaxKeySize + 1, keyFormat_, v, suffix); return keyInStr; } @@ -1092,6 +1109,14 @@ class Benchmark { method = &Benchmark::ReadWhileWriting; } else if (name == Slice("readrandomwriterandom")) { method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::ReadRandomMergeRandom; + } } else if (name == Slice("updaterandom")) { method = &Benchmark::UpdateRandom; } else if (name == Slice("appendrandom")) { @@ -1427,6 +1452,7 @@ class Benchmark { FLAGS_merge_operator.c_str()); exit(1); } + options.max_successive_merges = FLAGS_max_successive_merges; // set universal style compaction configurations, if applicable if (FLAGS_universal_size_ratio != 0) { @@ -2381,13 +2407,16 @@ class Benchmark { // // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. void MergeRandom(ThreadState* thread) { RandomGenerator gen; // The number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); while (!duration.Done(1)) { - const long long k = thread->rand.Next() % FLAGS_num; + const long long k = thread->rand.Next() % merge_keys_; unique_ptr key = GenerateKeyFromInt(k); Status s = db_->Merge(write_options_, key.get(), @@ -2406,6 +2435,68 @@ class Benchmark { thread->stats.AddMessage(msg); } + // Read and merge random keys. The amount of reads and merges are controlled + // by adjusting FLAGS_num and FLAGS_mergereadpercent. 
The number of distinct + // keys (and thus also the number of reads and merges on the same key) can be + // adjusted with FLAGS_merge_keys. + // + // As with MergeRandom, the merge operator to use should be defined by + // FLAGS_merge_operator. + void ReadRandomMergeRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long num_hits = 0; + long long num_gets = 0; + long long num_merges = 0; + size_t max_length = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % merge_keys_; + unique_ptr key = GenerateKeyFromInt(k); + + bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; + + if (do_merge) { + Status s = db_->Merge(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + + num_merges++; + + } else { + Status s = db_->Get(options, key.get(), &value); + if (value.length() > max_length) + max_length = value.length(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + num_hits++; + } + + num_gets++; + + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "(reads:%lld merges:%lld total:%lld hits:%lld maxlength:%zu)", + num_gets, num_merges, readwrites_, num_hits, max_length); + thread->stats.AddMessage(msg); + } + + void Compact(ThreadState* thread) { db_->CompactRange(nullptr, nullptr); } diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a3..04d6d0e17a 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index 3e470389bd..57565996ad 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -56,6 +56,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -228,12 +229,34 @@ CompressionType GetCompressionType(const Options& options, int level, } } +CompressionType GetCompressionFlush(const Options& options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. + + bool can_compress; + + if (options.compaction_style == kCompactionStyleUniversal) { + can_compress = + (options.compaction_options_universal.compression_size_percent < 0); + } else { + // For leveled compress when min_level_to_compress == 0. 
+ can_compress = (GetCompressionType(options, 0, true) != kNoCompression); + } + + if (can_compress) { + return options.compression; + } else { + return kNoCompression; + } +} + DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -241,8 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -390,7 +412,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -857,10 +879,8 @@ void DBImpl::PurgeObsoleteWALFiles() { // If externalTable is set, then apply recovered transactions // to that table. This is used for readonly mode. -Status DBImpl::Recover( - VersionEdit* edit, - MemTable* external_table, - bool error_if_log_file_exist) { +Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, + bool error_if_log_file_exist) { mutex_.AssertHeld(); assert(db_lock_ == nullptr); @@ -1028,8 +1048,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1088,7 +1107,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, true); + earliest_seqno_in_memtable, + GetCompressionFlush(options_)); LogFlush(options_.info_log); mutex_.Lock(); } @@ -1149,15 +1169,11 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); - // We skip compression if universal compression is used and the size - // threshold is set for compression. 
- bool enable_compression = (options_.compaction_style - != kCompactionStyleUniversal || - options_.compaction_options_universal.compression_size_percent < 0); + s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, enable_compression); + earliest_seqno_in_memtable, GetCompressionFlush(options_)); LogFlush(options_.info_log); delete iter; Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", @@ -1279,8 +1295,11 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, return s; } -void DBImpl::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { +void DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); @@ -1291,9 +1310,15 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end, } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1305,13 +1330,13 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1356,7 +1381,7 @@ void DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -1590,13 +1615,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1624,11 +1653,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. 
// // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more that one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. @@ -1658,6 +1687,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1803,6 +1841,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(true); @@ -1877,23 +1920,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? "(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? 
"(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1912,13 +1959,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -1958,13 +2003,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. - m->tmp_storage = manual_end; + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -1996,14 +2047,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2147,14 +2198,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? 
- level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2196,14 +2244,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2575,22 +2623,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2658,6 +2705,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, Iterator* internal_iter = NewMergingIterator( env_, &internal_comparator_, memtables.data(), memtables.size() ); + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; @@ -2673,7 +2721,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -2992,12 +3040,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. 
- WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -3005,6 +3049,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into mem_. { mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3056,12 +3110,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SEQUENCE_NUMBER, last_sequence); } StartPerfTimer(&pre_post_process_timer); + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3091,13 +3145,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3130,18 +3185,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write @@ -3195,10 +3242,7 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. 
Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -3210,7 +3254,7 @@ Status DBImpl::MakeRoomForWrite(bool force, { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), + SlowdownAmount(versions_->current()->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, options_.level0_stop_writes_trigger) ); @@ -3245,7 +3289,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files. DelayLoggingAndReset(); @@ -3321,17 +3365,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - new_mem = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + new_mem = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(); } } @@ -3379,6 +3419,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3393,7 +3434,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3408,8 +3449,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3452,8 +3493,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + for (int level = 0; level < current->NumberLevels(); level++) { + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3474,8 +3515,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, 
bytes_read / 1048576.0, @@ -3712,7 +3753,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData* metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(true); { MutexLock l(&mutex_); @@ -3735,7 +3776,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3834,19 +3875,19 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { return s; } impl->mutex_.Lock(); // DBImpl::Recover() requires lock being held - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3860,12 +3901,11 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (options.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. Cannot " "open with universal compaction style."); @@ -3874,6 +3914,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_impl.h b/db/db_impl.h index a5e4d42bcd..c43f720b69 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -91,10 +91,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -124,7 +131,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { @@ -292,7 +299,8 @@ class DBImpl : public DB { // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? */, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); // Force current memtable contents to be flushed. Status FlushMemTable(const FlushOptions& options); @@ -405,7 +413,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? const InternalKey* begin; // nullptr means beginning of key range @@ -590,4 +599,7 @@ extern Options SanitizeOptions(const std::string& db, CompressionType GetCompressionType(const Options& options, int level, const bool enable_compression); +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + } // namespace rocksdb diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dbb297e93a..04033b2fa3 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -86,7 +86,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; Status s = impl->Recover(&edit, impl->GetMemTable(), error_if_log_file_exist); impl->mutex_.Unlock(); diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe38..db86865ca0 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,13 +65,14 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/db_test.cc b/db/db_test.cc index ad55ecb1b0..9a5d128df1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -832,6 +832,156 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); } +TEST(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + Reopen(&options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2) == 0) { + ASSERT_OK(Put(Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopen(&options); + ASSERT_EQ(s.IsInvalidArgument(), true); + ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + 
ASSERT_OK(TryReopen(&options)); +} + +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + +TEST(DBTest, PutDeleteGet) { + do { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); +} + + +TEST(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, GetFromVersions) { + do { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshot) { + do { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} + +TEST(DBTest, GetLevel0Ordering) { + do { + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. 
+ ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetOrderedByLevels) { + do { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetPicksCorrectFile) { + do { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); + } while (ChangeOptions()); +} + TEST(DBTest, GetEncountersEmptyLevel) { do { // Arrange for the following to happen: @@ -3325,34 +3475,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { @@ -3413,7 +3575,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } @@ -4348,6 +4510,70 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = 
t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + namespace { typedef std::map KVMap; } @@ -4892,7 +5118,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4904,7 +5130,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index c556412ea5..430c3589bc 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -35,27 +35,22 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? options.inplace_update_num_locks - : 0), + locks_(options.inplace_update_support ? 
options.inplace_update_num_locks + : 0), prefix_extractor_(options.prefix_extractor) { - - if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, options.memtable_prefix_bloom_probes)); } @@ -67,7 +62,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) @@ -98,12 +93,11 @@ class MemTableIterator: public Iterator { MemTableIterator(const MemTable& mem, const ReadOptions& options) : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) { if (options.prefix) { - iter_ = mem_.table_->GetPrefixIterator(*options.prefix); + iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - dynamic_prefix_seek_ = true; - iter_ = mem_.table_->GetDynamicPrefixIterator(); + iter_.reset(mem_.table_->GetDynamicPrefixIterator()); } else { - iter_ = mem_.table_->GetIterator(); + iter_.reset(mem_.table_->GetIterator()); } } @@ -212,12 +206,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, Slice mem_key = key.memtable_key(); Slice user_key = key.user_key(); - std::shared_ptr iter; + std::unique_ptr iter; if (prefix_bloom_ && !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { // iter is null if prefix bloom says the key does not exist } else { - iter = table_->GetIterator(user_key); + iter.reset(table_->GetIterator(user_key)); iter->Seek(user_key, mem_key.data()); } @@ -328,11 +322,10 @@ void MemTable::Update(SequenceNumber seq, LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); - std::shared_ptr iter( + std::unique_ptr iter( table_->GetIterator(lkey.user_key())); iter->Seek(key, mem_key.data()); - if (iter->Valid()) { // entry format is: // key_length varint32 @@ -462,4 +455,37 @@ bool MemTable::UpdateCallback(SequenceNumber seq, // or key doesn't exist return false; } + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::unique_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(key.user_key(), memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Compare( + Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + if (static_cast(tag & 0xff) != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + } // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h index c94bf0b0b3..5e7eeb4a18 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -35,11 +35,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. 
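That contract is why call sites can write delete mem->Unref(); — as the repair.cc hunk further down does — which only makes sense if Unref() hands back the memtable once the last reference is gone and nullptr otherwise (deleting nullptr is a no-op). A small sketch of the idiom with the new Options-only constructor; the surrounding setup is illustrative, not taken from the patch:

// refs_ starts at zero, so the creator must take the first reference.
MemTable* mem = new MemTable(internal_comparator, options);
mem->Ref();
// ... add entries, build an iterator, flush, etc. ...
delete mem->Unref();   // assumed to return the memtable when refs_ hits zero,
                       // nullptr otherwise, so the unconditional delete is safe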
- explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); @@ -123,6 +120,11 @@ class MemTable { const Slice& delta, const Options& options); + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -157,7 +159,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/merge_test.cc b/db/merge_test.cc index 0c14aff2ce..887d8ad425 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -14,6 +14,7 @@ #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" +#include "db/write_batch_internal.h" #include "utilities/merge_operators.h" #include "util/testharness.h" #include "utilities/utility_db.h" @@ -21,13 +22,52 @@ using namespace std; using namespace rocksdb; +namespace { + int numMergeOperatorCalls; -std::shared_ptr OpenDb(const string& dbname, const bool ttl = false) { + void resetNumMergeOperatorCalls() { + numMergeOperatorCalls = 0; + } +} + +class CountMergeOperator : public AssociativeMergeOperator { + public: + CountMergeOperator() { + mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); + } + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + ++numMergeOperatorCalls; + return mergeOperator_->PartialMerge( + key, + *existing_value, + value, + new_value, + logger); + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + std::shared_ptr mergeOperator_; +}; + +std::shared_ptr OpenDb( + const string& dbname, + const bool ttl = false, + const unsigned max_successive_merges = 0) { DB* db; StackableDB* sdb; Options options; options.create_if_missing = true; - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.merge_operator = std::make_shared(); + options.max_successive_merges = max_successive_merges; Status s; DestroyDB(dbname, Options()); if (ttl) { @@ -243,6 +283,67 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, sizeof(merge_value)); + + // Create the batch + WriteBatch batch; + for 
(int i = 0; i < num_merges; ++i) { + batch.Merge(key, merge_value_slice); + } + + // Apply to memtable and count the number of merges + resetNumMergeOperatorCalls(); + { + Status s = db->Write(WriteOptions(), &batch); + assert(s.ok()); + } + assert(numMergeOperatorCalls == + num_merges - (num_merges % (max_num_merges + 1))); + + // Get the value + resetNumMergeOperatorCalls(); + string get_value_str; + { + Status s = db->Get(ReadOptions(), key, &get_value_str); + assert(s.ok()); + } + assert(get_value_str.size() == sizeof(uint64_t)); + uint64_t get_value = DecodeFixed64(&get_value_str[0]); + ASSERT_EQ(get_value, num_merges * merge_value); + ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); +} + void runTest(int argc, const string& dbname, const bool use_ttl = false) { auto db = OpenDb(dbname, use_ttl); @@ -265,6 +366,19 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } DestroyDB(dbname, Options()); + db.reset(); + + { + cout << "Test merge in memtable... \n"; + unsigned maxMerge = 5; + auto db = OpenDb(dbname, use_ttl, maxMerge); + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + testSuccessiveMerge(counters, maxMerge, maxMerge * 2); + testSingleBatchSuccessiveMerge(db.get(), 5, 7); + DestroyDB(dbname, Options()); + } + } int main(int argc, char *argv[]) { diff --git a/db/repair.cc b/db/repair.cc index 802c04fc4f..f72bca6b73 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -58,7 +58,7 @@ class Repairer { next_file_number_(1) { // TableCache can be small since we expect each table to be opened once. table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); - edit_ = new VersionEdit(options.num_levels); + edit_ = new VersionEdit(); } ~Repairer() { @@ -196,8 +196,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), - options_.num_levels); + MemTable* mem = new MemTable(icmp_, options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { @@ -225,7 +224,8 @@ class Repairer { Iterator* iter = mem->NewIterator(); status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, iter, &meta, - icmp_.user_comparator(), 0, 0, true); + icmp_.user_comparator(), 0, 0, + kNoCompression); delete iter; delete mem->Unref(); mem = nullptr; diff --git a/db/version_edit.cc b/db/version_edit.cc index 1a6a119564..6645998852 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -33,6 +33,7 @@ enum Tag { void VersionEdit::Clear() { comparator_.clear(); + max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; @@ -105,14 +106,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { uint32_t v; - if (GetVarint32(input, &v) && - (int)v < number_levels_) { + if (GetVarint32(input, &v)) { *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } return true; } else { - if ((int)v >= number_levels_) { - *msg = "db already has more levels than options.num_levels"; - } return false; } } diff --git a/db/version_edit.h b/db/version_edit.h index 7751ad92ef..ddba78bf26 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -41,10 +41,7 @@ struct FileMetaData { class VersionEdit { public: - explicit VersionEdit(int number_levels) : - number_levels_(number_levels) { - Clear(); - } + VersionEdit() { Clear(); } ~VersionEdit() { } void Clear(); @@ -115,7 +112,7 @@ class VersionEdit { 
bool GetLevel(Slice* input, int* level, const char** msg); - int number_levels_; + int max_level_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -127,9 +124,9 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector> compact_pointers_; + std::vector > compact_pointers_; DeletedFileSet deleted_files_; - std::vector> new_files_; + std::vector > new_files_; }; } // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 4a00822f79..63aa32e8f6 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(7); + VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(7); + VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, diff --git a/db/version_set.cc b/db/version_set.cc index eb58e27804..f86de6ae71 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -45,7 +46,7 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs > 0); @@ -269,7 +270,7 @@ void Version::AddIterators(const ReadOptions& options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -408,17 +409,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - offset_manifest_file_(0), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -457,7 +460,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). 
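Put differently: because newer data for a key always sits at a smaller level than older data for the same key, the first definitive answer wins, and only merge operands force the lookup to keep descending until a base value (or the bottom level) is found. A compressed sketch of that control flow — LookupInLevel, NotFound and ApplyMerges are hypothetical stand-ins for the real Saver/MergeContext plumbing:

// Levels are scanned from newest data (level 0) toward oldest.
for (int level = 0; level < num_levels; level++) {
  switch (LookupInLevel(level, key, &value, &merge_operands)) {
    case kFoundValue:      return value;       // newer than anything below
    case kFoundDeletion:   return NotFound();  // ditto: the key was deleted
    case kMergeInProgress: break;              // collect operands, keep going
    case kNotPresent:      break;              // nothing at this level
  }
}
// Ran out of levels: either the key never existed, or only merges were seen.
return merge_operands.empty() ? NotFound() : ApplyMerges(merge_operands);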
- for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -590,6 +593,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. 
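To make the scoring above concrete, a small worked example (all option values and sizes are made up for illustration):

// Assume: level0_file_num_compaction_trigger = 4
//         level0_slowdown_writes_trigger     = 8
//         level0_stop_writes_trigger         = 12
//         MaxBytesForLevel(1)                = 256 MB
//
//   L0, 3 files not being compacted    -> score = 3 / 4        = 0.75
//   L0, 9 files                        -> score = 10000  (slowdown band)
//   L1, 320 MB outside of compactions  -> score = 320 / 256    = 1.25
//
// After the sort below, the highest-scoring level is compacted first; any
// score >= 1 means that level is over its target.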
+ for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -627,7 +783,7 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } @@ -860,9 +1016,70 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { 
+ int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 'd'] @@ -931,20 +1148,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -973,7 +1188,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -991,14 +1206,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < edit->number_levels_; l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1011,7 +1224,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < edit->number_levels_; l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = 
levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1092,7 +1305,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. const auto& base_files = base_->files_[level]; @@ -1151,8 +1364,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1169,11 +1381,12 @@ VersionSet::VersionSet(const std::string& dbname, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), + need_slowdown_for_num_level0_files_(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), - last_observed_manifest_size_(0), + manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { + storage_options_compactions_(storage_options_) { compact_pointer_ = new std::string[options_->num_levels]; Init(options_->num_levels); AppendVersion(new Version(this, current_version_number_++)); @@ -1220,6 +1433,9 @@ void VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1229,10 +1445,8 @@ void VersionSet::AppendVersion(Version* v) { v->next_->prev_ = v; } -Status VersionSet::LogAndApply( - VersionEdit* edit, - port::Mutex* mu, - bool new_descriptor_log) { +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, + bool new_descriptor_log) { mu->AssertHeld(); // queue our request @@ -1270,7 +1484,7 @@ Status VersionSet::LogAndApply( // No need to perform this check if a new Manifest is being created anyways. if (!descriptor_log_ || - last_observed_manifest_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > options_->max_manifest_file_size) { new_descriptor_log = true; manifest_file_number_ = NewFileNumber(); // Change manifest file no. } @@ -1284,7 +1498,7 @@ Status VersionSet::LogAndApply( // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1310,8 +1524,8 @@ Status VersionSet::LogAndApply( // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
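The LogAndApply code around these hunks handles that with the usual drop-the-lock pattern: the expensive work runs on a Version that has not been installed yet, so no other thread can observe it, and the mutex is re-acquired only to publish the result. A generic, self-contained sketch of the pattern (std::mutex standing in for port::Mutex, a sorted vector standing in for the new Version):

#include <algorithm>
#include <mutex>
#include <vector>

// Build the new object under the lock, do the cpu-heavy work unlocked while
// the object is still private, then re-lock only to publish it.
std::vector<int>* BuildAndPublish(std::mutex& mu, std::vector<int>*& shared) {
  std::unique_lock<std::mutex> lock(mu);
  auto* v = new std::vector<int>(*shared);  // snapshot taken under the lock

  lock.unlock();
  std::sort(v->begin(), v->end());          // expensive step, lock released
  lock.lock();

  std::swap(shared, v);                     // publish under the lock
  return v;                                 // old object, for the caller to free
}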
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1367,14 +1581,11 @@ Status VersionSet::LogAndApply( LogFlush(options_->info_log); mu->Lock(); - // cache the manifest_file_size so that it can be used to rollover in the - // next call to LogAndApply - last_observed_manifest_size_ = new_manifest_file_size; } // Install the new version if (s.ok()) { - v->offset_manifest_file_ = new_manifest_file_size; + manifest_file_size_ = new_manifest_file_size; AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; @@ -1408,7 +1619,7 @@ Status VersionSet::LogAndApply( } void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1486,21 +1697,28 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } + if (!s.ok()) { + break; } - if (s.ok()) { - builder.Apply(&edit); + if (edit.max_level_ >= current_->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + break; + } + + builder.Apply(&edit); + if (edit.has_log_number_) { log_number = edit.log_number_; have_log_number = true; @@ -1545,11 +1763,11 @@ Status VersionSet::Recover() { builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); - v->offset_manifest_file_ = manifest_file_size; + manifest_file_size_ = manifest_file_size; AppendVersion(v); manifest_file_number_ = next_file; next_file_number_ = next_file + 1; @@ -1573,7 +1791,7 @@ Status VersionSet::Recover() { } Status VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex) { + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1607,7 +1825,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if (edit.has_comparator_ && @@ -1677,9 +1895,9 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(v); manifest_file_number_ = next_file; @@ -1707,162 +1925,11 @@ void 
VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? - NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; - } - } - - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); - } - } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? - NumberLevels() : NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? 
// Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); // Save compaction pointers @@ -1875,7 +1942,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } // Save files - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current_->NumberLevels(); level++) { const auto& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { const auto f = files[i]; @@ -1889,61 +1956,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return current_->files_[level].size(); -} - -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(current_->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary( - FileSummaryStorage* scratch, int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. 
bool VersionSet::ManifestContains(const std::string& record) const { @@ -1975,7 +1987,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -2011,7 +2023,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -2022,7 +2034,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2030,40 +2042,6 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - Version* v = current_; - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - assert(current_); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, - &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. // REQUIRES: inputs is not empty @@ -2223,7 +2201,7 @@ void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { // The total size of files that are currently being compacted // at at every level upto the penultimate level. void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { + for (int level = 0; level < NumberLevels() - 1; level++) { uint64_t total = 0; for (std::set::iterator it = compactions_in_progress_[level].begin(); @@ -2246,8 +2224,8 @@ void VersionSet::SizeBeingCompacted(std::vector& sizes) { // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* VersionSet::PickCompactionUniversalSizeAmp( - int level, double score) { +Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, + double score) { assert (level == 0); // percentage flexibilty while reducing size amplification @@ -2329,13 +2307,13 @@ Compaction* VersionSet::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. 
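Size amplification in this picker is, roughly, how much bigger the newer files are than the single oldest sorted run they will eventually be merged into. A back-of-the-envelope sketch of the trigger — the option name follows CompactionOptionsUniversal::max_size_amplification_percent, but the exact bookkeeping (which files count as candidates, how in-progress compactions are skipped) is simplified here:

#include <cstdint>
#include <vector>

// sizes_newest_first: file sizes ordered newest to oldest. Compact everything
// once the newer files together exceed the configured percentage of the
// oldest file's size (e.g. 200 means "allow up to 2x before a full merge").
bool ShouldCompactForSizeAmp(const std::vector<uint64_t>& sizes_newest_first,
                             unsigned max_size_amplification_percent) {
  if (sizes_newest_first.size() < 2) return false;
  uint64_t oldest = sizes_newest_first.back();
  uint64_t candidate = 0;
  for (size_t i = 0; i + 1 < sizes_newest_first.size(); i++) {
    candidate += sizes_newest_first[i];
  }
  return candidate * 100 >=
         static_cast<uint64_t>(max_size_amplification_percent) * oldest;
}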
- Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - true); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); c->score_ = score; for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { int index = file_by_time[loop]; - f = current_->files_[level][index]; + f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: size amp picking file %lu[%d] with size %lu", @@ -2459,14 +2437,14 @@ Compaction* VersionSet::PickCompactionUniversalReadAmp( } } } - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - enable_compression); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { int index = file_by_time[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", (unsigned long)f->number, @@ -2488,10 +2466,10 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { Log(options_->info_log, "Universal: nothing to do\n"); return nullptr; } - VersionSet::FileSummaryStorage tmp; + Version::FileSummaryStorage tmp; Log(options_->info_log, "Universal: candidate files(%lu): %s\n", current_->files_[level].size(), - LevelFileSummary(&tmp, 0)); + current_->LevelFileSummary(&tmp, 0)); // Check for size amplification first. Compaction* c = PickCompactionUniversalSizeAmp(level, score); @@ -2528,11 +2506,11 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { } // The files are sorted from newest first to oldest last. - std::vector& file_by_time = current_->files_by_size_[level]; + std::vector& file_by_time = c->input_version_->files_by_size_[level]; // Is the earliest file part of this compaction? int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = current_->files_[level][last_index]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { c->bottommost_level_ = true; } @@ -2543,9 +2521,6 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { c->inputs_[0].size()); } - c->input_version_ = current_; - c->input_version_->Ref(); - // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -2554,7 +2529,8 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. 
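The pattern repeated through these hunks — passing current_ straight into the Compaction constructor and deleting the manual input_version_ = current_; input_version_->Ref(); pairs at the call sites — only works if the constructor itself pins the version. A sketch of the assumed constructor shape, inferred from the call sites above; the member names are illustrative guesses:

Compaction::Compaction(Version* input_version, int level, int out_level,
                       uint64_t target_file_size,
                       uint64_t max_grandparent_overlap_bytes,
                       bool seek_compaction /* = false */,
                       bool enable_compression /* = true */)
    : level_(level),
      out_level_(out_level),
      max_output_file_size_(target_file_size),
      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
      input_version_(input_version),
      seek_compaction_(seek_compaction),
      enable_compression_(enable_compression) {
  // Taking the reference here means no call site can forget it; it is
  // presumably dropped again in ReleaseInputs() or the destructor.
  input_version_->Ref();
}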
- c->is_full_compaction_ = (c->inputs_[0].size() == current_->files_[0].size()); + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); return c; } @@ -2571,27 +2547,28 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels()); + assert(level + 1 < current_->NumberLevels()); + c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); c->score_ = score; // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = current_->files_by_size_[level]; + std::vector& file_size = c->input_version_->files_by_size_[level]; // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = current_->next_file_to_compact_by_size_[level]; + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; // check to verify files are arranged in descending size assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= current_->files_[level][file_size[i+1]]->file_size)); + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. @@ -2627,7 +2604,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } // store where to start the iteration in the next call to PickCompaction - current_->next_file_to_compact_by_size_[level] = nextIndex; + c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; return c; } @@ -2640,7 +2617,7 @@ Compaction* VersionSet::PickCompaction() { // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels()-1); current_->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(current_, size_being_compacted); + current_->Finalize(size_being_compacted); // In universal style of compaction, compact L0 files back into L0. if (options_->compaction_style == kCompactionStyleUniversal) { @@ -2678,11 +2655,12 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels(), true); + c = new Compaction(current_, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; - current_->file_to_compact_ = nullptr; + c->input_version_->file_to_compact_ = nullptr; ExpandWhileOverlapping(c); } } @@ -2692,9 +2670,6 @@ Compaction* VersionSet::PickCompaction() { return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); - // Two level 0 compaction won't run at the same time, so don't need to worry // about files on level 0 being compacted. if (level == 0) { @@ -2705,7 +2680,8 @@ Compaction* VersionSet::PickCompaction() { // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. 
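PickCompactionBySize above walks files_by_size_ (file indices ordered by decreasing size) starting from a per-level cursor, skips files that are already being compacted, and records where the next call should resume. A simplified, self-contained sketch of just that selection loop (FileInfo and PickLargestNotCompacting are illustrative names; the real code additionally checks parent-range compactions and only guarantees the first number_of_files_to_sort_ entries are ordered):

```cpp
#include <cstdint>
#include <vector>

struct FileInfo {
  uint64_t file_size;
  bool being_compacted;
};

// by_size holds indices into `files`, sorted by decreasing file_size.
// *cursor is where the previous call stopped, so repeated calls do not
// rescan files that were already found busy.
int PickLargestNotCompacting(const std::vector<FileInfo>& files,
                             const std::vector<int>& by_size,
                             size_t* cursor) {
  for (size_t i = *cursor; i < by_size.size(); ++i) {
    const FileInfo& f = files[by_size[i]];
    if (f.being_compacted) {
      continue;          // another compaction already claimed this file
    }
    *cursor = i + 1;     // resume after this file next time
    return by_size[i];   // index of the chosen file
  }
  *cursor = 0;           // wrapped around; start from the largest again
  return -1;             // nothing available right now
}
```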
c->inputs_[0].clear(); - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a @@ -2736,11 +2712,13 @@ Compaction* VersionSet::PickCompaction() { // Returns true if any one of the parent files are being compacted bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { + const InternalKey* largest, int level, + int* parent_index) { std::vector inputs; + assert(level + 1 < current_->NumberLevels()); - current_->GetOverlappingInputs(level+1, smallest, largest, - &inputs, *parent_index, parent_index); + current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -2788,8 +2766,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0], &smallest, &largest); c->inputs_[0].clear(); - current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0], - hint_index, &hint_index); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -2799,7 +2777,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. int parent_index = -1; if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { + (c->level() != c->output_level() && + ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); delete c; @@ -2813,7 +2792,9 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // user-key with another file. void VersionSet::SetupOtherInputs(Compaction* c) { // If inputs are empty, then there is nothing to expand. - if (c->inputs_[0].empty()) { + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { return; } @@ -2824,8 +2805,9 @@ void VersionSet::SetupOtherInputs(Compaction* c) { GetRange(c->inputs_[0], &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1], - c->parent_index_, &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -2838,8 +2820,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // can happen when one user key spans multiple files. 
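ExpandWhileOverlapping above keeps re-deriving the key range of inputs_[0] and re-querying the level for overlapping files until the set stops growing, so no file sharing a boundary user key is left behind. A standalone sketch of that fixed-point loop over plain integer intervals (Interval and ExpandToClosure are illustrative names):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct Interval {
  int smallest;
  int largest;
};

static bool Overlaps(const Interval& f, int lo, int hi) {
  return f.largest >= lo && f.smallest <= hi;
}

// Repeatedly widen [lo, hi] to the union of all picked files and reselect
// every file in the level that overlaps it, until no new file is pulled in.
std::vector<Interval> ExpandToClosure(const std::vector<Interval>& level,
                                      std::vector<Interval> picked) {
  size_t old_size;
  do {
    old_size = picked.size();
    int lo = picked.front().smallest, hi = picked.front().largest;
    for (const Interval& f : picked) {
      lo = std::min(lo, f.smallest);
      hi = std::max(hi, f.largest);
    }
    picked.clear();
    for (const Interval& f : level) {
      if (Overlaps(f, lo, hi)) picked.push_back(f);
    }
  } while (picked.size() > old_size);
  return picked;
}

int main() {
  // 10-20 and 18-30 overlap directly; 29-40 is only picked once the range grows.
  std::vector<Interval> level = {{10, 20}, {18, 30}, {29, 40}, {50, 60}};
  auto out = ExpandToClosure(level, {{10, 20}});
  std::printf("picked %zu files\n", out.size());  // picked 3 files
  return 0;
}
```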
if (!c->inputs_[1].empty()) { std::vector expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, - c->base_index_, nullptr); + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); const uint64_t expanded0_size = TotalFileSize(expanded0); @@ -2847,13 +2829,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !current_->HasOverlappingUserKey(&expanded0, level)) { + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(options_->info_log, @@ -2880,8 +2862,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { @@ -2899,10 +2881,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { c->edit_->SetCompactPointer(level, largest); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData **meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2916,8 +2896,7 @@ Status VersionSet::GetMetadataForFile( return Status::NotFound("File not present in any level"); } -void VersionSet::GetLiveFilesMetaData( - std::vector * metadata) { +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2941,11 +2920,13 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { obsolete_files_.clear(); } -Compaction* VersionSet::CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end) { +Compaction* VersionSet::CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { std::vector inputs; + bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. 
@@ -2953,7 +2934,7 @@ Compaction* VersionSet::CompactRange( begin = nullptr; end = nullptr; } - current_->GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -2962,24 +2943,24 @@ Compaction* VersionSet::CompactRange( // But we cannot do this for level-0 since level-0 files can overlap // and we must not pick one file and drop another older file if the // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->file_size; total += s; if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; inputs.resize(i + 1); break; } } } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), NumberLevels()); + Compaction* c = new Compaction(current_, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2988,10 +2969,12 @@ Compaction* VersionSet::CompactRange( return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); SetupOtherInputs(c); + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + // These files that are to be manaully compacted do not trample // upon other files because manual compactions are processed when // the system has a max of 1 background compaction thread. @@ -3002,191 +2985,4 @@ Compaction* VersionSet::CompactRange( return c; } -Compaction::Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction, bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(nullptr), - number_levels_(number_levels), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { - edit_ = new VersionEdit(number_levels_); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. 
- return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. 
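The ShouldStopBefore logic being moved out above caps how much grandparent data a single output file may overlap: as output keys advance past grandparent files, their sizes accumulate, and once the accumulated overlap exceeds maxGrandParentOverlapBytes_ a new output file is started. A simplified, self-contained sketch of that heuristic using integer keys (OutputSplitter and GrandparentFile are illustrative names):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct GrandparentFile {
  int largest_key;
  uint64_t file_size;
};

class OutputSplitter {
 public:
  OutputSplitter(std::vector<GrandparentFile> grandparents, uint64_t max_overlap)
      : grandparents_(std::move(grandparents)), max_overlap_bytes_(max_overlap) {}

  // Returns true if the current output file should be closed before writing
  // `key`, i.e. it already overlaps too many grandparent bytes.
  bool ShouldStopBefore(int key) {
    while (index_ < grandparents_.size() &&
           key > grandparents_[index_].largest_key) {
      if (seen_key_) {
        overlapped_bytes_ += grandparents_[index_].file_size;
      }
      ++index_;
    }
    seen_key_ = true;
    if (overlapped_bytes_ > max_overlap_bytes_) {
      overlapped_bytes_ = 0;  // start accounting for the next output file
      return true;
    }
    return false;
  }

 private:
  std::vector<GrandparentFile> grandparents_;  // sorted by largest_key
  uint64_t max_overlap_bytes_;
  size_t index_ = 0;
  bool seen_key_ = false;
  uint64_t overlapped_bytes_ = 0;
};

int main() {
  OutputSplitter splitter({{10, 500}, {20, 500}, {30, 500}}, /*max_overlap=*/800);
  for (int key : {5, 15, 25, 35}) {
    std::printf("key %d -> %s\n", key,
                splitter.ShouldStopBefore(key) ? "split" : "keep");
  }
  return 0;
}
```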
- if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, - char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, "[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index d5dc2cb13c..e059fc3afb 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,6 +27,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" namespace rocksdb { @@ -86,6 +87,11 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // Also pre-sorts level0 files for Get() + void Finalize(std::vector& size_being_compacted); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -137,15 +143,45 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. 
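InputSummary and Summary above build a one-line description into a fixed-size, caller-provided buffer, advancing a write offset and bailing out as soon as snprintf reports truncation. The same pattern in isolation (AppendFileList is an illustrative name):

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Append "number(size) " entries into buf of capacity len, stopping cleanly
// when the buffer is full: snprintf returns the number of characters it
// would have written, so ret >= remaining means the output was truncated.
void AppendFileList(
    const std::vector<std::pair<unsigned long, unsigned long>>& files,
    char* buf, int len) {
  int write = 0;
  for (const auto& f : files) {
    int remaining = len - write;
    int ret = std::snprintf(buf + write, remaining, "%lu(%lu) ",
                            f.first, f.second);
    if (ret < 0 || ret >= remaining) break;
    write += ret;
  }
}

int main() {
  char buf[32] = "";
  AppendFileList({{7, 2048}, {9, 4096}, {12, 1024}}, buf, sizeof(buf));
  std::printf("%s\n", buf);  // "7(2048) 9(4096) 12(1024) "
  return 0;
}
```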
+ void AddLiveFiles(std::set* live); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } + + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; private: friend class Compaction; @@ -159,10 +195,15 @@ class Version { bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, const Slice& internal_prefix, Iterator* level_iter) const; + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -199,9 +240,6 @@ class Version { double max_compaction_score_; // max score in l1 to ln-1 int max_compaction_score_level_; // level on which max score occurs - // The offset in the manifest file where this version is stored. - uint64_t offset_manifest_file_; - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; @@ -223,10 +261,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -236,7 +272,7 @@ class VersionSet { // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log = false); + bool new_descriptor_log = false); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -252,6 +288,12 @@ class VersionSet { // Return the current version. Version* current() const { return current_; } + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -267,12 +309,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -306,14 +342,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. 
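The new Fsize struct and UpdateFilesBySize above maintain files_by_size_: per level, file indices ordered so the largest files come first, which is exactly what PickCompactionBySize walks. A minimal sketch of building such an index (names are illustrative; the real code only guarantees ordering for the first number_of_files_to_sort_ entries, while this sketch sorts everything for simplicity):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct FileMeta {
  uint64_t file_size;
};

// Returns indices into `files`, largest file first, without moving the files
// themselves (sorting indices keeps the per-level file vector untouched).
std::vector<int> IndicesBySizeDesc(const std::vector<FileMeta>& files) {
  std::vector<int> by_size(files.size());
  for (size_t i = 0; i < files.size(); ++i) by_size[i] = static_cast<int>(i);
  std::sort(by_size.begin(), by_size.end(), [&files](int a, int b) {
    return files[a].file_size > files[b].file_size;
  });
  return by_size;
}

int main() {
  std::vector<FileMeta> files = {{100}, {4000}, {250}};
  for (int idx : IndicesBySizeDesc(files)) {
    std::printf("file %d (%llu bytes)\n", idx,
                (unsigned long long)files[idx].file_size);
  }
  return 0;
}
```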
- int64_t MaxNextLevelOverlappingBytes(); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -358,37 +398,16 @@ class VersionSet { // Add all files listed in any live version to *live. void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. - const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; - // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; } + uint64_t ManifestFileSize() const { return manifest_file_size_; } // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. @@ -415,16 +434,6 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - // used to sort files by size - typedef struct fsize { - int index; - FileMetaData* file; - } Fsize; - - // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(Version *v); - // Get the max file size in a given level. uint64_t MaxFileSizeForLevel(int level); @@ -447,8 +456,6 @@ class VersionSet { void Init(int num_levels); - void Finalize(Version* v, std::vector&); - void GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest); @@ -491,6 +498,10 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_ + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. 
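Per the new CompactRange contract documented above, a picked compaction may cover only a prefix of the requested range: the caller owns the InternalKey storage behind *compaction_end and re-issues the call from there until it is set to nullptr. The following is a hedged sketch of what such a driver loop could look like, with heavily simplified stand-in types; PickRangeChunk and CompactWholeRange are invented names, and this is not the actual DBImpl code:

```cpp
#include <string>

// Stand-ins for InternalKey and Compaction, just enough to show the loop shape.
using InternalKey = std::string;
struct Compaction { /* ... */ };

// Hypothetical picker with the same parameter shape as the new CompactRange:
// when the chunk stops short, *compaction_end points at the next key to resume
// from; when the chunk reaches the end of the range, it is set to nullptr.
Compaction* PickRangeChunk(const InternalKey* begin, const InternalKey* end,
                           InternalKey** compaction_end);

void CompactWholeRange(const InternalKey* begin, const InternalKey* end) {
  InternalKey resume_storage;                  // caller-owned key storage
  InternalKey* compaction_end = &resume_storage;  // must point at a valid key
  const InternalKey* current_begin = begin;
  while (true) {
    Compaction* c = PickRangeChunk(current_begin, end, &compaction_end);
    if (c == nullptr) break;                   // nothing (left) to compact
    // ... run the compaction, then release it ...
    delete c;
    if (compaction_end == nullptr) break;      // the chunk reached `end`
    current_begin = compaction_end;            // resume where the chunk stopped
  }
}
```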
std::string* compact_pointer_; @@ -510,9 +521,8 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // Store the manifest file size when it is checked. - // Save us the cost of checking file size twice in LogAndApply - uint64_t last_observed_manifest_size_; + // Current size of manifest file + uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -542,118 +552,4 @@ class VersionSet { VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? 
- bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - explicit Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction = false, bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index d13a4aed91..2ca689809d 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -25,7 +25,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { } Version* current_version = current_; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -36,7 +36,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -65,6 +65,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; delete[] compact_pointer_; delete[] max_file_size_; @@ -72,8 +73,8 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { num_levels_ = new_levels; compact_pointer_ = new std::string[new_levels]; Init(new_levels); - VersionEdit ve(new_levels); - st = LogAndApply(&ve , mu, true); + VersionEdit ve; + 
st = LogAndApply(&ve, mu, true); return st; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 4f8392d843..6a427f1a63 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -21,6 +21,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/memtable.h" @@ -234,7 +235,62 @@ class MemTableInserter : public WriteBatch::Handler { } virtual void Merge(const Slice& key, const Slice& value) { - mem_->Add(sequence_, kTypeMerge, key, value); + bool perform_merge = false; + + if (options_->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options_->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + db_->Get(read_options, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options_->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, + &get_value_slice, + operands, + &new_value, + options_->info_log.get())) { + // Failed to merge! + RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem_->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem_->Add(sequence_, kTypeMerge, key, value); + } + sequence_++; } virtual void Delete(const Slice& key) { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index ff9aa63eec..931d8f3f59 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a3b18084a8..bd22e191b5 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -311,6 +311,7 @@ extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); extern rocksdb_env_t* rocksdb_create_default_env(); extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern void rocksdb_env_destroy(rocksdb_env_t*); /* Universal Compaction options */ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index dd17d9e9b5..4bf095756c 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -199,6 +199,7 @@ class DB { uint64_t* sizes) = 0; // Compact the underlying storage for the key range [*begin,*end]. 
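The MemTableInserter::Merge change above caps chains of merge operands: once max_successive_merges operands have piled up at the head of a key, it reads the current value, applies the merge operator, and inserts a plain value instead of yet another merge record. A self-contained sketch of that folding idea using a toy additive merge (ToyTable, AddMerge and the in-memory maps are all illustrative; the real code goes through Get, FullMerge and the memtable, and falls back to storing the operand if the merge fails):

```cpp
#include <cstdio>
#include <deque>
#include <map>
#include <string>

// Toy model: each key has a base value plus a queue of pending "+N" merge
// operands (newest first), mimicking unfolded merge records in a memtable.
struct ToyTable {
  std::map<std::string, long> base;
  std::map<std::string, std::deque<long>> pending_merges;
};

// Insert one merge operand. Once the chain has reached max_successive_merges,
// fold everything into a single stored value instead of growing the chain.
void AddMerge(ToyTable* t, const std::string& key, long operand,
              size_t max_successive_merges) {
  auto& pending = t->pending_merges[key];
  if (max_successive_merges == 0 || pending.size() < max_successive_merges) {
    pending.push_front(operand);   // cheap path: just record the operand
    return;
  }
  long value = t->base[key];       // 1) read the existing value
  for (long op : pending) {        // 2) apply the pending operands plus this one
    value += op;
  }
  value += operand;
  t->base[key] = value;            // 3) store a plain value, drop the chain
  pending.clear();
}

int main() {
  ToyTable t;
  for (int i = 0; i < 5; ++i) {
    AddMerge(&t, "counter", 1, /*max_successive_merges=*/3);
  }
  // Prints base=4 pending=1: four increments were folded, one is still queued.
  std::printf("base=%ld pending=%zu\n", t.base["counter"],
              t.pending_merges["counter"].size());
  return 0;
}
```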
+ // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index e24030ddcd..1a16a30cc2 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -111,27 +111,23 @@ class MemTableRep { }; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() = 0; + virtual Iterator* GetIterator() = 0; // Return an iterator over at least the keys with the specified user key. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetIterator(const Slice& user_key) { - return GetIterator(); - } + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } // Return an iterator over at least the keys with the specified prefix. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + virtual Iterator* GetPrefixIterator(const Slice& prefix) { return GetIterator(); } // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. - virtual std::shared_ptr GetDynamicPrefixIterator() { - return GetIterator(); - } + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } protected: // When *key is an internal key concatenated with the value, returns the @@ -144,8 +140,8 @@ class MemTableRep { class MemTableRepFactory { public: virtual ~MemTableRepFactory() { }; - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) = 0; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) = 0; virtual const char* Name() const = 0; }; @@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "VectorRepFactory"; } @@ -171,8 +167,8 @@ public: // This uses a skip list to store keys. It is the default. 
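With GetIterator, GetPrefixIterator and GetDynamicPrefixIterator now returning raw MemTableRep::Iterator pointers instead of shared_ptrs, ownership transfers to the caller, which must delete the iterator or wrap it. A short usage sketch with stand-in types; wrapping in unique_ptr is just one reasonable way to handle the new contract, not something the header mandates:

```cpp
#include <memory>

// Minimal stand-ins mirroring the shape of the changed interface.
struct Iterator {
  virtual ~Iterator() = default;
  virtual bool Valid() const = 0;
  virtual void Next() = 0;
};

struct Rep {
  // Factory-style call: the caller now owns the returned object.
  Iterator* GetIterator();
};

void ScanAll(Rep* rep) {
  // Wrapping the raw pointer keeps the old "no manual delete" ergonomics
  // without the shared_ptr control-block allocation the previous interface
  // paid for on every iterator.
  std::unique_ptr<Iterator> it(rep->GetIterator());
  for (; it->Valid(); it->Next()) {
    // ... use the entry ...
  }
}
```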
class SkipListFactory : public MemTableRepFactory { public: - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "SkipListFactory"; } @@ -206,8 +202,8 @@ class TransformRepFactory : public MemTableRepFactory { virtual ~TransformRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "TransformRepFactory"; @@ -247,8 +243,8 @@ public: : TransformRepFactory(prefix_extractor, bucket_count, num_locks) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "PrefixHashRepFactory"; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index c742d932d6..8499f60257 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -44,15 +44,15 @@ using std::shared_ptr; enum CompressionType : char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. - kNoCompression = 0x0, + kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2, kBZip2Compression = 0x3 }; enum CompactionStyle : char { - kCompactionStyleLevel = 0x0, // level based compaction style - kCompactionStyleUniversal = 0x1 // Universal compaction style + kCompactionStyleLevel = 0x0, // level based compaction style + kCompactionStyleUniversal = 0x1 // Universal compaction style }; // Compression options for different compression algorithms like Zlib @@ -60,12 +60,9 @@ struct CompressionOptions { int window_bits; int level; int strategy; - CompressionOptions():window_bits(-14), - level(-1), - strategy(0){} - CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits), - level(lev), - strategy(strategy){} + CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} + CompressionOptions(int wbits, int lev, int strategy) + : window_bits(wbits), level(lev), strategy(strategy) {} }; // Options to control the behavior of a database (passed to DB::Open) @@ -218,7 +215,6 @@ struct Options { // Default: 16 int block_restart_interval; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -249,7 +245,7 @@ struct Options { // java/C api hard to construct. std::vector compression_per_level; - //different options for compression algorithms + // different options for compression algorithms CompressionOptions compression_opts; // If non-nullptr, use the specified filter policy to reduce disk reads. @@ -328,7 +324,6 @@ struct Options { // will be 20MB, total file size for level-2 will be 200MB, // and total file size for level-3 will be 2GB. - // by default 'max_bytes_for_level_base' is 10MB. uint64_t max_bytes_for_level_base; // by default 'max_bytes_for_level_base' is 10. @@ -486,10 +481,19 @@ struct Options { // order. int table_cache_remove_scan_count_limit; - // size of one block in arena memory allocation. - // If <= 0, a proper value is automatically calculated (usually 1/10 of + // Size of one block in arena memory allocation. 
+ // + // If <= 0, a proper value is automatically calculated (usually about 1/10 of // writer_buffer_size). // + // There are two additonal restriction of the The specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // // Default: 0 size_t arena_block_size; @@ -574,7 +578,12 @@ struct Options { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL - enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start; + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -641,7 +650,6 @@ struct Options { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; - // * existing_value - pointer to previous value (from both memtable and sst). // nullptr if key doesn't exist // * existing_value_size - sizeof(existing_value). 0 if key doesn't exist @@ -681,6 +689,17 @@ struct Options { // number of hash probes per key uint32_t memtable_prefix_bloom_probes; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + size_t max_successive_merges; }; // @@ -691,7 +710,7 @@ struct Options { // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { - kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage kBlockCacheTier = 0x1 // data in memtable or block cache }; @@ -744,13 +763,14 @@ struct ReadOptions { prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } - ReadOptions(bool cksum, bool cache) : - verify_checksums(cksum), fill_cache(cache), - prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } + read_tier(kReadAllTier) {} + ReadOptions(bool cksum, bool cache) + : verify_checksums(cksum), + fill_cache(cache), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier) {} }; // Options that control write operations @@ -777,10 +797,7 @@ struct WriteOptions { // and the write may got lost after a crash. bool disableWAL; - WriteOptions() - : sync(false), - disableWAL(false) { - } + WriteOptions() : sync(false), disableWAL(false) {} }; // Options that control flush operations @@ -789,9 +806,7 @@ struct FlushOptions { // Default: true bool wait; - FlushOptions() - : wait(true) { - } + FlushOptions() : wait(true) {} }; } // namespace rocksdb diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 1c6210fe9f..2cfb731f63 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. 
- std::string Data() { return rep_; } + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h index 335e028576..fbe2ae8a37 100644 --- a/include/utilities/backupable_db.h +++ b/include/utilities/backupable_db.h @@ -31,6 +31,14 @@ struct BackupableDBOptions { // Default: nullptr Env* backup_env; + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + // Backup info and error messages will be written to info_log // if non-nullptr. // Default: nullptr @@ -49,6 +57,7 @@ struct BackupableDBOptions { explicit BackupableDBOptions(const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, bool _sync = true, bool _destroy_old_data = false) : @@ -93,6 +102,14 @@ class BackupableDB : public StackableDB { Status PurgeOldBackups(uint32_t num_backups_to_keep); // deletes a specific backup Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up + // next time you create BackupableDB or RestoreBackupableDB. + void StopBackup(); private: BackupEngine* backup_engine_; @@ -108,9 +125,10 @@ class RestoreBackupableDB { void GetBackupInfo(std::vector* backup_info); // restore from backup with backup_id - // IMPORTANT -- if you restore from some backup that is not the latest, - // and you start creating new backups from the new DB, all the backups - // that were newer than the backup you restored from will be deleted + // IMPORTANT -- if options_.share_table_files == true and you restore DB + // from some backup that is not the latest, and you start creating new + // backups from the new DB, all the backups that were newer than the + // backup you restored from will be deleted // // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. 
// If you try creating a new backup now, old backups 4 and 5 will be deleted diff --git a/table/table_test.cc b/table/table_test.cc index c6f7d22751..290efea0d9 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -394,7 +394,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { @@ -402,7 +404,9 @@ class MemTableConstructor: public Constructor { } virtual Status FinishImpl(const Options& options, const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1381,10 +1385,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); diff --git a/util/arena_impl.cc b/util/arena_impl.cc index d5c2a537e2..5125e23641 100644 --- a/util/arena_impl.cc +++ b/util/arena_impl.cc @@ -8,71 +8,86 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "util/arena_impl.h" +#include namespace rocksdb { -ArenaImpl::ArenaImpl(size_t block_size) { - if (block_size < kMinBlockSize) { - block_size_ = kMinBlockSize; - } else if (block_size > kMaxBlockSize) { - block_size_ = kMaxBlockSize; - } else { - block_size_ = block_size; +const size_t ArenaImpl::kMinBlockSize = 4096; +const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(ArenaImpl::kMinBlockSize, block_size); + block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; } - blocks_memory_ = 0; - alloc_ptr_ = nullptr; // First allocation will allocate a block - alloc_bytes_remaining_ = 0; + return block_size; +} + +ArenaImpl::ArenaImpl(size_t block_size) + : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); } ArenaImpl::~ArenaImpl() { - for (size_t i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; + for (const auto& block : blocks_) { + delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes) { - if (bytes > block_size_ / 4) { +char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. 
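OptimizeBlockSize above clamps the requested arena block size into [kMinBlockSize, kMaxBlockSize] and rounds it up to a multiple of the pointer size. A few worked values, assuming the 4096 and 2 << 30 bounds shown in the hunk and an 8-byte kAlignUnit (64-bit build); OptimizeBlockSizeSketch is a local stand-in, not the real helper:

```cpp
#include <cassert>
#include <cstddef>

// Same shape as the new helper; constants mirror the values in the hunk.
size_t OptimizeBlockSizeSketch(size_t block_size) {
  const size_t kMin = 4096, kMax = 2u << 30, kAlignUnit = sizeof(void*);
  if (block_size < kMin) block_size = kMin;
  if (block_size > kMax) block_size = kMax;
  if (block_size % kAlignUnit != 0) {
    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
  }
  return block_size;
}

int main() {
  // On a 64-bit build (kAlignUnit == 8):
  assert(OptimizeBlockSizeSketch(1000) == 4096);  // clamped up to the minimum
  assert(OptimizeBlockSizeSketch(4097) == 4104);  // rounded up to a multiple of 8
  assert(OptimizeBlockSizeSketch(8192) == 8192);  // already aligned, unchanged
  return 0;
}
```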
- char* result = AllocateNewBlock(bytes); - return result; + return AllocateNewBlock(bytes); } // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(block_size_); - alloc_bytes_remaining_ = block_size_; + auto block_head = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize - bytes; - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + kBlockSize; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + return unaligned_alloc_ptr_; + } } char* ArenaImpl::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size - assert((align & (align-1)) == 0); // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod); size_t needed = bytes + slop; char* result; if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; alloc_bytes_remaining_ -= needed; } else { // AllocateFallback always returned aligned memory - result = AllocateFallback(bytes); + result = AllocateFallback(bytes, true /* aligned */); } - assert((reinterpret_cast(result) & (align-1)) == 0); + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); return result; } char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; + char* block = new char[block_bytes]; blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; + blocks_.push_back(block); + return block; } } // namespace rocksdb diff --git a/util/arena_impl.h b/util/arena_impl.h index b5a6842472..538385ccc0 100644 --- a/util/arena_impl.h +++ b/util/arena_impl.h @@ -22,49 +22,54 @@ namespace rocksdb { class ArenaImpl : public Arena { public: + // No copying allowed + ArenaImpl(const ArenaImpl&) = delete; + void operator=(const ArenaImpl&) = delete; + + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; + explicit ArenaImpl(size_t block_size = kMinBlockSize); virtual ~ArenaImpl(); - virtual char* Allocate(size_t bytes); + virtual char* Allocate(size_t bytes) override; - virtual char* AllocateAligned(size_t bytes); + virtual char* AllocateAligned(size_t bytes) override; // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user + // by the arena (exclude the space allocated but not yet used for future // allocations). - // - // TODO: Do we need to exclude space allocated but not used? 
virtual const size_t ApproximateMemoryUsage() { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - + alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() { + virtual const size_t MemoryAllocatedBytes() override { return blocks_memory_; } private: - char* AllocateFallback(size_t bytes); + // Number of bytes allocated in one block + const size_t kBlockSize; + // Array of new[] allocated memory blocks + typedef std::vector Blocks; + Blocks blocks_; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + + char* AllocateFallback(size_t bytes, bool aligned); char* AllocateNewBlock(size_t block_bytes); - static const size_t kMinBlockSize = 4096; - static const size_t kMaxBlockSize = 2 << 30; - - // Number of bytes allocated in one block - size_t block_size_; - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - - // Array of new[] allocated memory blocks - std::vector blocks_; - // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - ArenaImpl(const ArenaImpl&); - void operator=(const ArenaImpl&); + size_t blocks_memory_ = 0; }; inline char* ArenaImpl::Allocate(size_t bytes) { @@ -73,12 +78,16 @@ inline char* ArenaImpl::Allocate(size_t bytes) { // them for our internal use). assert(bytes > 0); if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; + unaligned_alloc_ptr_ -= bytes; alloc_bytes_remaining_ -= bytes; - return result; + return unaligned_alloc_ptr_; } - return AllocateFallback(bytes); + return AllocateFallback(bytes, false /* unaligned */); } +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. 
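The new ArenaImpl members above split each block in two: aligned allocations bump a pointer up from the block's start, while unaligned ones bump a pointer down from its end, so alignment padding is never wasted between mixed requests. A tiny single-block sketch of that scheme (TwoEndedBlock is an illustrative name; the real arena also chains blocks and falls back to allocating new ones):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

class TwoEndedBlock {
 public:
  explicit TwoEndedBlock(size_t size)
      : block_(new char[size]),
        aligned_ptr_(block_),
        unaligned_ptr_(block_ + size),
        remaining_(size) {}
  ~TwoEndedBlock() { delete[] block_; }

  // Unaligned requests grow downward from the end of the block.
  char* Allocate(size_t bytes) {
    if (bytes > remaining_) return nullptr;
    unaligned_ptr_ -= bytes;
    remaining_ -= bytes;
    return unaligned_ptr_;
  }

  // Aligned requests grow upward from the start, padding only when needed.
  char* AllocateAligned(size_t bytes) {
    const size_t align = sizeof(void*);
    size_t mod = reinterpret_cast<uintptr_t>(aligned_ptr_) & (align - 1);
    size_t slop = (mod == 0 ? 0 : align - mod);
    if (bytes + slop > remaining_) return nullptr;
    char* result = aligned_ptr_ + slop;
    aligned_ptr_ += bytes + slop;
    remaining_ -= bytes + slop;
    return result;
  }

 private:
  char* block_;
  char* aligned_ptr_;    // next aligned allocation starts here
  char* unaligned_ptr_;  // next unaligned allocation ends here
  size_t remaining_;     // bytes still free between the two pointers
};

int main() {
  TwoEndedBlock block(64);
  char* a = block.AllocateAligned(16);  // taken from the front
  char* b = block.Allocate(5);          // taken from the back
  assert(a < b);
  assert((reinterpret_cast<uintptr_t>(a) & (sizeof(void*) - 1)) == 0);
  return 0;
}
```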
+extern size_t OptimizeBlockSize(size_t block_size); + } // namespace rocksdb diff --git a/util/arena_test.cc b/util/arena_test.cc index 12aa7f7fe5..ca6dfc99d6 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -57,8 +57,34 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); } +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaImplTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + ArenaImpl arena(kBlockSize); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + TEST(ArenaImplTest, Simple) { - std::vector > allocated; + std::vector> allocated; ArenaImpl arena_impl; const int N = 100000; size_t bytes = 0; @@ -68,8 +94,9 @@ TEST(ArenaImplTest, Simple) { if (i % (N / 10) == 0) { s = i; } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); } if (s == 0) { // Our arena disallows size 0 allocations. @@ -89,7 +116,7 @@ TEST(ArenaImplTest, Simple) { bytes += s; allocated.push_back(std::make_pair(s, r)); ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes); - if (i > N/10) { + if (i > N / 10) { ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10); } } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 214ac2c55d..9c2d8f52c2 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -66,17 +66,15 @@ class HashLinkListRep : public MemTableRep { virtual ~HashLinkListRep(); - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; - virtual std::shared_ptr GetIterator( - const Slice& slice) override; + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual std::shared_ptr GetPrefixIterator( - const Slice& prefix) override; - - virtual std::shared_ptr GetDynamicPrefixIterator() + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) override; + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + private: friend class DynamicIterator; typedef SkipList FullList; @@ -298,8 +296,6 @@ class HashLinkListRep : public MemTableRep { virtual void SeekToLast() { } private: }; - - std::shared_ptr empty_iterator_; }; HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, @@ -308,9 +304,7 @@ HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, : bucket_size_(bucket_size), transform_(transform), compare_(compare), - arena_(arena), - empty_iterator_(std::make_shared()) { - + arena_(arena) { char* mem = arena_->AllocateAligned( sizeof(port::AtomicPointer) * bucket_size); @@ -389,7 +383,7 @@ size_t HashLinkListRep::ApproximateMemoryUsage() { return 0; } -std::shared_ptr HashLinkListRep::GetIterator() { 
+MemTableRep::Iterator* HashLinkListRep::GetIterator() { auto list = new FullList(compare_, arena_); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -400,26 +394,24 @@ std::shared_ptr HashLinkListRep::GetIterator() { } } } - return std::make_shared(list); + return new FullListIterator(list); } -std::shared_ptr HashLinkListRep::GetPrefixIterator( +MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator( const Slice& prefix) { auto bucket = GetBucket(prefix); if (bucket == nullptr) { - return empty_iterator_; + return new EmptyIterator(); } - return std::make_shared(this, bucket); + return new Iterator(this, bucket); } -std::shared_ptr HashLinkListRep::GetIterator( - const Slice& slice) { +MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) { return GetPrefixIterator(transform_->Transform(slice)); } -std::shared_ptr - HashLinkListRep::GetDynamicPrefixIterator() { - return std::make_shared(*this); +MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); } bool HashLinkListRep::BucketContains(Node* head, const Key& key) const { @@ -450,10 +442,9 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, } // anon namespace -std::shared_ptr HashLinkListRepFactory::CreateMemTableRep( +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( MemTableRep::KeyComparator& compare, Arena* arena) { - return std::make_shared(compare, arena, transform_, - bucket_count_); + return new HashLinkListRep(compare, arena, transform_, bucket_count_); } MemTableRepFactory* NewHashLinkListRepFactory( diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h index 16d2517c25..efa9d8f2ec 100644 --- a/util/hash_linklist_rep.h +++ b/util/hash_linklist_rep.h @@ -22,8 +22,8 @@ class HashLinkListRepFactory : public MemTableRepFactory { virtual ~HashLinkListRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; virtual const char* Name() const override { return "HashLinkListRepFactory"; diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 95e6edfae9..906f830303 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -33,17 +33,15 @@ class HashSkipListRep : public MemTableRep { virtual ~HashSkipListRep(); - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; - virtual std::shared_ptr GetIterator( - const Slice& slice) override; + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual std::shared_ptr GetPrefixIterator( - const Slice& prefix) override; - - virtual std::shared_ptr GetDynamicPrefixIterator() + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) override; + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + private: friend class DynamicIterator; typedef SkipList Bucket; @@ -216,22 +214,18 @@ class HashSkipListRep : public MemTableRep { virtual void SeekToLast() { } private: }; - - std::shared_ptr empty_iterator_; }; HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena, const SliceTransform* transform, size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor) - : bucket_size_(bucket_size), - skiplist_height_(skiplist_height), - skiplist_branching_factor_(skiplist_branching_factor), - transform_(transform), - 
compare_(compare), - arena_(arena), - empty_iterator_(std::make_shared()) { - + : bucket_size_(bucket_size), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor), + transform_(transform), + compare_(compare), + arena_(arena) { buckets_ = new port::AtomicPointer[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { @@ -276,7 +270,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() { return sizeof(buckets_); } -std::shared_ptr HashSkipListRep::GetIterator() { +MemTableRep::Iterator* HashSkipListRep::GetIterator() { auto list = new Bucket(compare_, arena_); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -287,35 +281,31 @@ std::shared_ptr HashSkipListRep::GetIterator() { } } } - return std::make_shared(list); + return new Iterator(list); } -std::shared_ptr HashSkipListRep::GetPrefixIterator( - const Slice& prefix) { +MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) { auto bucket = GetBucket(prefix); if (bucket == nullptr) { - return empty_iterator_; + return new EmptyIterator(); } - return std::make_shared(bucket, false); + return new Iterator(bucket, false); } -std::shared_ptr HashSkipListRep::GetIterator( - const Slice& slice) { +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { return GetPrefixIterator(transform_->Transform(slice)); } -std::shared_ptr - HashSkipListRep::GetDynamicPrefixIterator() { - return std::make_shared(*this); +MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); } } // anon namespace -std::shared_ptr HashSkipListRepFactory::CreateMemTableRep( +MemTableRep* HashSkipListRepFactory::CreateMemTableRep( MemTableRep::KeyComparator& compare, Arena* arena) { - return std::make_shared(compare, arena, transform_, - bucket_count_, skiplist_height_, - skiplist_branching_factor_); + return new HashSkipListRep(compare, arena, transform_, bucket_count_, + skiplist_height_, skiplist_branching_factor_); } MemTableRepFactory* NewHashSkipListRepFactory( diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h index 3411f66b15..1ea844eda5 100644 --- a/util/hash_skiplist_rep.h +++ b/util/hash_skiplist_rep.h @@ -26,8 +26,8 @@ class HashSkipListRepFactory : public MemTableRepFactory { virtual ~HashSkipListRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; virtual const char* Name() const override { return "HashSkipListRepFactory"; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 58d81460e9..65ecd61a26 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, } int max = -1; for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.NumLevelFiles(i)) { + if (versions.current()->NumLevelFiles(i)) { max = i; } } diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc index ebe1339e53..dd615f0570 100644 --- a/util/manual_compaction_test.cc +++ b/util/manual_compaction_test.cc @@ -9,9 +9,13 @@ #include #include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/testharness.h" +using namespace rocksdb; + namespace { const int kNumKeys = 1100000; @@ -26,12 +30,71 @@ std::string Key2(int i) { return Key1(i) + "_xxx"; } -class 
ManualCompactionTest { }; +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run. + dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} TEST(ManualCompactionTest, Test) { - // Get rid of any state from an old run. - std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; - DestroyDB(dbpath, rocksdb::Options()); // Open database. 
Disable compression since it affects the creation // of layers and the code below is trying to test against a very @@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) { rocksdb::Options db_options; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; - ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); // create first key range rocksdb::WriteBatch batch; @@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) { // close database delete db; - DestroyDB(dbpath, rocksdb::Options()); + DestroyDB(dbname_, rocksdb::Options()); } } // anonymous namespace diff --git a/util/options.cc b/util/options.cc index 93aa268f15..918b87281d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -104,7 +104,8 @@ Options::Options() inplace_update_num_locks(10000), inplace_callback(nullptr), memtable_prefix_bloom_bits(0), - memtable_prefix_bloom_probes(6) { + memtable_prefix_bloom_probes(6), + max_successive_merges(0) { assert(memtable_factory.get() != nullptr); } @@ -300,6 +301,8 @@ Options::Dump(Logger* log) const memtable_prefix_bloom_bits); Log(log, " Options.memtable_prefix_bloom_probes: %d", memtable_prefix_bloom_probes); + Log(log, " Options.max_successive_merges: %zd", + max_successive_merges); } // Options::Dump // diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index f4c6e0c93f..6f1fb1a157 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -97,15 +97,15 @@ public: // Unhide default implementations of GetIterator using MemTableRep::GetIterator; - virtual std::shared_ptr GetIterator() override { - return std::make_shared(&skip_list_); + virtual MemTableRep::Iterator* GetIterator() override { + return new SkipListRep::Iterator(&skip_list_); } }; } -std::shared_ptr SkipListFactory::CreateMemTableRep ( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::shared_ptr(new SkipListRep(compare, arena)); +MemTableRep* SkipListFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new SkipListRep(compare, arena); } } // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 3887f356d5..bd2f0873d1 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -90,7 +90,7 @@ class VectorRep : public MemTableRep { using MemTableRep::GetIterator; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; private: friend class Iterator; @@ -233,22 +233,22 @@ void VectorRep::Iterator::SeekToLast() { } } -std::shared_ptr VectorRep::GetIterator() { +MemTableRep::Iterator* VectorRep::GetIterator() { ReadLock l(&rwlock_); // Do not sort here. The sorting would be done the first time // a Seek is performed on the iterator. 
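The util/options.cc hunk above initializes the new max_successive_merges option (default 0) and adds it to Options::Dump. A hedged usage sketch — the merge-operator wiring is only hinted at in a comment, since this patch does not provide one:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status OpenWithMergeLimit(const std::string& path, rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // options.merge_operator = ...;  // a user-supplied MergeOperator (placeholder)
  // New knob added by this patch; 0 (the default) leaves behavior unchanged.
  options.max_successive_merges = 2;
  return rocksdb::DB::Open(options, path, db);
}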
if (immutable_) { - return std::make_shared(this, bucket_, compare_); + return new Iterator(this, bucket_, compare_); } else { std::shared_ptr tmp; tmp.reset(new Bucket(*bucket_)); // make a copy - return std::make_shared(nullptr, tmp, compare_); + return new Iterator(nullptr, tmp, compare_); } } } // anon namespace -std::shared_ptr VectorRepFactory::CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::make_shared(compare, arena, count_); +MemTableRep* VectorRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new VectorRep(compare, arena, count_); } } // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 61e009cd31..26bdd254b5 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -20,6 +20,7 @@ #include #include #include +#include namespace rocksdb { @@ -31,6 +32,9 @@ class BackupEngine { Status CreateNewBackup(DB* db, bool flush_before_backup = false); Status PurgeOldBackups(uint32_t num_backups_to_keep); Status DeleteBackup(BackupID backup_id); + void StopBackup() { + stop_backup_.store(true, std::memory_order_release); + } void GetBackupInfo(std::vector* backup_info); Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir, @@ -106,13 +110,16 @@ class BackupEngine { return "private"; } inline std::string GetPrivateFileRel(BackupID backup_id, - const std::string &file = "") const { + bool tmp = false, + const std::string& file = "") const { assert(file.size() == 0 || file[0] != '/'); - return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file; + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + + (tmp ? ".tmp" : "") + "/" + file; } - inline std::string GetSharedFileRel(const std::string& file = "") const { + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { assert(file.size() == 0 || file[0] != '/'); - return "shared/" + file; + return "shared/" + file + (tmp ? ".tmp" : ""); } inline std::string GetLatestBackupFile(bool tmp = false) const { return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : "")); @@ -151,6 +158,7 @@ class BackupEngine { std::map backups_; std::unordered_map backuped_file_refs_; std::vector obsolete_backups_; + std::atomic stop_backup_; // options data BackupableDBOptions options_; @@ -161,13 +169,17 @@ class BackupEngine { }; BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options) - : options_(options), - db_env_(db_env), - backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_) { + : stop_backup_(false), + options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? 
options.backup_env + : db_env_) { // create all the dirs we need backup_env_->CreateDirIfMissing(GetAbsolutePath()); - backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + if (options_.share_table_files) { + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + } backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); backup_env_->CreateDirIfMissing(GetBackupMetaDir()); @@ -298,8 +310,9 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); - // create private dir - s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id))); + // create temporary private dir + s = backup_env_->CreateDir( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true))); // copy live_files for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { @@ -320,7 +333,7 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { // * if it's kDescriptorFile, limit the size to manifest_file_size s = BackupFile(new_backup_id, &new_backup, - type == kTableFile, /* shared */ + options_.share_table_files && type == kTableFile, db->GetName(), /* src_dir */ live_files[i], /* src_fname */ (type == kDescriptorFile) ? manifest_file_size : 0); @@ -342,6 +355,13 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) { // we copied all the files, enable file deletions db->EnableFileDeletions(); + if (s.ok()) { + // move tmp private backup to real backup folder + s = backup_env_->RenameFile( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)), // tmp + GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); + } + if (s.ok()) { // persist the backup metadata on the disk s = new_backup.StoreToFile(options_.sync); @@ -561,6 +581,9 @@ Status BackupEngine::CopyFile(const std::string& src, Slice data; do { + if (stop_backup_.load(std::memory_order_acquire)) { + return Status::Incomplete("Backup stopped"); + } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? 
copy_file_buffer_size_ : size_limit; s = src_file->Read(buffer_to_read, &data, buf.get()); @@ -590,12 +613,16 @@ Status BackupEngine::BackupFile(BackupID backup_id, assert(src_fname.size() > 0 && src_fname[0] == '/'); std::string dst_relative = src_fname.substr(1); + std::string dst_relative_tmp; if (shared) { - dst_relative = GetSharedFileRel(dst_relative); + dst_relative_tmp = GetSharedFileRel(dst_relative, true); + dst_relative = GetSharedFileRel(dst_relative, false); } else { - dst_relative = GetPrivateFileRel(backup_id, dst_relative); + dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative); + dst_relative = GetPrivateFileRel(backup_id, false, dst_relative); } std::string dst_path = GetAbsolutePath(dst_relative); + std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp); Status s; uint64_t size; @@ -607,12 +634,15 @@ Status BackupEngine::BackupFile(BackupID backup_id, } else { Log(options_.info_log, "Copying %s", src_fname.c_str()); s = CopyFile(src_dir + src_fname, - dst_path, + dst_path_tmp, db_env_, backup_env_, options_.sync, &size, size_limit); + if (s.ok() && shared) { + s = backup_env_->RenameFile(dst_path_tmp, dst_path); + } } if (s.ok()) { backup->AddFile(dst_relative, size); @@ -671,14 +701,16 @@ void BackupEngine::GarbageCollection(bool full_scan) { &private_children); for (auto& child : private_children) { BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; sscanf(child.c_str(), "%u", &backup_id); - if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) { + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { // it's either not a number or it's still alive. continue continue; } // here we have to delete the dir and all its children std::string full_private_path = - GetAbsolutePath(GetPrivateFileRel(backup_id)); + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); std::vector subchildren; backup_env_->GetChildren(full_private_path, &subchildren); for (auto& subchild : subchildren) { @@ -813,7 +845,9 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) { BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) : StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) { - backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); + if (options.share_table_files) { + backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber()); + } } BackupableDB::~BackupableDB() { @@ -836,6 +870,10 @@ Status BackupableDB::DeleteBackup(BackupID backup_id) { return backup_engine_->DeleteBackup(backup_id); } +void BackupableDB::StopBackup() { + backup_engine_->StopBackup(); +} + // --- RestoreBackupableDB methods ------ RestoreBackupableDB::RestoreBackupableDB(Env* db_env, diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index c64f0170b8..de240558f9 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -305,7 +305,7 @@ class BackupableDBTest { CreateLoggerFromOptions(dbname_, backupdir_, env_, Options(), &logger_); backupable_options_.reset(new BackupableDBOptions( - backupdir_, test_backup_env_.get(), logger_.get(), true)); + backupdir_, test_backup_env_.get(), true, logger_.get(), true)); // delete old files in db DestroyDB(dbname_, Options()); @@ -317,7 +317,8 @@ class BackupableDBTest { return db; } - void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) { + void OpenBackupableDB(bool 
destroy_old_data = false, bool dummy = false, + bool share_table_files = true) { // reset all the defaults test_backup_env_->SetLimitWrittenFiles(1000000); test_db_env_->SetLimitWrittenFiles(1000000); @@ -331,6 +332,7 @@ class BackupableDBTest { ASSERT_OK(DB::Open(options_, dbname_, &db)); } backupable_options_->destroy_old_data = destroy_old_data; + backupable_options_->share_table_files = share_table_files; db_.reset(new BackupableDB(db, *backupable_options_)); } @@ -659,6 +661,38 @@ TEST(BackupableDBTest, DeleteNewerBackups) { CloseRestoreDB(); } +TEST(BackupableDBTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +TEST(BackupableDBTest, DeleteTmpFiles) { + OpenBackupableDB(); + CloseBackupableDB(); + std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp"; + std::string private_tmp_dir = backupdir_ + "/private/10.tmp"; + std::string private_tmp_file = private_tmp_dir + "/00003.sst"; + file_manager_->WriteToFile(shared_tmp, "tmp"); + file_manager_->CreateDir(private_tmp_dir); + file_manager_->WriteToFile(private_tmp_file, "tmp"); + ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); + OpenBackupableDB(); + CloseBackupableDB(); + ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir)); +} + } // anon namespace } // namespace rocksdb
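Taken together, the backupable changes introduce three user-visible behaviors: backups are staged in *.tmp files and private/<id>.tmp directories that are renamed into place only on success (leftovers are removed by garbage collection on the next open, as the DeleteTmpFiles test checks), an in-flight backup can be aborted through the new atomic stop flag, and share_table_files can be turned off so every backup keeps private copies of its table files. A usage sketch, assuming the BackupableDBOptions parameter order implied by the test change above (backup_dir, backup_env, share_table_files, info_log, ...), default values for the remaining parameters, and the include path used by this era of the tree:

#include <thread>
#include "utilities/backupable_db.h"

void BackupWithCancel(rocksdb::DB* db) {
  rocksdb::BackupableDBOptions backup_options(
      "/tmp/rocksdb_backup_demo",   // backup_dir (placeholder path)
      nullptr,                      // backup_env: fall back to the DB's Env
      false);                       // share_table_files: self-contained backups
  rocksdb::BackupableDB backup_db(db, backup_options);

  // StopBackup() flips the new std::atomic<bool> stop flag; the copy loop in
  // BackupEngine::CopyFile() checks it and CreateNewBackup() then returns
  // Status::Incomplete("Backup stopped"). Here it deliberately races the backup.
  std::thread canceller([&backup_db] { backup_db.StopBackup(); });
  rocksdb::Status s = backup_db.CreateNewBackup(true /* flush_before_backup */);
  canceller.join();

  // Any *.tmp file or private/<id>.tmp directory left behind by the aborted
  // backup is cleaned up by GarbageCollection() the next time a BackupEngine
  // is constructed.
}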