mirror of https://github.com/facebook/rocksdb.git
Merge branch 'main' into JniReaderForTableIterator
commit ccc6b591f1
@@ -80,7 +80,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: PORTABLE=1 make V=1 -j32 valgrind_test
+    - run: make V=1 -j32 valgrind_test
     - uses: "./.github/actions/post-steps"
   build-windows-vs2022-avx2:
     if: ${{ github.repository_owner == 'facebook' }}

Makefile (13 changed lines)

@@ -630,6 +630,11 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
 VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
 # Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable
 
+# Work around valgrind hanging on systems with limited internet access
+ifneq ($(shell which git 2>/dev/null && git config --get https.proxy),)
+export DEBUGINFOD_URLS=
+endif
+
 TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
 BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
 CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
@@ -1164,16 +1169,16 @@ ubsan_crash_test_with_best_efforts_recovery: clean
 	$(MAKE) clean
 
 full_valgrind_test:
-	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
+	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
 
 full_valgrind_test_some:
-	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
+	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
 
 valgrind_test:
-	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
+	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
 
 valgrind_test_some:
-	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
+	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
 
 valgrind_check: $(TESTS)
 	$(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests

TARGETS (2 changed lines)

@@ -417,6 +417,8 @@ cpp_binary_wrapper(name="ldb", srcs=["tools/ldb.cc"], deps=[":rocksdb_tools_lib"
 
 cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
 
+cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
+
 cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
 
 cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
@@ -201,6 +201,10 @@ def generate_targets(repo_path, deps_map):
     TARGETS.add_binary(
         "db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"]
     )
+    # db_bench binary
+    TARGETS.add_binary(
+        "db_bench", ["tools/db_bench.cc"], [":rocksdb_tools_lib"]
+    )
     # cache_bench binary
     TARGETS.add_binary(
         "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"]
@@ -886,6 +886,32 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
   ASSERT_EQ(special_count, kSpecialCount);
 }
 
+TEST_P(CacheTest, ApplyToHandleTest) {
+  std::string callback_state;
+  const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
+                            size_t charge,
+                            const Cache::CacheItemHelper* helper) {
+    callback_state = std::to_string(DecodeKey(key)) + "," +
+                     std::to_string(DecodeValue(value)) + "," +
+                     std::to_string(charge);
+    assert(helper == &CacheTest::kHelper);
+  };
+
+  std::vector<std::string> inserted;
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," +
+                       std::to_string(i + 1));
+  }
+  for (int i = 0; i < 10; ++i) {
+    Cache::Handle* handle = cache_->Lookup(EncodeKey(i));
+    cache_->ApplyToHandle(cache_.get(), handle, callback);
+    EXPECT_EQ(inserted[i], callback_state);
+    cache_->Release(handle);
+  }
+}
+
 TEST_P(CacheTest, DefaultShardBits) {
   // Prevent excessive allocation (to save time & space)
   estimated_value_size_ = 100000;
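
Note: a minimal caller-side sketch of the new Cache::ApplyToHandle API exercised by the test above. The fixture members (cache_, EncodeKey) and the empty callback body are illustrative assumptions, not part of the commit:

  // Inspect one entry through its handle without knowing whether the
  // underlying cache is an LRUCache or a HyperClockCache.
  Cache::Handle* handle = cache_->Lookup(EncodeKey(42));
  if (handle != nullptr) {
    cache_->ApplyToHandle(
        cache_.get(), handle,
        [](const Slice& key, Cache::ObjectPtr value, size_t charge,
           const Cache::CacheItemHelper* helper) {
          // key/value/charge/helper describe the cached entry.
        });
    cache_->Release(handle);  // ApplyToHandle does not release the handle.
  }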
@@ -1444,6 +1444,22 @@ const Cache::CacheItemHelper* BaseHyperClockCache<Table>::GetCacheItemHelper(
   return h->helper;
 }
 
+template <class Table>
+void BaseHyperClockCache<Table>::ApplyToHandle(
+    Cache* cache, Handle* handle,
+    const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                             size_t charge, const CacheItemHelper* helper)>&
+        callback) {
+  BaseHyperClockCache<Table>* cache_ptr =
+      static_cast<BaseHyperClockCache<Table>*>(cache);
+  auto h = static_cast<const typename Table::HandleImpl*>(handle);
+  UniqueId64x2 unhashed;
+  auto hash_seed = cache_ptr->GetShard(h->GetHash()).GetTable().GetHashSeed();
+  callback(
+      ClockCacheShard<Table>::ReverseHash(h->hashed_key, &unhashed, hash_seed),
+      h->value, h->GetTotalCharge(), h->helper);
+}
+
 namespace {
 
 // For each cache shard, estimate what the table load factor would be if
@@ -1128,6 +1128,12 @@ class BaseHyperClockCache : public ShardedCache<ClockCacheShard<Table>> {
 
   const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
 
+  void ApplyToHandle(
+      Cache* cache, Handle* handle,
+      const std::function<void(const Slice& key, Cache::ObjectPtr obj,
+                               size_t charge, const CacheItemHelper* helper)>&
+          callback) override;
+
   void ReportProblems(
       const std::shared_ptr<Logger>& /*info_log*/) const override;
 };
@@ -677,6 +677,17 @@ const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
   return h->helper;
 }
 
+void LRUCache::ApplyToHandle(
+    Cache* cache, Handle* handle,
+    const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
+                             const CacheItemHelper* helper)>& callback) {
+  auto cache_ptr = static_cast<LRUCache*>(cache);
+  auto h = static_cast<const LRUHandle*>(handle);
+  callback(h->key(), h->value,
+           h->GetCharge(cache_ptr->GetShard(0).metadata_charge_policy_),
+           h->helper);
+}
+
 size_t LRUCache::TEST_GetLRUSize() {
   return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
 }
@@ -452,6 +452,12 @@ class LRUCache
   size_t GetCharge(Handle* handle) const override;
   const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
 
+  void ApplyToHandle(
+      Cache* cache, Handle* handle,
+      const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
+                               const CacheItemHelper* helper)>& callback)
+      override;
+
   // Retrieves number of elements in LRU, for unit test purpose only.
   size_t TEST_GetLRUSize();
   // Retrieves high pri pool ratio.
@@ -253,6 +253,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   // Disable paranoid_file_checks so that flush will not read back the newly
@@ -364,6 +365,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   options.paranoid_file_checks = false;
@@ -506,6 +508,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   options.paranoid_file_checks = false;
@@ -606,6 +609,7 @@ TEST_F(DBTieredSecondaryCacheTest, ReadyBeforeWaitAllTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
   options.statistics = CreateDBStatistics();
 
@@ -717,6 +721,7 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   options.paranoid_file_checks = false;
@@ -784,6 +789,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedOnlyTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@@ -836,6 +842,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedCacheAdmission) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@@ -937,6 +944,7 @@ TEST_F(DBTieredSecondaryCacheTest, FSBufferTest) {
   table_options.cache_index_and_filter_blocks = false;
   Options options = GetDefaultOptions();
   options.create_if_missing = true;
+  options.compression = kLZ4Compression;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
   options.statistics = CreateDBStatistics();
   options.env = wrap_env.get();
@@ -1565,28 +1565,6 @@ Status ColumnFamilyData::SetOptions(
   return s;
 }
 
-// REQUIRES: DB mutex held
-Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
-  if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
-    return Env::WLTH_NOT_SET;
-  }
-  if (level == 0) {
-    return Env::WLTH_MEDIUM;
-  }
-  int base_level = current_->storage_info()->base_level();
-
-  // L1: medium, L2: long, ...
-  if (level - base_level >= 2) {
-    return Env::WLTH_EXTREME;
-  } else if (level < base_level) {
-    // There is no restriction which prevents level passed in to be smaller
-    // than base_level.
-    return Env::WLTH_MEDIUM;
-  }
-  return static_cast<Env::WriteLifeTimeHint>(
-      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
-}
-
 Status ColumnFamilyData::AddDirectories(
     std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
   Status s;
@@ -511,8 +511,6 @@ class ColumnFamilyData {
     return initial_cf_options_;
   }
 
-  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
-
   // created_dirs remembers directory created, so that we don't need to call
   // the same data creation operation again.
   Status AddDirectories(
@@ -686,12 +686,11 @@ bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
 };
 
 // Mark (or clear) each file that is being compacted
-void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+void Compaction::MarkFilesBeingCompacted(bool being_compacted) const {
   for (size_t i = 0; i < num_input_levels(); i++) {
     for (size_t j = 0; j < inputs_[i].size(); j++) {
-      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
-                               : inputs_[i][j]->being_compacted);
-      inputs_[i][j]->being_compacted = mark_as_compacted;
+      assert(being_compacted != inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = being_compacted;
     }
   }
 }
@@ -735,7 +734,7 @@ uint64_t Compaction::CalculateTotalInputSize() const {
   return size;
 }
 
-void Compaction::ReleaseCompactionFiles(Status status) {
+void Compaction::ReleaseCompactionFiles(const Status& status) {
   MarkFilesBeingCompacted(false);
   cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
 }
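
Note: the rewritten assertion in MarkFilesBeingCompacted is behavior-preserving; a standalone illustration (the names `mark` and `file` are hypothetical, not from the commit):

  // Old form: when marking, the file must not already be marked; when
  // clearing, it must currently be marked.
  assert(mark ? !file->being_compacted : file->being_compacted);
  // New form: the stored flag must differ from the value about to be
  // written, which is the same condition stated once.
  assert(mark != file->being_compacted);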
@@ -230,7 +230,7 @@ class Compaction {
   // Delete this compaction from the list of running compactions.
   //
   // Requirement: DB mutex held
-  void ReleaseCompactionFiles(Status status);
+  void ReleaseCompactionFiles(const Status& status);
 
   // Returns the summary of the compaction in "output" with maximum "len"
   // in bytes. The caller is responsible for the memory management of
@@ -435,13 +435,13 @@ class Compaction {
                                 const int start_level,
                                 const int output_level);
 
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool being_compacted) const;
+
  private:
 
   Status InitInputTableProperties();
 
-  // mark (or clear) all files that are being compacted
-  void MarkFilesBeingCompacted(bool mark_as_compacted);
-
   // get the smallest and largest key present in files to be compacted
   static void GetBoundaryKeys(VersionStorageInfo* vstorage,
                               const std::vector<CompactionInputFiles>& inputs,
@@ -251,12 +251,13 @@ void CompactionJob::Prepare() {
 
   // Generate file_levels_ for compaction before making Iterator
   auto* c = compact_->compaction;
-  ColumnFamilyData* cfd = c->column_family_data();
+  [[maybe_unused]] ColumnFamilyData* cfd = c->column_family_data();
   assert(cfd != nullptr);
-  assert(cfd->current()->storage_info()->NumLevelFiles(
-      compact_->compaction->level()) > 0);
+  const VersionStorageInfo* storage_info = c->input_version()->storage_info();
+  assert(storage_info);
+  assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
 
-  write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+  write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
   bottommost_level_ = c->bottommost_level();
 
   if (c->ShouldFormSubcompactions()) {
@@ -297,8 +298,8 @@ void CompactionJob::Prepare() {
     for (const auto& each_level : *c->inputs()) {
       for (const auto& fmd : each_level.files) {
         std::shared_ptr<const TableProperties> tp;
-        Status s =
-            cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
+        Status s = c->input_version()->GetTableProperties(read_options, &tp,
+                                                          fmd, nullptr);
         if (s.ok()) {
           s = seqno_to_time_mapping_.DecodeFrom(tp->seqno_to_time_mapping);
         }
@@ -133,7 +133,8 @@ CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
 CompactionPicker::~CompactionPicker() = default;
 
 // Delete this compaction from the list of running compactions.
-void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c,
+                                              const Status& status) {
   UnregisterCompaction(c);
   if (!status.ok()) {
     c->ResetNextCompactionIndex();
@@ -104,7 +104,7 @@ class CompactionPicker {
   // Free up the files that participated in a compaction
   //
   // Requirement: DB mutex held
-  void ReleaseCompactionFiles(Compaction* c, Status status);
+  void ReleaseCompactionFiles(Compaction* c, const Status& status);
 
   // Returns true if any one of the specified files are being compacted
   bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
@@ -261,11 +261,11 @@ Status CompactionServiceCompactionJob::Run() {
 
   auto* c = compact_->compaction;
   assert(c->column_family_data() != nullptr);
-  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
-      compact_->compaction->level()) > 0);
+  const VersionStorageInfo* storage_info = c->input_version()->storage_info();
+  assert(storage_info);
+  assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
 
-  write_hint_ =
-      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+  write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
   bottommost_level_ = c->bottommost_level();
 
   Slice begin = compaction_input_.begin;
@@ -6146,7 +6146,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) {
 
   std::vector<std::string> pending_compaction_cfs;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "SchedulePendingCompaction::cfd", [&](void* arg) {
+      "EnqueuePendingCompaction::cfd", [&](void* arg) {
         const std::string& cf_name =
             static_cast<ColumnFamilyData*>(arg)->GetName();
         pending_compaction_cfs.emplace_back(cf_name);
@@ -473,7 +473,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
 
   if (s.ok()) {
     for (auto cfd : *versions_->GetColumnFamilySet()) {
-      SchedulePendingCompaction(cfd);
+      EnqueuePendingCompaction(cfd);
     }
     MaybeScheduleFlushOrCompaction();
   }
@@ -4282,7 +4282,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
              ->storage_info()
              ->BottommostFilesMarkedForCompaction()
              .empty()) {
-      SchedulePendingCompaction(cfd);
+      EnqueuePendingCompaction(cfd);
       MaybeScheduleFlushOrCompaction();
       cf_scheduled.push_back(cfd);
     }
@@ -5837,6 +5837,10 @@ Status DBImpl::IngestExternalFiles(
           "allow_db_generated_files.");
     }
   }
+  if (ingest_opts.move_files && ingest_opts.link_files) {
+    return Status::InvalidArgument(
+        "`move_files` and `link_files` can not both be true.");
+  }
 }
 
 // TODO (yanqin) maybe handle the case in which column_families have
@@ -2216,10 +2216,27 @@ class DBImpl : public DB {
   void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
                             FlushReason flush_reason, FlushRequest* req);
 
-  // Returns true if `req` is successfully enqueued.
-  bool SchedulePendingFlush(const FlushRequest& req);
+  // Below functions are for executing flush, compaction in the background. A
+  // dequeue is the communication channel between threads that asks for the work
+  // to be done and the available threads in the thread pool that pick it up to
+  // execute it. We use these terminologies to describe the state of the work
+  // and its transitions:
+  // 1) It becomes pending once it's successfully enqueued into the
+  // corresponding dequeue, a work in this state is also called unscheduled.
+  // Counter `unscheduled_*_` counts work in this state.
+  // 2) When `MaybeScheduleFlushOrCompaction` schedule a thread to run `BGWork*`
+  // for the work, it becomes scheduled
+  // Counter `bg_*_scheduled_` counts work in this state.
+  // 3) Once the thread start to execute `BGWork*`, the work is popped from the
+  // dequeue, it is now in running state
+  // Counter `num_running_*_` counts work in this state.
+  // 4) Eventually, the work is finished. We don't need to specifically track
+  // finished work.
 
-  void SchedulePendingCompaction(ColumnFamilyData* cfd);
+  // Returns true if `req` is successfully enqueued.
+  bool EnqueuePendingFlush(const FlushRequest& req);
+
+  void EnqueuePendingCompaction(ColumnFamilyData* cfd);
   void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
                             FileType type, uint64_t number, int job_id);
   static void BGWorkCompaction(void* arg);
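
Note: the comment block added above defines four states for background work; a hypothetical C++ enum (not part of the commit) summarizing how they map onto the counters it names:

  enum class BackgroundWorkState {
    kPending,    // enqueued by EnqueuePendingFlush/EnqueuePendingCompaction;
                 // also called "unscheduled", counted by unscheduled_*_
    kScheduled,  // MaybeScheduleFlushOrCompaction scheduled a thread to run
                 // BGWork* for it; counted by bg_*_scheduled_
    kRunning,    // the thread popped the work off the queue and is executing
                 // it; counted by num_running_*_
    kFinished    // not specifically tracked
  };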
@@ -2946,6 +2963,14 @@ DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
 CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
                                     const MutableCFOptions& mutable_cf_options);
 
+// Return a VersionEdit for the DB's recovery when the `memtables` of the
+// specified column family are obsolete. Specifically, the min log number to
+// keep, and the WAL files that can be deleted.
+VersionEdit GetDBRecoveryEditForObsoletingMemTables(
+    VersionSet* vset, const ColumnFamilyData& cfd,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables, LogsWithPrepTracker* prep_tracker);
+
 // Return the earliest log file to keep after the memtable flush is
 // finalized.
 // `cfd_to_flush` is the column family whose memtable (specified in
@@ -1880,7 +1880,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
     Status status = versions_->LogAndApply(cfd, mutable_cf_options,
                                            read_options, write_options, &edit,
                                            &mutex_, directories_.GetDbDir());
-
+    c->MarkFilesBeingCompacted(false);
     cfd->compaction_picker()->UnregisterCompaction(c.get());
     c.reset();
 
@@ -2377,7 +2377,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
       ColumnFamilyData* loop_cfd =
           req.cfd_to_max_mem_id_to_persist.begin()->first;
       bool already_queued_for_flush = loop_cfd->queued_for_flush();
-      bool flush_req_enqueued = SchedulePendingFlush(req);
+      bool flush_req_enqueued = EnqueuePendingFlush(req);
       if (already_queued_for_flush || flush_req_enqueued) {
         loop_cfd->SetFlushSkipReschedule();
       }
@@ -2528,7 +2528,7 @@ Status DBImpl::AtomicFlushMemTables(
       }
     }
     GenerateFlushRequest(cfds, flush_reason, &flush_req);
-    SchedulePendingFlush(flush_req);
+    EnqueuePendingFlush(flush_req);
     MaybeScheduleFlushOrCompaction();
   }
 
@@ -2583,7 +2583,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
   if (immutable_db_options_.atomic_flush) {
     FlushRequest flush_req;
     GenerateFlushRequest(cfds, flush_reason, &flush_req);
-    SchedulePendingFlush(flush_req);
+    EnqueuePendingFlush(flush_req);
    for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
       flush_memtable_ids.push_back(iter.second);
     }
@@ -2597,7 +2597,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
           flush_reason,
           {{cfd,
             std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}};
-      if (SchedulePendingFlush(flush_req)) {
+      if (EnqueuePendingFlush(flush_req)) {
         cfd->SetFlushSkipReschedule();
       };
     }
@@ -2950,6 +2950,7 @@ void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
   cfd->Ref();
   compaction_queue_.push_back(cfd);
   cfd->set_queued_for_compaction(true);
+  ++unscheduled_compactions_;
 }
 
 ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
@@ -3005,7 +3006,7 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
   return cfd;
 }
 
-bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
+bool DBImpl::EnqueuePendingFlush(const FlushRequest& flush_req) {
   mutex_.AssertHeld();
   bool enqueued = false;
   if (reject_new_background_jobs_) {
@@ -3041,16 +3042,15 @@ bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
   return enqueued;
 }
 
-void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+void DBImpl::EnqueuePendingCompaction(ColumnFamilyData* cfd) {
   mutex_.AssertHeld();
   if (reject_new_background_jobs_) {
     return;
   }
   if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
-    TEST_SYNC_POINT_CALLBACK("SchedulePendingCompaction::cfd",
+    TEST_SYNC_POINT_CALLBACK("EnqueuePendingCompaction::cfd",
                              static_cast<void*>(cfd));
     AddToCompactionQueue(cfd);
-    ++unscheduled_compactions_;
   }
 }
 
@@ -3218,7 +3218,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
 #ifndef NDEBUG
         flush_req.reschedule_count += 1;
 #endif /* !NDEBUG */
-        SchedulePendingFlush(flush_req);
+        EnqueuePendingFlush(flush_req);
         *reason = flush_reason;
         *flush_rescheduled_to_retain_udt = true;
         return Status::TryAgain();
@@ -3678,7 +3678,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           ->ComputeCompactionScore(*(c->immutable_options()),
                                    *(c->mutable_cf_options()));
       AddToCompactionQueue(cfd);
-      ++unscheduled_compactions_;
 
       c.reset();
       // Don't need to sleep here, because BackgroundCallCompaction
@@ -3707,7 +3706,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       if (cfd->NeedsCompaction()) {
         // Yes, we need more compactions!
         AddToCompactionQueue(cfd);
-        ++unscheduled_compactions_;
         MaybeScheduleFlushOrCompaction();
       }
     }
@@ -3997,7 +3995,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                                     *(c->mutable_cf_options()));
       if (!cfd->queued_for_compaction()) {
         AddToCompactionQueue(cfd);
-        ++unscheduled_compactions_;
       }
     }
   }
@@ -4269,7 +4266,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
 
   // Whenever we install new SuperVersion, we might need to issue new flushes or
   // compactions.
-  SchedulePendingCompaction(cfd);
+  EnqueuePendingCompaction(cfd);
   MaybeScheduleFlushOrCompaction();
 
   // Update max_total_in_memory_state_
@@ -47,7 +47,7 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
     // compaction score
     vstorage->ComputeCompactionScore(*cfd->ioptions(),
                                      *cfd->GetLatestMutableCFOptions());
-    SchedulePendingCompaction(cfd);
+    EnqueuePendingCompaction(cfd);
     MaybeScheduleFlushOrCompaction();
   }
   return Status::OK();
@@ -722,6 +722,38 @@ void DBImpl::DeleteObsoleteFiles() {
     mutex_.Lock();
   }
 
+VersionEdit GetDBRecoveryEditForObsoletingMemTables(
+    VersionSet* vset, const ColumnFamilyData& cfd,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables, LogsWithPrepTracker* prep_tracker) {
+  VersionEdit wal_deletion_edit;
+  uint64_t min_wal_number_to_keep = 0;
+  assert(edit_list.size() > 0);
+  if (vset->db_options()->allow_2pc) {
+    // Note that if mempurge is successful, the edit_list will
+    // not be applicable (contains info of new min_log number to keep,
+    // and level 0 file path of SST file created during normal flush,
+    // so both pieces of information are irrelevant after a successful
+    // mempurge operation).
+    min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+        vset, cfd, edit_list, memtables, prep_tracker);
+
+    // We piggyback the information of earliest log file to keep in the
+    // manifest entry for the last file flushed.
+  } else {
+    min_wal_number_to_keep =
+        PrecomputeMinLogNumberToKeepNon2PC(vset, cfd, edit_list);
+  }
+
+  wal_deletion_edit.SetMinLogNumberToKeep(min_wal_number_to_keep);
+  if (vset->db_options()->track_and_verify_wals_in_manifest) {
+    if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
+      wal_deletion_edit.DeleteWalsBefore(min_wal_number_to_keep);
+    }
+  }
+  return wal_deletion_edit;
+}
+
 uint64_t FindMinPrepLogReferencedByMemTable(
     VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
   uint64_t min_log = 0;
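
Note: the intended call pattern for the new helper, as it appears in the memtable_list.cc hunk later in this diff (variable names as used there):

  VersionEdit edit = GetDBRecoveryEditForObsoletingMemTables(
      vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
  // The returned edit carries the min WAL number to keep and, when
  // track_and_verify_wals_in_manifest is set, the WALs deletable before it;
  // appending it to edit_list persists that recovery state in the manifest.
  edit_list.push_back(&edit);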
@@ -1681,7 +1681,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   meta.oldest_ancester_time = current_time;
   meta.epoch_number = cfd->NewEpochNumber();
   {
-    auto write_hint = cfd->CalculateSSTWriteHint(0);
+    auto write_hint =
+        cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
     mutex_.Unlock();
 
     SequenceNumber earliest_write_conflict_snapshot;
@@ -1789,13 +1789,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
       if (!immutable_db_options_.atomic_flush) {
         FlushRequest flush_req;
         GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req);
-        SchedulePendingFlush(flush_req);
+        EnqueuePendingFlush(flush_req);
       }
     }
     if (immutable_db_options_.atomic_flush) {
       FlushRequest flush_req;
       GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req);
-      SchedulePendingFlush(flush_req);
+      EnqueuePendingFlush(flush_req);
     }
     MaybeScheduleFlushOrCompaction();
   }
@@ -1881,13 +1881,13 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
         FlushRequest flush_req;
         GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager,
                              &flush_req);
-        SchedulePendingFlush(flush_req);
+        EnqueuePendingFlush(flush_req);
       }
     }
     if (immutable_db_options_.atomic_flush) {
       FlushRequest flush_req;
       GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req);
-      SchedulePendingFlush(flush_req);
+      EnqueuePendingFlush(flush_req);
     }
     MaybeScheduleFlushOrCompaction();
   }
@@ -2163,12 +2163,12 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
     AssignAtomicFlushSeq(cfds);
     FlushRequest flush_req;
     GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req);
-    SchedulePendingFlush(flush_req);
+    EnqueuePendingFlush(flush_req);
   } else {
     for (auto* cfd : cfds) {
       FlushRequest flush_req;
       GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req);
-      SchedulePendingFlush(flush_req);
+      EnqueuePendingFlush(flush_req);
     }
   }
   MaybeScheduleFlushOrCompaction();
@@ -895,6 +895,81 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
+TEST_P(DBIOCorruptionTest, FooterReadCorruptionRetry) {
+  Random rnd(300);
+  bool retry = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ReadFooterFromFileInternal:0", [&](void* arg) {
+        Slice* data = static_cast<Slice*>(arg);
+        if (!retry) {
+          std::memcpy(const_cast<char*>(data->data()),
+                      rnd.RandomString(static_cast<int>(data->size())).c_str(),
+                      data->size());
+          retry = true;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key1", "val1"));
+  Status s = Flush();
+  if (std::get<2>(GetParam())) {
+    ASSERT_OK(s);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+              1);
+
+    std::string val;
+    ReadOptions ro;
+    ro.async_io = std::get<1>(GetParam());
+    ASSERT_OK(dbfull()->Get(ro, "key1", &val));
+    ASSERT_EQ(val, "val1");
+  } else {
+    ASSERT_NOK(s);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
+    ASSERT_GT(stats()->getTickerCount(SST_FOOTER_CORRUPTION_COUNT), 0);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBIOCorruptionTest, TablePropertiesCorruptionRetry) {
+  Random rnd(300);
+  bool retry = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ReadTablePropertiesHelper:0", [&](void* arg) {
+        Slice* data = static_cast<Slice*>(arg);
+        if (!retry) {
+          std::memcpy(const_cast<char*>(data->data()),
+                      rnd.RandomString(static_cast<int>(data->size())).c_str(),
+                      data->size());
+          retry = true;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key1", "val1"));
+  Status s = Flush();
+  if (std::get<2>(GetParam())) {
+    ASSERT_OK(s);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+              1);
+
+    std::string val;
+    ReadOptions ro;
+    ro.async_io = std::get<1>(GetParam());
+    ASSERT_OK(dbfull()->Get(ro, "key1", &val));
+    ASSERT_EQ(val, "val1");
+  } else {
+    ASSERT_NOK(s);
+    ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 // The parameters are - 1. Use FS provided buffer, 2. Use async IO ReadOption,
 // 3. Retry with verify_and_reconstruct_read IOOption
 INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest,
@@ -113,7 +113,7 @@ Status ExternalSstFileIngestionJob::Prepare(
     const std::string path_outside_db = f.external_file_path;
     const std::string path_inside_db = TableFileName(
         cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
-    if (ingestion_options_.move_files) {
+    if (ingestion_options_.move_files || ingestion_options_.link_files) {
       status =
           fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
       if (status.ok()) {
@@ -626,8 +626,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
     DeleteInternalFiles();
     consumed_seqno_count_ = 0;
     files_overlap_ = false;
-  } else if (status.ok() && ingestion_options_.move_files &&
-             !ingestion_options_.allow_db_generated_files) {
+  } else if (status.ok() && ingestion_options_.move_files) {
     // The files were moved and added successfully, remove original file links
     for (IngestedFileInfo& f : files_to_ingest_) {
       Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
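
Note: a minimal sketch of how the two ingestion options interact after this change. The option names and the error string are from this diff; the file path and db handle are assumed for illustration:

  IngestExternalFileOptions ifo;
  ifo.link_files = true;  // hard-link the external file, keep the original
  // ifo.move_files = true;  // hard-link, then delete the original on success
  // Setting both is rejected up front: IngestExternalFiles() returns
  // Status::InvalidArgument("`move_files` and `link_files` can not both be
  // true.").
  Status s = db->IngestExternalFile({"/path/to/file1.sst"}, ifo);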
@@ -66,7 +66,7 @@ class ExternalSSTFileTestBase : public DBTestBase {
 
 class ExternSSTFileLinkFailFallbackTest
     : public ExternalSSTFileTestBase,
-      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
  public:
  ExternSSTFileLinkFailFallbackTest() {
    fs_ = std::make_shared<ExternalSSTTestFS>(env_->GetFileSystem(), true);
@@ -2210,7 +2210,8 @@ TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
   DestroyAndReopen(options_);
   const int kNumKeys = 10000;
   IngestExternalFileOptions ifo;
-  ifo.move_files = true;
+  ifo.move_files = std::get<2>(GetParam());
+  ifo.link_files = !ifo.move_files;
   ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
 
   std::string file_path = sst_files_dir_ + "file1.sst";
@@ -2251,6 +2252,13 @@
     ASSERT_EQ(0, bytes_copied);
     ASSERT_EQ(file_size, bytes_moved);
     ASSERT_FALSE(copyfile);
+
+    Status es = env_->FileExists(file_path);
+    if (ifo.move_files) {
+      ASSERT_TRUE(es.IsNotFound());
+    } else {
+      ASSERT_OK(es);
+    }
   } else {
     // Link operation fails.
     ASSERT_EQ(0, bytes_moved);
@@ -2269,6 +2277,11 @@
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Combine(testing::Bool(), testing::Bool(),
+                                         testing::Bool()));
+
 class TestIngestExternalFileListener : public EventListener {
  public:
  void OnExternalFileIngested(DB* /*db*/,
@@ -3719,19 +3732,13 @@ TEST_F(ExternalSSTFileWithTimestampTest, TimestampsNotPersistedBasic) {
 INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
                         testing::Combine(testing::Bool(), testing::Bool()));
 
-INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
-                        ExternSSTFileLinkFailFallbackTest,
-                        testing::Values(std::make_tuple(true, false),
-                                        std::make_tuple(true, true),
-                                        std::make_tuple(false, false)));
-
 class IngestDBGeneratedFileTest
     : public ExternalSSTFileTestBase,
       public ::testing::WithParamInterface<std::tuple<bool, bool>> {
  public:
  IngestDBGeneratedFileTest() {
    ingest_opts.allow_db_generated_files = true;
-    ingest_opts.move_files = std::get<0>(GetParam());
+    ingest_opts.link_files = std::get<0>(GetParam());
    ingest_opts.verify_checksums_before_ingest = std::get<1>(GetParam());
    ingest_opts.snapshot_consistency = false;
  }
@@ -3744,10 +3751,10 @@ INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest,
                         testing::Combine(testing::Bool(), testing::Bool()));
 
 TEST_P(IngestDBGeneratedFileTest, FailureCase) {
-  if (encrypted_env_ && ingest_opts.move_files) {
+  if (encrypted_env_ && ingest_opts.link_files) {
     // FIXME: should fail ingestion or support this combination.
     ROCKSDB_GTEST_SKIP(
-        "Encrypted env and move_files do not work together, as we reopen the "
+        "Encrypted env and link_files do not work together, as we reopen the "
         "file after linking it which appends an extra encryption prefix.");
     return;
   }
@@ -3943,7 +3950,7 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
   ingest_opts.allow_global_seqno = std::get<1>(GetParam());
   ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
   ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
-  ingest_opts.move_files = std::get<4>(GetParam());
+  ingest_opts.link_files = std::get<4>(GetParam());
 
   do {
     SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
@@ -630,7 +630,7 @@ Status FlushJob::MemPurge() {
       new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber());
 
       // This addition will not trigger another flush, because
-      // we do not call SchedulePendingFlush().
+      // we do not call EnqueuePendingFlush().
       cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
       new_mem->Ref();
       // Piggyback FlushJobInfo on the first flushed memtable.
@@ -861,7 +861,7 @@ Status FlushJob::WriteLevel0Table() {
   std::vector<BlobFileAddition> blob_file_additions;
 
   {
-    auto write_hint = cfd_->CalculateSSTWriteHint(0);
+    auto write_hint = base_->storage_info()->CalculateSSTWriteHint(/*level=*/0);
     Env::IOPriority io_priority = GetRateLimiterPriority();
     db_mutex_->Unlock();
     if (log_buffer_) {
@@ -582,37 +582,28 @@ Status MemTableList::TryInstallMemtableFlushResults(
 
     // TODO(myabandeh): Not sure how batch_count could be 0 here.
     if (batch_count > 0) {
-      uint64_t min_wal_number_to_keep = 0;
-      assert(edit_list.size() > 0);
-      if (vset->db_options()->allow_2pc) {
-        // Note that if mempurge is successful, the edit_list will
-        // not be applicable (contains info of new min_log number to keep,
-        // and level 0 file path of SST file created during normal flush,
-        // so both pieces of information are irrelevant after a successful
-        // mempurge operation).
-        min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
-            vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
-
-        // We piggyback the information of earliest log file to keep in the
-        // manifest entry for the last file flushed.
+      VersionEdit edit;
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+      if (memtables_to_flush.size() == memlist.size()) {
+        // TODO(yuzhangyu): remove this testing code once the
+        // `GetEditForDroppingCurrentVersion` API is used by the atomic data
+        // replacement. This function can get the same edits for wal related
+        // fields, and some duplicated fields as contained already in edit_list
+        // for column family's recovery.
+        edit = GetEditForDroppingCurrentVersion(cfd, vset, prep_tracker);
       } else {
-        min_wal_number_to_keep =
-            PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
+        edit = GetDBRecoveryEditForObsoletingMemTables(
+            vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
       }
-
-      VersionEdit wal_deletion;
-      wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
-      if (vset->db_options()->track_and_verify_wals_in_manifest) {
-        if (min_wal_number_to_keep >
-            vset->GetWalSet().GetMinWalNumberToKeep()) {
-          wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
-        }
-        TEST_SYNC_POINT_CALLBACK(
-            "MemTableList::TryInstallMemtableFlushResults:"
-            "AfterComputeMinWalToKeep",
-            nullptr);
-      }
-      edit_list.push_back(&wal_deletion);
+#else
+      edit = GetDBRecoveryEditForObsoletingMemTables(
+          vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+      TEST_SYNC_POINT_CALLBACK(
+          "MemTableList::TryInstallMemtableFlushResults:"
+          "AfterComputeMinWalToKeep",
+          nullptr);
+      edit_list.push_back(&edit);
 
       const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
                                       to_delete, mu](const Status& status) {
@@ -1026,4 +1017,38 @@ void MemTableList::RemoveOldMemTables(uint64_t log_number,
   ResetTrimHistoryNeeded();
 }
 
+VersionEdit MemTableList::GetEditForDroppingCurrentVersion(
+    const ColumnFamilyData* cfd, VersionSet* vset,
+    LogsWithPrepTracker* prep_tracker) const {
+  assert(cfd);
+  auto& memlist = current_->memlist_;
+  if (memlist.empty()) {
+    return VersionEdit();
+  }
+
+  uint64_t max_next_log_number = 0;
+  autovector<VersionEdit*> edit_list;
+  autovector<MemTable*> memtables_to_drop;
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    memtables_to_drop.push_back(m);
+    max_next_log_number = std::max(m->GetNextLogNumber(), max_next_log_number);
+  }
+
+  // Check the obsoleted MemTables' impact on WALs related to DB's recovery
+  // (min log number to keep, a delta of WAL files to delete).
+  VersionEdit edit_with_log_number;
+  edit_with_log_number.SetPrevLogNumber(0);
+  edit_with_log_number.SetLogNumber(max_next_log_number);
+  edit_list.push_back(&edit_with_log_number);
+  VersionEdit edit = GetDBRecoveryEditForObsoletingMemTables(
+      vset, *cfd, edit_list, memtables_to_drop, prep_tracker);
+
+  // Set fields related to the column family's recovery.
+  edit.SetColumnFamily(cfd->GetID());
+  edit.SetPrevLogNumber(0);
+  edit.SetLogNumber(max_next_log_number);
+  return edit;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
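The new GetEditForDroppingCurrentVersion walks the immutable memtable list and records the largest GetNextLogNumber(), so recovery can start at the first WAL whose data is not fully covered by the dropped memtables. A self-contained sketch of that scan, using a simplified memtable stand-in (all types here are illustrative, not the RocksDB internals):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in: a memtable that remembers the first WAL that may
// contain data written after it, i.e. where recovery must resume.
struct FakeMemTable {
  uint64_t next_log_number;
  uint64_t GetNextLogNumber() const { return next_log_number; }
};

// Mirror of the scan above: dropping all listed memtables means recovery
// can begin at the max next-log-number among them.
uint64_t MaxNextLogNumber(const std::vector<FakeMemTable>& memlist) {
  uint64_t max_next_log_number = 0;
  for (const auto& m : memlist) {
    max_next_log_number = std::max(m.GetNextLogNumber(), max_next_log_number);
  }
  return max_next_log_number;
}

int main() {
  std::vector<FakeMemTable> memlist = {{5}, {9}, {7}};
  std::cout << "recovery starts at WAL " << MaxNextLogNumber(memlist) << "\n";
  // prints: recovery starts at WAL 9
}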
@@ -447,6 +447,12 @@ class MemTableList {
   void RemoveOldMemTables(uint64_t log_number,
                           autovector<MemTable*>* to_delete);
 
+  // This API is only used by atomic data replacement, to get an edit for
+  // dropping the current `MemTableListVersion`.
+  VersionEdit GetEditForDroppingCurrentVersion(
+      const ColumnFamilyData* cfd, VersionSet* vset,
+      LogsWithPrepTracker* prep_tracker) const;
+
  private:
   friend Status InstallMemtableAtomicFlushResults(
       const autovector<MemTableList*>* imm_lists,
@@ -451,7 +451,8 @@ class Repairer {
     meta.file_creation_time = current_time;
     SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
 
-    auto write_hint = cfd->CalculateSSTWriteHint(0);
+    auto write_hint =
+        cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
     std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
         range_del_iters;
     auto range_del_iter = mem->NewRangeTombstoneIterator(
@@ -44,8 +44,8 @@
 #include "table/compaction_merging_iterator.h"
 
 #if USE_COROUTINES
-#include "folly/experimental/coro/BlockingWait.h"
-#include "folly/experimental/coro/Collect.h"
+#include "folly/coro/BlockingWait.h"
+#include "folly/coro/Collect.h"
 #endif
 #include "file/filename.h"
 #include "file/random_access_file_reader.h"
@@ -4919,6 +4919,27 @@ bool VersionStorageInfo::RangeMightExistAfterSortedRun(
   return false;
 }
 
+Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint(
+    int level) const {
+  if (compaction_style_ != kCompactionStyleLevel) {
+    return Env::WLTH_NOT_SET;
+  }
+  if (level == 0) {
+    return Env::WLTH_MEDIUM;
+  }
+
+  // L1: medium, L2: long, ...
+  if (level - base_level_ >= 2) {
+    return Env::WLTH_EXTREME;
+  } else if (level < base_level_) {
+    // There is no restriction which prevents level passed in to be smaller
+    // than base_level.
+    return Env::WLTH_MEDIUM;
+  }
+  return static_cast<Env::WriteLifeTimeHint>(
+      level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
 void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
                            std::vector<uint64_t>* live_blob_files) const {
   assert(live_table_files);
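The level-to-write-lifetime mapping above can be exercised standalone. The sketch below reimplements the same arithmetic against a local enum so the mapping is easy to eyeball; the hint names mimic Env::WriteLifeTimeHint's ordering but are assumptions local to this example:

#include <iostream>

// Local mirror of a write-life-time hint ordering (illustrative values).
enum WriteLifeTimeHint {
  WLTH_NOT_SET = 0, WLTH_NONE, WLTH_SHORT, WLTH_MEDIUM, WLTH_LONG, WLTH_EXTREME
};

// Same arithmetic as CalculateSSTWriteHint for level-style compaction:
// L0 and levels below base get MEDIUM, then the hint grows with the
// distance from base_level, capped at EXTREME.
WriteLifeTimeHint CalcHint(int level, int base_level) {
  if (level == 0) return WLTH_MEDIUM;
  if (level < base_level) return WLTH_MEDIUM;
  if (level - base_level >= 2) return WLTH_EXTREME;
  return static_cast<WriteLifeTimeHint>(level - base_level +
                                        static_cast<int>(WLTH_MEDIUM));
}

int main() {
  for (int level = 0; level <= 6; ++level) {
    // With base_level=1: L0/L1 -> MEDIUM(3), L2 -> LONG(4), L3+ -> EXTREME(5)
    std::cout << "L" << level << " -> hint "
              << CalcHint(level, /*base_level=*/1) << "\n";
  }
}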
@@ -49,8 +49,8 @@
 #include "db/write_controller.h"
 #include "env/file_system_tracer.h"
 #if USE_COROUTINES
-#include "folly/experimental/coro/BlockingWait.h"
-#include "folly/experimental/coro/Collect.h"
+#include "folly/coro/BlockingWait.h"
+#include "folly/coro/Collect.h"
 #endif
 #include "monitoring/instrumented_mutex.h"
 #include "options/db_options.h"
@@ -626,6 +626,8 @@ class VersionStorageInfo {
                                     const Slice& largest_user_key,
                                     int last_level, int last_l0_idx);
 
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level) const;
+
  private:
   void ComputeCompensatedSizes();
   void UpdateNumNonEmptyLevels();
@@ -99,6 +99,7 @@ void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
       size_added_by_compaction += filemeta->fd.GetFileSize();
     }
   }
+  assert(cur_compactions_reserved_size_ >= size_added_by_compaction);
   cur_compactions_reserved_size_ -= size_added_by_compaction;
 }
 
@@ -450,7 +451,6 @@ void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
     // File was added before, we will just update the size
     total_files_size_ -= tracked_file->second;
     total_files_size_ += file_size;
-    cur_compactions_reserved_size_ -= file_size;
   } else {
     total_files_size_ += file_size;
   }
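The added assert guards the invariant that the removed decrement in OnAddFileImpl could violate: compaction-reserved bytes may only be released for sizes that were actually reserved. A tiny standalone model of that accounting, with invented names:

#include <cassert>
#include <cstdint>

// Minimal model of reservation accounting: releases must never exceed what
// was reserved, which the assert makes explicit.
struct ReservationTracker {
  uint64_t reserved = 0;
  void Reserve(uint64_t bytes) { reserved += bytes; }
  void Release(uint64_t bytes) {
    assert(reserved >= bytes);  // mirrors the new assert in the diff
    reserved -= bytes;
  }
};

int main() {
  ReservationTracker t;
  t.Reserve(100);
  t.Release(60);
  t.Release(40);
  // t.Release(1);  // would trip the assert: nothing left to release
  return 0;
}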
@@ -411,6 +411,14 @@ class Cache {
                          const CacheItemHelper* helper)>& callback,
       const ApplyToAllEntriesOptions& opts) = 0;
 
+  // Apply a callback to a cache handle. The Cache must ensure the lifetime
+  // of the key passed to the callback is valid for the duration of the
+  // callback. The handle may not belong to the cache, but is guaranteed to
+  // be type compatible.
+  virtual void ApplyToHandle(
+      Cache* cache, Handle* handle,
+      const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
+                               const CacheItemHelper* helper)>& callback) = 0;
+
   // Remove all entries.
   // Prerequisite: no entry is referenced.
   virtual void EraseUnRefEntries() = 0;
@@ -636,6 +644,15 @@ class CacheWrapper : public Cache {
     target_->ApplyToAllEntries(callback, opts);
   }
 
+  virtual void ApplyToHandle(
+      Cache* cache, Handle* handle,
+      const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
+                               const CacheItemHelper* helper)>& callback)
+      override {
+    auto cache_ptr = static_cast<CacheWrapper*>(cache);
+    target_->ApplyToHandle(cache_ptr->target_.get(), handle, callback);
+  }
+
   void EraseUnRefEntries() override { target_->EraseUnRefEntries(); }
 
   void StartAsyncLookup(AsyncLookupHandle& async_handle) override {
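The unwrap step in CacheWrapper::ApplyToHandle is the subtle part: the handle was produced by the innermost cache, so each wrapper must forward its own target as the cache argument until the call lands on the cache that owns the handle. A minimal standalone model of that delegation chain — toy types, not the RocksDB Cache API:

#include <iostream>
#include <string>

// Toy model: a base cache interface whose ApplyToHandle must run on the
// cache instance that actually created the handle.
struct ToyCache {
  virtual ~ToyCache() = default;
  virtual void ApplyToHandle(ToyCache* cache, const std::string& handle) {
    (void)cache;
    std::cout << "base cache applies callback to " << handle << "\n";
  }
};

// A wrapper forwards to its target, unwrapping `cache` one layer per hop so
// the callback eventually runs against the innermost cache.
struct ToyWrapper : ToyCache {
  explicit ToyWrapper(ToyCache* t) : target(t) {}
  void ApplyToHandle(ToyCache* cache, const std::string& handle) override {
    auto* wrapper = static_cast<ToyWrapper*>(cache);
    target->ApplyToHandle(wrapper->target, handle);
  }
  ToyCache* target;
};

int main() {
  ToyCache base;
  ToyWrapper inner(&base), outer(&inner);
  outer.ApplyToHandle(&outer, "h1");  // unwraps twice, lands on `base`
}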
@@ -56,7 +56,9 @@ struct ConfigOptions {
     // setting
   };
 
-  // When true, any unused options will be ignored and OK will be returned
+  // When true, any unused options will be ignored and OK will be returned.
+  // For options files that appear to be from the current version or earlier,
+  // unknown options are considered corruption regardless of this setting.
   bool ignore_unknown_options = false;
 
   // When true, any unsupported options will be ignored and OK will be returned
|
||||||
// IngestExternalFileOptions is used by IngestExternalFile()
|
// IngestExternalFileOptions is used by IngestExternalFile()
|
||||||
struct IngestExternalFileOptions {
|
struct IngestExternalFileOptions {
|
||||||
// Can be set to true to move the files instead of copying them.
|
// Can be set to true to move the files instead of copying them.
|
||||||
// Note that original file links will be removed after successful ingestion,
|
// The input files will be unlinked after successful ingestion.
|
||||||
// unless `allow_db_generated_files` is true.
|
// The implementation depends on hard links (LinkFile) instead of traditional
|
||||||
|
// move (RenameFile) to maximize the chances to restore to the original
|
||||||
|
// state upon failure.
|
||||||
bool move_files = false;
|
bool move_files = false;
|
||||||
// If set to true, ingestion falls back to copy when move fails.
|
// Same as move_files except that input files will NOT be unlinked.
|
||||||
|
// Only one of `move_files` and `link_files` can be set at the same time.
|
||||||
|
bool link_files = false;
|
||||||
|
// If set to true, ingestion falls back to copy when hard linking fails.
|
||||||
|
// This applies to both `move_files` and `link_files`.
|
||||||
bool failed_move_fall_back_to_copy = true;
|
bool failed_move_fall_back_to_copy = true;
|
||||||
// If set to false, an ingested file keys could appear in existing snapshots
|
// If set to false, an ingested file keys could appear in existing snapshots
|
||||||
// that where created before the file was ingested.
|
// that where created before the file was ingested.
|
||||||
|
@ -2209,8 +2215,6 @@ struct IngestExternalFileOptions {
|
||||||
// Enables ingestion of files not generated by SstFileWriter. When true:
|
// Enables ingestion of files not generated by SstFileWriter. When true:
|
||||||
// - Allows files to be ingested when their cf_id doesn't match the CF they
|
// - Allows files to be ingested when their cf_id doesn't match the CF they
|
||||||
// are being ingested into.
|
// are being ingested into.
|
||||||
// - Preserves original file links after successful ingestion when
|
|
||||||
// `move_files = true`.
|
|
||||||
// REQUIREMENTS:
|
// REQUIREMENTS:
|
||||||
// - Ingested files must not overlap with existing keys.
|
// - Ingested files must not overlap with existing keys.
|
||||||
// - `write_global_seqno` must be false.
|
// - `write_global_seqno` must be false.
|
||||||
|
|
|
@@ -296,12 +296,13 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in,
       return s;
     }
 
-    // If the option file is not generated by a higher minor version,
-    // there shouldn't be any unknown option.
+    // If the option file is not generated by a higher version, unknown
+    // option should only mean corruption.
     if (config_options.ignore_unknown_options &&
         section == kOptionSectionVersion) {
-      if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR &&
-                                            db_version[1] <= ROCKSDB_MINOR)) {
+      using VTuple = std::tuple<int, int, int>;
+      if (VTuple(db_version[0], db_version[1], db_version[2]) <=
+          VTuple(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)) {
         config_options.ignore_unknown_options = false;
       }
     }
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(OptionsParserTest, IgnoreUnknownOptions) {
|
TEST_F(OptionsParserTest, IgnoreUnknownOptions) {
|
||||||
for (int case_id = 0; case_id < 5; case_id++) {
|
auto testCase = [&](bool should_ignore, const std::string& version_string) {
|
||||||
DBOptions db_opt;
|
SCOPED_TRACE(std::to_string(should_ignore) + ", " + version_string);
|
||||||
db_opt.max_open_files = 12345;
|
|
||||||
db_opt.max_background_flushes = 301;
|
|
||||||
db_opt.max_total_wal_size = 1024;
|
|
||||||
ColumnFamilyOptions cf_opt;
|
|
||||||
|
|
||||||
std::string version_string;
|
|
||||||
bool should_ignore = true;
|
|
||||||
if (case_id == 0) {
|
|
||||||
// same version
|
|
||||||
should_ignore = false;
|
|
||||||
version_string = std::to_string(ROCKSDB_MAJOR) + "." +
|
|
||||||
std::to_string(ROCKSDB_MINOR) + ".0";
|
|
||||||
} else if (case_id == 1) {
|
|
||||||
// higher minor version
|
|
||||||
should_ignore = true;
|
|
||||||
version_string = std::to_string(ROCKSDB_MAJOR) + "." +
|
|
||||||
std::to_string(ROCKSDB_MINOR + 1) + ".0";
|
|
||||||
} else if (case_id == 2) {
|
|
||||||
// higher major version.
|
|
||||||
should_ignore = true;
|
|
||||||
version_string = std::to_string(ROCKSDB_MAJOR + 1) + ".0.0";
|
|
||||||
} else if (case_id == 3) {
|
|
||||||
// lower minor version
|
|
||||||
#if ROCKSDB_MINOR == 0
|
|
||||||
continue;
|
|
||||||
#else
|
|
||||||
version_string = std::to_string(ROCKSDB_MAJOR) + "." +
|
|
||||||
std::to_string(ROCKSDB_MINOR - 1) + ".0";
|
|
||||||
should_ignore = false;
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
// lower major version
|
|
||||||
should_ignore = false;
|
|
||||||
version_string = std::to_string(ROCKSDB_MAJOR - 1) + "." +
|
|
||||||
std::to_string(ROCKSDB_MINOR) + ".0";
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string options_file_content =
|
std::string options_file_content =
|
||||||
"# This is a testing option string.\n"
|
"# This is a testing option string.\n"
|
||||||
|
@ -3519,16 +3483,45 @@ TEST_F(OptionsParserTest, IgnoreUnknownOptions) {
|
||||||
RocksDBOptionsParser parser;
|
RocksDBOptionsParser parser;
|
||||||
ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false,
|
ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false,
|
||||||
4096 /* readahead_size */));
|
4096 /* readahead_size */));
|
||||||
|
Status parse_status = parser.Parse(kTestFileName, fs_.get(),
|
||||||
|
true /* ignore_unknown_options */,
|
||||||
|
4096 /* readahead_size */);
|
||||||
if (should_ignore) {
|
if (should_ignore) {
|
||||||
ASSERT_OK(parser.Parse(kTestFileName, fs_.get(),
|
ASSERT_OK(parse_status);
|
||||||
true /* ignore_unknown_options */,
|
|
||||||
4096 /* readahead_size */));
|
|
||||||
} else {
|
} else {
|
||||||
ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(),
|
ASSERT_NOK(parse_status);
|
||||||
true /* ignore_unknown_options */,
|
|
||||||
4096 /* readahead_size */));
|
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
|
// Same version
|
||||||
|
testCase(false, GetRocksVersionAsString());
|
||||||
|
// Same except .0 patch
|
||||||
|
testCase(false, std::to_string(ROCKSDB_MAJOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR) + ".0");
|
||||||
|
// Higher major version
|
||||||
|
testCase(true, std::to_string(ROCKSDB_MAJOR + 1) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR) + ".0");
|
||||||
|
// Higher minor version
|
||||||
|
testCase(true, std::to_string(ROCKSDB_MAJOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR + 1) + ".0");
|
||||||
|
// Higher patch version
|
||||||
|
testCase(true, std::to_string(ROCKSDB_MAJOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_PATCH + 1));
|
||||||
|
// Lower major version
|
||||||
|
testCase(false, std::to_string(ROCKSDB_MAJOR - 1) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR) + ".0");
|
||||||
|
#if ROCKSDB_MINOR > 0
|
||||||
|
// Lower minor version
|
||||||
|
testCase(false, std::to_string(ROCKSDB_MAJOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR - 1) + ".0");
|
||||||
|
#endif
|
||||||
|
#if ROCKSDB_PATCH > 0
|
||||||
|
// Lower patch version
|
||||||
|
testCase(false, std::to_string(ROCKSDB_MAJOR) + "." +
|
||||||
|
std::to_string(ROCKSDB_MINOR - 1) + "." +
|
||||||
|
std::to_string(ROCKSDB_PATCH - 1));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(OptionsParserTest, ParseVersion) {
|
TEST_F(OptionsParserTest, ParseVersion) {
|
||||||
|
|
|
@@ -680,26 +680,12 @@ Status BlockBasedTable::Open(
   if (s.ok()) {
     s = ReadFooterFromFile(opts, file.get(), *ioptions.fs,
                            prefetch_buffer.get(), file_size, &footer,
-                           kBlockBasedTableMagicNumber);
-  }
-  // If the footer is corrupted and the FS supports checksum verification and
-  // correction, try reading the footer again
-  if (s.IsCorruption()) {
-    RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
-    if (CheckFSFeatureSupport(ioptions.fs.get(),
-                              FSSupportedOps::kVerifyAndReconstructRead)) {
-      IOOptions retry_opts = opts;
-      retry_opts.verify_and_reconstruct_read = true;
-      s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs,
-                             prefetch_buffer.get(), file_size, &footer,
-                             kBlockBasedTableMagicNumber);
-      RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
-      if (s.ok()) {
-        RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
-      }
-    }
+                           kBlockBasedTableMagicNumber, ioptions.stats);
   }
   if (!s.ok()) {
+    if (s.IsCorruption()) {
+      RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
+    }
     return s;
   }
   if (!IsSupportedFormatVersion(footer.format_version())) {
@@ -91,9 +91,24 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     uint64_t alt_hash = GetSliceHash64(alt);
     std::optional<uint64_t> prev_key_hash;
     std::optional<uint64_t> prev_alt_hash = hash_entries_info_.prev_alt_hash;
 
     if (!hash_entries_info_.entries.empty()) {
       prev_key_hash = hash_entries_info_.entries.back();
     }
 
+#ifdef ROCKSDB_VALGRIND_RUN
+    // Valgrind can report uninitialized FPs on std::optional usage. See e.g.
+    // https://stackoverflow.com/q/51616179
+    if (!prev_key_hash.has_value()) {
+      std::memset((void*)&prev_key_hash, 0, sizeof(prev_key_hash));
+      prev_key_hash.reset();
+    }
+    if (!prev_alt_hash.has_value()) {
+      std::memset((void*)&prev_alt_hash, 0, sizeof(prev_key_hash));
+      prev_alt_hash.reset();
+    }
+#endif
+
     // Add alt first, so that entries.back() always contains previous key
     // ASSUMING a change from one alt to the next implies a change to
     // corresponding key
@@ -295,15 +310,6 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
   bool detect_filter_construct_corruption_;
 
   struct HashEntriesInfo {
-#ifdef ROCKSDB_VALGRIND_RUN
-    HashEntriesInfo() {
-      // Valgrind can report uninitialized FPs on std::optional usage. See e.g.
-      // https://stackoverflow.com/q/51616179
-      std::memset((void*)&prev_alt_hash, 0, sizeof(prev_alt_hash));
-      prev_alt_hash = {};
-    }
-#endif
-
     // A deque avoids unnecessary copying of already-saved values
     // and has near-minimal peak memory use.
     std::deque<uint64_t> entries;
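The relocated workaround zeroes the optional's raw storage and then reset()s it, so valgrind sees initialized bytes while the program still observes an empty optional. A standalone reproduction of the trick — behavior-wise a no-op, purely to pacify the checker:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <optional>

int main() {
  std::optional<uint64_t> prev_hash;
  // Valgrind may flag conditional jumps that touch the uninitialized payload
  // bytes of an empty optional. Zero the raw storage, then restore the
  // empty state so program semantics are unchanged.
  if (!prev_hash.has_value()) {
    std::memset((void*)&prev_hash, 0, sizeof(prev_hash));
    prev_hash.reset();
  }
  assert(!prev_hash.has_value());  // still empty, but bytes are initialized
  return 0;
}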
@@ -475,10 +475,12 @@ std::string Footer::ToString() const {
   return result;
 }
 
-Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
-                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
-                          uint64_t file_size, Footer* footer,
-                          uint64_t enforce_table_magic_number) {
+static Status ReadFooterFromFileInternal(const IOOptions& opts,
+                                         RandomAccessFileReader* file,
+                                         FileSystem& fs,
+                                         FilePrefetchBuffer* prefetch_buffer,
+                                         uint64_t file_size, Footer* footer,
+                                         uint64_t enforce_table_magic_number) {
   if (file_size < Footer::kMinEncodedLength) {
     return Status::Corruption("file is too short (" +
                               std::to_string(file_size) +
@@ -516,6 +518,8 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
     }
   }
 
+  TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
+
   // Check that we actually read the whole footer from the file. It may be
   // that size isn't correct.
   if (footer_input.size() < Footer::kMinEncodedLength) {
@@ -543,6 +547,30 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
   return Status::OK();
 }
 
+Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
+                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
+                          uint64_t file_size, Footer* footer,
+                          uint64_t enforce_table_magic_number,
+                          Statistics* stats) {
+  Status s =
+      ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size,
+                                 footer, enforce_table_magic_number);
+  if (s.IsCorruption() &&
+      CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
+    IOOptions new_opts = opts;
+    new_opts.verify_and_reconstruct_read = true;
+    footer->Reset();
+    s = ReadFooterFromFileInternal(new_opts, file, fs, prefetch_buffer,
+                                   file_size, footer,
+                                   enforce_table_magic_number);
+    RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
+    if (s.ok()) {
+      RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
+    }
+  }
+  return s;
+}
+
 namespace {
 // Custom handling for the last byte of a block, to avoid invoking streaming
 // API to get an effective block checksum. This function is its own inverse
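The new ReadFooterFromFile wrapper centralizes a general retry-on-corruption shape: attempt the read, and if the failure is corruption and the storage layer can verify-and-reconstruct, reset state and retry once with the stronger read mode, bumping tickers either way. A standalone sketch of that shape, where the status, ticker, and I/O types are stand-ins rather than RocksDB's:

#include <functional>
#include <iostream>

enum class Code { kOk, kCorruption };
struct ToyStatus {
  Code code = Code::kOk;
  bool ok() const { return code == Code::kOk; }
  bool IsCorruption() const { return code == Code::kCorruption; }
};

// Generic shape of the wrapper above: retry exactly once with a stronger
// "verify and reconstruct" read if the first attempt saw corruption.
ToyStatus ReadWithRetry(const std::function<ToyStatus(bool strong)>& read,
                        bool fs_can_reconstruct, int* retry_ticker,
                        int* success_ticker) {
  ToyStatus s = read(/*strong=*/false);
  if (s.IsCorruption() && fs_can_reconstruct) {
    s = read(/*strong=*/true);
    ++*retry_ticker;      // akin to FILE_READ_CORRUPTION_RETRY_COUNT
    if (s.ok()) {
      ++*success_ticker;  // akin to FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT
    }
  }
  return s;
}

int main() {
  int retries = 0, successes = 0, attempts = 0;
  auto flaky = [&](bool strong) {
    ++attempts;
    return strong ? ToyStatus{Code::kOk} : ToyStatus{Code::kCorruption};
  };
  ToyStatus s = ReadWithRetry(flaky, /*fs_can_reconstruct=*/true, &retries,
                              &successes);
  std::cout << "ok=" << s.ok() << " attempts=" << attempts
            << " retries=" << retries << " successes=" << successes << "\n";
  // prints: ok=1 attempts=2 retries=1 successes=1
}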
@@ -186,6 +186,16 @@ class Footer {
   // Create empty. Populate using DecodeFrom.
   Footer() {}
 
+  void Reset() {
+    table_magic_number_ = kNullTableMagicNumber;
+    format_version_ = kInvalidFormatVersion;
+    base_context_checksum_ = 0;
+    metaindex_handle_ = BlockHandle::NullBlockHandle();
+    index_handle_ = BlockHandle::NullBlockHandle();
+    checksum_type_ = kInvalidChecksumType;
+    block_trailer_size_ = 0;
+  }
+
   // Deserialize a footer (populate fields) from `input` and check for various
   // corruptions. `input_offset` is the offset within the target file of
   // `input` buffer, which is needed for verifying format_version >= 6 footer.
@@ -304,7 +314,8 @@ class FooterBuilder {
 Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
                           FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
                           uint64_t file_size, Footer* footer,
-                          uint64_t enforce_table_magic_number = 0);
+                          uint64_t enforce_table_magic_number = 0,
+                          Statistics* stats = nullptr);
 
 // Computes a checksum using the given ChecksumType. Sometimes we need to
 // include one more input byte logically at the end but not part of the main
@@ -262,184 +262,232 @@ Status ReadTablePropertiesHelper(
     MemoryAllocator* memory_allocator) {
   assert(table_properties);
 
-  // If this is an external SST file ingested with write_global_seqno set to
-  // true, then we expect the checksum mismatch because checksum was written
-  // by SstFileWriter, but its global seqno in the properties block may have
-  // been changed during ingestion. For this reason, we initially read
-  // and process without checksum verification, then later try checksum
-  // verification so that if it fails, we can copy to a temporary buffer with
-  // global seqno set to its original value, i.e. 0, and attempt checksum
-  // verification again.
-  ReadOptions modified_ro = ro;
-  modified_ro.verify_checksums = false;
-  BlockContents block_contents;
-  BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle,
-                             &block_contents, ioptions, false /* decompress */,
-                             false /*maybe_compressed*/, BlockType::kProperties,
-                             UncompressionDict::GetEmptyDict(),
-                             PersistentCacheOptions::kEmpty, memory_allocator);
-  Status s = block_fetcher.ReadBlockContents();
-  if (!s.ok()) {
-    return s;
-  }
-
-  // Unfortunately, Block::size() might not equal block_contents.data.size(),
-  // and Block hides block_contents
-  uint64_t block_size = block_contents.data.size();
-  Block properties_block(std::move(block_contents));
-  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
-
-  std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
-  // All pre-defined properties of type uint64_t
-  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
-      {TablePropertiesNames::kOriginalFileNumber,
-       &new_table_properties->orig_file_number},
-      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
-      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
-      {TablePropertiesNames::kIndexPartitions,
-       &new_table_properties->index_partitions},
-      {TablePropertiesNames::kTopLevelIndexSize,
-       &new_table_properties->top_level_index_size},
-      {TablePropertiesNames::kIndexKeyIsUserKey,
-       &new_table_properties->index_key_is_user_key},
-      {TablePropertiesNames::kIndexValueIsDeltaEncoded,
-       &new_table_properties->index_value_is_delta_encoded},
-      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
-      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
-      {TablePropertiesNames::kRawValueSize,
-       &new_table_properties->raw_value_size},
-      {TablePropertiesNames::kNumDataBlocks,
-       &new_table_properties->num_data_blocks},
-      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
-      {TablePropertiesNames::kNumFilterEntries,
-       &new_table_properties->num_filter_entries},
-      {TablePropertiesNames::kDeletedKeys,
-       &new_table_properties->num_deletions},
-      {TablePropertiesNames::kMergeOperands,
-       &new_table_properties->num_merge_operands},
-      {TablePropertiesNames::kNumRangeDeletions,
-       &new_table_properties->num_range_deletions},
-      {TablePropertiesNames::kFormatVersion,
-       &new_table_properties->format_version},
-      {TablePropertiesNames::kFixedKeyLen,
-       &new_table_properties->fixed_key_len},
-      {TablePropertiesNames::kColumnFamilyId,
-       &new_table_properties->column_family_id},
-      {TablePropertiesNames::kCreationTime,
-       &new_table_properties->creation_time},
-      {TablePropertiesNames::kOldestKeyTime,
-       &new_table_properties->oldest_key_time},
-      {TablePropertiesNames::kFileCreationTime,
-       &new_table_properties->file_creation_time},
-      {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
-       &new_table_properties->slow_compression_estimated_data_size},
-      {TablePropertiesNames::kFastCompressionEstimatedDataSize,
-       &new_table_properties->fast_compression_estimated_data_size},
-      {TablePropertiesNames::kTailStartOffset,
-       &new_table_properties->tail_start_offset},
-      {TablePropertiesNames::kUserDefinedTimestampsPersisted,
-       &new_table_properties->user_defined_timestamps_persisted},
-      {TablePropertiesNames::kKeyLargestSeqno,
-       &new_table_properties->key_largest_seqno},
-  };
-
-  std::string last_key;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    s = iter->status();
-    if (!s.ok()) {
-      break;
-    }
-
-    auto key = iter->key().ToString();
-    // properties block should be strictly sorted with no duplicate key.
-    if (!last_key.empty() &&
-        BytewiseComparator()->Compare(key, last_key) <= 0) {
-      s = Status::Corruption("properties unsorted");
-      break;
-    }
-    last_key = key;
-
-    auto raw_val = iter->value();
-    auto pos = predefined_uint64_properties.find(key);
-
-    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
-      new_table_properties->external_sst_file_global_seqno_offset =
-          handle.offset() + iter->ValueOffset();
-    }
-
-    if (pos != predefined_uint64_properties.end()) {
-      if (key == TablePropertiesNames::kDeletedKeys ||
-          key == TablePropertiesNames::kMergeOperands) {
-        // Insert in user-collected properties for API backwards compatibility
-        new_table_properties->user_collected_properties.insert(
-            {key, raw_val.ToString()});
-      }
-      // handle predefined rocksdb properties
-      uint64_t val;
-      if (!GetVarint64(&raw_val, &val)) {
-        // skip malformed value
-        auto error_msg =
-            "Detect malformed value in properties meta-block:"
-            "\tkey: " +
-            key + "\tval: " + raw_val.ToString();
-        ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
-        continue;
-      }
-      *(pos->second) = val;
-    } else if (key == TablePropertiesNames::kDbId) {
-      new_table_properties->db_id = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kDbSessionId) {
-      new_table_properties->db_session_id = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kDbHostId) {
-      new_table_properties->db_host_id = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kFilterPolicy) {
-      new_table_properties->filter_policy_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kColumnFamilyName) {
-      new_table_properties->column_family_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kComparator) {
-      new_table_properties->comparator_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kMergeOperator) {
-      new_table_properties->merge_operator_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kPrefixExtractorName) {
-      new_table_properties->prefix_extractor_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kPropertyCollectors) {
-      new_table_properties->property_collectors_names = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kCompression) {
-      new_table_properties->compression_name = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kCompressionOptions) {
-      new_table_properties->compression_options = raw_val.ToString();
-    } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
-      new_table_properties->seqno_to_time_mapping = raw_val.ToString();
-    } else {
-      // handle user-collected properties
-      new_table_properties->user_collected_properties.insert(
-          {key, raw_val.ToString()});
-    }
-  }
-
-  // Modified version of BlockFetcher checksum verification
-  // (See write_global_seqno comment above)
-  if (s.ok() && footer.GetBlockTrailerSize() > 0) {
-    s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
-                            file->file_name(), handle.offset());
-    if (s.IsCorruption()) {
-      if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
-        std::string tmp_buf(properties_block.data(),
-                            block_fetcher.GetBlockSizeWithTrailer());
-        uint64_t global_seqno_offset =
-            new_table_properties->external_sst_file_global_seqno_offset -
-            handle.offset();
-        EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
-        s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
-                                file->file_name(), handle.offset());
-      }
-    }
-  }
-
-  if (s.ok()) {
-    *table_properties = std::move(new_table_properties);
-  }
+  Status s;
+  bool retry = false;
+  while (true) {
+    BlockContents block_contents;
+    size_t len = handle.size() + footer.GetBlockTrailerSize();
+    // If this is an external SST file ingested with write_global_seqno set to
+    // true, then we expect the checksum mismatch because checksum was written
+    // by SstFileWriter, but its global seqno in the properties block may have
+    // been changed during ingestion. For this reason, we initially read
+    // and process without checksum verification, then later try checksum
+    // verification so that if it fails, we can copy to a temporary buffer with
+    // global seqno set to its original value, i.e. 0, and attempt checksum
+    // verification again.
+    if (!retry) {
+      ReadOptions modified_ro = ro;
+      modified_ro.verify_checksums = false;
+      BlockFetcher block_fetcher(
+          file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
+          ioptions, false /* decompress */, false /*maybe_compressed*/,
+          BlockType::kProperties, UncompressionDict::GetEmptyDict(),
+          PersistentCacheOptions::kEmpty, memory_allocator);
+      s = block_fetcher.ReadBlockContents();
+      if (!s.ok()) {
+        return s;
+      }
+      assert(block_fetcher.GetBlockSizeWithTrailer() == len);
+      TEST_SYNC_POINT_CALLBACK("ReadTablePropertiesHelper:0",
+                               &block_contents.data);
+    } else {
+      assert(s.IsCorruption());
+      // If retrying, use a stronger file system read to check and correct
+      // data corruption
+      IOOptions opts;
+      if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) !=
+          IOStatus::OK()) {
+        return s;
+      }
+      opts.verify_and_reconstruct_read = true;
+      std::unique_ptr<char[]> data(new char[len]);
+      Slice result;
+      IOStatus io_s =
+          file->Read(opts, handle.offset(), len, &result, data.get(), nullptr);
+      RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
+      if (!io_s.ok()) {
+        ROCKS_LOG_INFO(ioptions.info_log,
+                       "Reading properties block failed - %s",
+                       io_s.ToString().c_str());
+        // Return the original corruption error as that's more serious
+        return s;
+      }
+      if (result.size() < len) {
+        return Status::Corruption("Reading properties block failed - " +
+                                  std::to_string(result.size()) +
+                                  " bytes read");
+      }
+      RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
+      block_contents = BlockContents(std::move(data), handle.size());
+    }
+
+    uint64_t block_size = block_contents.data.size();
+    Block properties_block(std::move(block_contents));
+    // Unfortunately, Block::size() might not equal block_contents.data.size(),
+    // and Block hides block_contents
+    std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
+
+    std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
+    // All pre-defined properties of type uint64_t
+    std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+        {TablePropertiesNames::kOriginalFileNumber,
+         &new_table_properties->orig_file_number},
+        {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+        {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+        {TablePropertiesNames::kIndexPartitions,
+         &new_table_properties->index_partitions},
+        {TablePropertiesNames::kTopLevelIndexSize,
+         &new_table_properties->top_level_index_size},
+        {TablePropertiesNames::kIndexKeyIsUserKey,
+         &new_table_properties->index_key_is_user_key},
+        {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+         &new_table_properties->index_value_is_delta_encoded},
+        {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+        {TablePropertiesNames::kRawKeySize,
+         &new_table_properties->raw_key_size},
+        {TablePropertiesNames::kRawValueSize,
+         &new_table_properties->raw_value_size},
+        {TablePropertiesNames::kNumDataBlocks,
+         &new_table_properties->num_data_blocks},
+        {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+        {TablePropertiesNames::kNumFilterEntries,
+         &new_table_properties->num_filter_entries},
+        {TablePropertiesNames::kDeletedKeys,
+         &new_table_properties->num_deletions},
+        {TablePropertiesNames::kMergeOperands,
+         &new_table_properties->num_merge_operands},
+        {TablePropertiesNames::kNumRangeDeletions,
+         &new_table_properties->num_range_deletions},
+        {TablePropertiesNames::kFormatVersion,
+         &new_table_properties->format_version},
+        {TablePropertiesNames::kFixedKeyLen,
+         &new_table_properties->fixed_key_len},
+        {TablePropertiesNames::kColumnFamilyId,
+         &new_table_properties->column_family_id},
+        {TablePropertiesNames::kCreationTime,
+         &new_table_properties->creation_time},
+        {TablePropertiesNames::kOldestKeyTime,
+         &new_table_properties->oldest_key_time},
+        {TablePropertiesNames::kFileCreationTime,
+         &new_table_properties->file_creation_time},
+        {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+         &new_table_properties->slow_compression_estimated_data_size},
+        {TablePropertiesNames::kFastCompressionEstimatedDataSize,
+         &new_table_properties->fast_compression_estimated_data_size},
+        {TablePropertiesNames::kTailStartOffset,
+         &new_table_properties->tail_start_offset},
+        {TablePropertiesNames::kUserDefinedTimestampsPersisted,
+         &new_table_properties->user_defined_timestamps_persisted},
+        {TablePropertiesNames::kKeyLargestSeqno,
+         &new_table_properties->key_largest_seqno},
+    };
+
+    std::string last_key;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      s = iter->status();
+      if (!s.ok()) {
+        break;
+      }
+
+      auto key = iter->key().ToString();
+      // properties block should be strictly sorted with no duplicate key.
+      if (!last_key.empty() &&
+          BytewiseComparator()->Compare(key, last_key) <= 0) {
+        s = Status::Corruption("properties unsorted");
+        break;
+      }
+      last_key = key;
+
+      auto raw_val = iter->value();
+      auto pos = predefined_uint64_properties.find(key);
+
+      if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
+        new_table_properties->external_sst_file_global_seqno_offset =
+            handle.offset() + iter->ValueOffset();
+      }
+
+      if (pos != predefined_uint64_properties.end()) {
+        if (key == TablePropertiesNames::kDeletedKeys ||
+            key == TablePropertiesNames::kMergeOperands) {
+          // Insert in user-collected properties for API backwards compatibility
+          new_table_properties->user_collected_properties.insert(
+              {key, raw_val.ToString()});
+        }
+        // handle predefined rocksdb properties
+        uint64_t val;
+        if (!GetVarint64(&raw_val, &val)) {
+          // skip malformed value
+          auto error_msg =
+              "Detect malformed value in properties meta-block:"
+              "\tkey: " +
+              key + "\tval: " + raw_val.ToString();
+          ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
+          continue;
+        }
+        *(pos->second) = val;
+      } else if (key == TablePropertiesNames::kDbId) {
+        new_table_properties->db_id = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kDbSessionId) {
+        new_table_properties->db_session_id = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kDbHostId) {
+        new_table_properties->db_host_id = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kFilterPolicy) {
+        new_table_properties->filter_policy_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kColumnFamilyName) {
+        new_table_properties->column_family_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kComparator) {
+        new_table_properties->comparator_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kMergeOperator) {
+        new_table_properties->merge_operator_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kPrefixExtractorName) {
+        new_table_properties->prefix_extractor_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kPropertyCollectors) {
+        new_table_properties->property_collectors_names = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kCompression) {
+        new_table_properties->compression_name = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kCompressionOptions) {
+        new_table_properties->compression_options = raw_val.ToString();
+      } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
+        new_table_properties->seqno_to_time_mapping = raw_val.ToString();
+      } else {
+        // handle user-collected properties
+        new_table_properties->user_collected_properties.insert(
+            {key, raw_val.ToString()});
+      }
+    }
+
+    // Modified version of BlockFetcher checksum verification
+    // (See write_global_seqno comment above)
+    if (s.ok() && footer.GetBlockTrailerSize() > 0) {
+      s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
+                              file->file_name(), handle.offset());
+      if (s.IsCorruption()) {
+        if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
+          std::string tmp_buf(properties_block.data(), len);
+          uint64_t global_seqno_offset =
+              new_table_properties->external_sst_file_global_seqno_offset -
+              handle.offset();
+          EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
+          s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
+                                  file->file_name(), handle.offset());
+        }
+      }
+    }
+
+    // If we detected a corruption and the file system supports verification
+    // and reconstruction, retry the read
+    if (s.IsCorruption() && !retry &&
+        CheckFSFeatureSupport(ioptions.fs.get(),
+                              FSSupportedOps::kVerifyAndReconstructRead)) {
+      retry = true;
+    } else {
+      if (s.ok()) {
+        *table_properties = std::move(new_table_properties);
+      }
+      break;
+    }
+  }
 
   return s;
 }
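The tmp_buf dance above re-verifies the properties block checksum with the global-seqno slot zeroed, since ingestion may have rewritten that one fixed64 in place after the checksum was computed. A standalone model of the idea using a trivial byte-sum checksum — the real code uses the table's configured block checksum, not this toy sum, and the offsets are invented:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

// Toy stand-ins: a byte-sum "checksum" and in-place fixed64 encoding.
uint32_t ToyChecksum(const std::string& buf) {
  uint32_t sum = 0;
  for (unsigned char c : buf) sum += c;
  return sum;
}
void EncodeFixed64At(std::string* buf, size_t off, uint64_t v) {
  std::memcpy(&(*buf)[off], &v, sizeof(v));
}

int main() {
  // Block as written by SstFileWriter: the global seqno slot holds 0.
  std::string block(64, 'x');
  const size_t global_seqno_offset = 16;
  EncodeFixed64At(&block, global_seqno_offset, 0);
  uint32_t checksum_at_write = ToyChecksum(block);

  // Ingestion rewrote the slot in place; plain verification now fails.
  EncodeFixed64At(&block, global_seqno_offset, 42);
  assert(ToyChecksum(block) != checksum_at_write);

  // Mirror of the diff: copy to tmp_buf, zero the slot, verify again.
  std::string tmp_buf = block;
  EncodeFixed64At(&tmp_buf, global_seqno_offset, 0);
  assert(ToyChecksum(tmp_buf) == checksum_at_write);
  return 0;
}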
@@ -12,8 +12,8 @@
 
 #include "db/range_tombstone_fragmenter.h"
 #if USE_COROUTINES
-#include "folly/experimental/coro/Coroutine.h"
-#include "folly/experimental/coro/Task.h"
+#include "folly/coro/Coroutine.h"
+#include "folly/coro/Task.h"
 #endif
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table_reader_caller.h"
@@ -125,7 +125,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb", "9.6.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
@@ -923,16 +923,22 @@ def finalize_and_sanitize(src_params):
         dest_params["prefixpercent"] = 0
         dest_params["check_multiget_consistency"] = 0
         dest_params["check_multiget_entity_consistency"] = 0
-    if dest_params.get("disable_wal") == 0 and dest_params.get("reopen") > 0:
-        # Reopen with WAL currently requires persisting WAL data before closing for reopen.
-        # Previous injected WAL write errors may not be cleared by the time of closing and ready
-        # for persisting WAL.
-        # To simplify, we disable any WAL write error injection.
-        # TODO(hx235): support WAL write error injection with reopen
-        # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
-        # have to disable metadata write fault injection to other file
-        dest_params["exclude_wal_from_write_fault_injection"] = 1
-        dest_params["metadata_write_fault_one_in"] = 0
+    if dest_params.get("disable_wal") == 0:
+        if dest_params.get("reopen") > 0 or (dest_params.get("manual_wal_flush_one_in") and dest_params.get("column_families") != 1):
+            # Reopen with WAL currently requires persisting WAL data before closing for reopen.
+            # Previous injected WAL write errors may not be cleared by the time of closing and ready
+            # for persisting WAL.
+            # To simplify, we disable any WAL write error injection.
+            # TODO(hx235): support WAL write error injection with reopen
+            # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
+            # have to disable metadata write fault injection to other file
+            #
+            # WAL write failure can drop buffered WAL data. This can cause
+            # inconsistency when one CF has a successful flush during auto
+            # recovery. Disable the fault injection in this path for now until
+            # we have a fix that allows auto recovery.
+            dest_params["exclude_wal_from_write_fault_injection"] = 1
+            dest_params["metadata_write_fault_one_in"] = 0
     if dest_params.get("disable_wal") == 1:
         # disableWAL and recycle_log_file_num options are not mutually
         # compatible at the moment
@@ -1 +1 @@
-* Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files and IngestExternalFileOptions::allow_db_generated_files._
+* Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files.
@@ -0,0 +1 @@
+* Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion.
@@ -0,0 +1 @@
+* Fix a bug in CompactRange() where result files may not be compacted in any future compaction. This can only happen when users configure CompactRangeOptions::change_level to true and the change level step of manual compaction fails (#13009).
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
#if USE_COROUTINES
|
#if USE_COROUTINES
|
||||||
#include "file/random_access_file_reader.h"
|
#include "file/random_access_file_reader.h"
|
||||||
#include "folly/experimental/coro/ViaIfAsync.h"
|
#include "folly/coro/ViaIfAsync.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "rocksdb/file_system.h"
|
#include "rocksdb/file_system.h"
|
||||||
#include "rocksdb/statistics.h"
|
#include "rocksdb/statistics.h"
|
||||||
|
|
|
@@ -5,8 +5,8 @@
 // (found in the LICENSE.Apache file in the root directory).
 
 #if defined(USE_COROUTINES)
-#include "folly/experimental/coro/Coroutine.h"
-#include "folly/experimental/coro/Task.h"
+#include "folly/coro/Coroutine.h"
+#include "folly/coro/Task.h"
 #endif
 #include "rocksdb/rocksdb_namespace.h"
@@ -506,10 +506,10 @@ IOStatus TestFSRandomAccessFile::ReadAsync(
     s = target_->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, nullptr);
     // TODO (low priority): fs_->ReadUnsyncedData()
   } else {
-    // If there’s no injected error, then cb will be called asynchronously when
-    // target_ actually finishes the read. But if there’s an injected error, it
+    // If there's no injected error, then cb will be called asynchronously when
+    // target_ actually finishes the read. But if there's an injected error, it
     // needs to immediately call cb(res, cb_arg) s since target_->ReadAsync()
-    // isn’t invoked at all.
+    // isn't invoked at all.
     res.status = res_status;
     cb(res, cb_arg);
   }