Merge branch 'main' into JniReaderForTableIterator

This commit is contained in:
Swaminathan Balachandran 2024-08-31 10:09:33 -07:00 committed by GitHub
commit 21eca90d5e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
131 changed files with 4763 additions and 1706 deletions

View File

@ -1,13 +1,13 @@
name: facebook/rocksdb/benchmark-linux
on: workflow_dispatch
jobs:
# FIXME: when this job is fixed, it should be given a cron schedule like
permissions: {}
# FIXME: Disabled temporarily
# schedule:
# - cron: 0 * * * *
# workflow_dispatch:
# - cron: 7 */2 * * * # At minute 7 past every 2nd hour
jobs:
benchmark-linux:
if: ${{ github.repository_owner == 'facebook' }}
runs-on: ubuntu-latest
runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
steps:
- uses: actions/checkout@v4.1.0
- uses: "./.github/actions/build-for-benchmarks"

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/nightly
on: workflow_dispatch
permissions: {}
jobs:
# These jobs would be in nightly but are failing or otherwise broken for
# some reason.

View File

@ -3,6 +3,7 @@ on:
schedule:
- cron: 0 9 * * *
workflow_dispatch:
permissions: {}
jobs:
build-format-compatible:
if: ${{ github.repository_owner == 'facebook' }}
@ -59,12 +60,15 @@ jobs:
container:
image: zjay437/rocksdb:0.6
options: --shm-size=16gb
env:
CC: clang-13
CXX: clang++-13
steps:
- uses: actions/checkout@v4.1.0
- uses: "./.github/actions/pre-steps"
- uses: "./.github/actions/setup-folly"
- uses: "./.github/actions/build-folly"
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- uses: "./.github/actions/post-steps"
build-linux-valgrind:
if: ${{ github.repository_owner == 'facebook' }}

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs-candidate
on: workflow_dispatch
permissions: {}
jobs:
# These jobs would be in pr-jobs but are failing or otherwise broken for
# some reason.

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs
on: [push, pull_request]
permissions: {}
jobs:
# NOTE: multiple workflows would be recommended, but the current GHA UI in
# PRs doesn't make it clear when there's an overall error with a workflow,

View File

@ -1,6 +1,28 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
## 9.6.0 (08/19/2024)
### New Features
* *Best efforts recovery supports recovering to incomplete Version with a clean seqno cut that presents a valid point in time view from the user's perspective, if versioning history doesn't include atomic flush.
* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans.
### Public API Changes
* Add ticker stats to count file read retries due to checksum mismatch
* Adds optional installation callback function for remote compaction
### Behavior Changes
* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size and reduce the threshold for L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
### Bug Fixes
* *Make DestroyDB supports slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
* Fixed a bug where we set unprep_seqs_ even when WriteImpl() fails. This was caught by stress test write fault injection in WriteImpl(). This may have incorrectly caused iteration creation failure for unvalidated writes or returned wrong result for WriteUnpreparedTxn::GetUnpreparedSequenceNumbers().
* Fixed a bug where successful write right after error recovery for last failed write finishes causes duplicate WAL entries
* Fixed a data race involving the background error status in `unordered_write` mode.
* *Fix a bug where file snapshot functions like backup, checkpoint may attempt to copy a non-existing manifest file. #12882
* Fix a bug where per kv checksum corruption may be ignored in MultiGet().
* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
## 9.5.0 (07/19/2024)
### Public API Changes
* Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch

View File

@ -1652,6 +1652,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
}
for (const Slice& table_newest_udt :
imm()->GetTablesNewestUDT(max_memtable_id)) {
if (table_newest_udt.empty()) {
continue;
}
assert(table_newest_udt.size() == full_history_ts_low.size());
// Checking the newest UDT contained in MemTable with ascending ID up to
// `max_memtable_id`. Return immediately on finding the first MemTable that

View File

@ -3067,12 +3067,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
WaitForCompaction();
AssertFilesPerLevel("0,1", 0 /* cf */);
// We should calculate the limit by obtaining the number of env background
// threads, because the current test case will share the same env
// with another case that may have already increased the number of
// background threads which is larger than kParallelismLimit
const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
// Block the compaction thread pool so marked files accumulate in L0.
test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit];
for (int i = 0; i < kParallelismLimit; i++) {
std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
for (int i = 0; i < limit; i++) {
sleeping_tasks.emplace_back(
std::make_shared<test::SleepingBackgroundTask>());
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
&sleeping_tasks[i], Env::Priority::LOW);
sleeping_tasks[i].WaitUntilSleeping();
sleeping_tasks[i].get(), Env::Priority::LOW);
sleeping_tasks[i]->WaitUntilSleeping();
}
// Zero marked upper-level files. No speedup.
@ -3091,9 +3099,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
AssertFilesPerLevel("2,1", 0 /* cf */);
for (int i = 0; i < kParallelismLimit; i++) {
sleeping_tasks[i].WakeUp();
sleeping_tasks[i].WaitUntilDone();
for (int i = 0; i < limit; i++) {
sleeping_tasks[i]->WakeUp();
sleeping_tasks[i]->WaitUntilDone();
}
}

View File

@ -552,7 +552,8 @@ class CompactionJobTestBase : public testing::Test {
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
compaction_job_stats_.Reset();
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
VersionEdit new_db;
new_db.SetLogNumber(0);
@ -575,7 +576,8 @@ class CompactionJobTestBase : public testing::Test {
}
ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown, nullptr);
ASSERT_OK(s);

View File

@ -925,11 +925,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
}
uint64_t l0_size = 0;
for (const auto& file : l0_files) {
l0_size += file->fd.GetFileSize();
assert(file->compensated_file_size >= file->fd.GetFileSize());
// Compact down L0s with more deletions.
l0_size += file->compensated_file_size;
}
const uint64_t min_lbase_size =
l0_size * static_cast<uint64_t>(std::max(
10.0, mutable_cf_options_.max_bytes_for_level_multiplier));
// Avoid L0->Lbase compactions that are inefficient for write-amp.
const double kMultiplier =
std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
assert(min_lbase_size >= l0_size);
const std::vector<FileMetaData*>& lbase_files =
vstorage_->LevelFiles(/*level=*/base_level);

View File

@ -214,7 +214,10 @@ class CompactionPickerTest : public CompactionPickerTestBase {
explicit CompactionPickerTest()
: CompactionPickerTestBase(BytewiseComparator()) {}
~CompactionPickerTest() override = default;
~CompactionPickerTest() override {
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->DisableProcessing();
}
};
class CompactionPickerU64TsTest : public CompactionPickerTestBase {
@ -4284,27 +4287,28 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
SCOPED_TRACE("lbase_size_multiplier=" +
std::to_string(lbase_size_multiplier));
NewVersionStorage(6, kCompactionStyleLevel);
// When L0 size is <= Lbase size / max_bytes_for_level_multiplier,
// When L0 size is <= Lbase size / max_bytes_for_level_multiplier / 2,
// intra-L0 compaction is picked. Otherwise, L0->L1
// compaction is picked.
// compensated_file_size will be used to compute total l0 size.
Add(/*level=*/0, /*file_number=*/1U, /*smallest=*/"100",
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
/*smallest_seq=*/10, /*largest_seq=*/11,
/*compensated_file_size=*/1000);
Add(/*level=*/0, /*file_number=*/2U, /*smallest=*/"100",
/*largest=*/"100", /*file_size=*/1000, /*path_id=*/0,
/*largest=*/"100", /*file_size=*/10, /*path_id=*/0,
/*smallest_seq=*/20, /*largest_seq=*/21,
/*compensated_file_size=*/1000);
Add(/*level=*/0, /*file_number=*/3U, /*smallest=*/"100",
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
/*smallest_seq=*/30, /*largest_seq=*/31,
/*compensated_file_size=*/1000);
Add(/*level=*/0, /*file_number=*/4U, /*smallest=*/"100",
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
/*smallest_seq=*/40, /*largest_seq=*/41,
/*compensated_file_size=*/1000);
const uint64_t l0_size = 4000;
const uint64_t lbase_size = l0_size * lbase_size_multiplier;
const uint64_t lbase_size = l0_size * lbase_size_multiplier * 2;
Add(/*level=*/1, /*file_number=*/5U, /*smallest=*/"100",
/*largest=*/"200", /*file_size=*/lbase_size, /*path_id=*/0,
/*smallest_seq=*/0, /*largest_seq=*/0,

View File

@ -140,9 +140,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
return compaction_status;
}
// CompactionServiceJobStatus::kSuccess was returned, but somehow we failed to
// read the result. Consider this as an installation failure
if (!s.ok()) {
sub_compact->status = s;
compaction_result.status.PermitUncheckedError();
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
sub_compact->status = compaction_result.status;
@ -154,18 +158,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
is_first_one = false;
}
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Receive remote compaction result, output path: "
"%s, files: %s",
compaction_input.column_family.name.c_str(), job_id_,
compaction_result.output_path.c_str(),
output_files_oss.str().c_str());
if (!s.ok()) {
sub_compact->status = s;
return CompactionServiceJobStatus::kFailure;
}
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Received remote compaction result, output path: "
"%s, files: %s",
compaction_input.column_family.name.c_str(), job_id_,
compaction_result.output_path.c_str(), output_files_oss.str().c_str());
// Installation Starts
for (const auto& file : compaction_result.output_files) {
uint64_t file_num = versions_->NewFileNumber();
auto src_file = compaction_result.output_path + "/" + file.file_name;
@ -174,6 +174,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
if (!s.ok()) {
sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
@ -182,6 +184,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
if (!s.ok()) {
sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
@ -206,6 +210,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
compaction_result.bytes_written);
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
return CompactionServiceJobStatus::kSuccess;
}

View File

@ -108,6 +108,11 @@ class MyTestCompactionService : public CompactionService {
}
}
void OnInstallation(const std::string& /*scheduled_job_id*/,
CompactionServiceJobStatus status) override {
final_updated_status_ = status;
}
int GetCompactionNum() { return compaction_num_.load(); }
CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
@ -136,6 +141,10 @@ class MyTestCompactionService : public CompactionService {
void SetCanceled(bool canceled) { canceled_ = canceled; }
CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
return final_updated_status_.load();
}
private:
InstrumentedMutex mutex_;
std::atomic_int compaction_num_{0};
@ -158,6 +167,8 @@ class MyTestCompactionService : public CompactionService {
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories_;
std::atomic_bool canceled_{false};
std::atomic<CompactionServiceJobStatus> final_updated_status_{
CompactionServiceJobStatus::kUseLocal};
};
class CompactionServiceTest : public DBTestBase {
@ -255,6 +266,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
auto my_cs = GetCompactionService();
ASSERT_GE(my_cs->GetCompactionNum(), 1);
ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
my_cs->GetFinalCompactionServiceJobStatus());
// make sure the compaction statistics is only recorded on the remote side
ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
@ -437,6 +450,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
Slice end(end_str);
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_FALSE(s.ok());
ASSERT_EQ(CompactionServiceJobStatus::kFailure,
my_cs->GetFinalCompactionServiceJobStatus());
}
TEST_F(CompactionServiceTest, SubCompaction) {

View File

@ -3407,6 +3407,46 @@ class TableFileListener : public EventListener {
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
};
class FlushTableFileListener : public EventListener {
public:
void OnTableFileCreated(const TableFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != TableFileCreationReason::kFlush) {
return;
}
cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_files_;
};
class FlushBlobFileListener : public EventListener {
public:
void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != BlobFileCreationReason::kFlush) {
return;
}
cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_blobs_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_blobs_files_;
};
} // anonymous namespace
TEST_F(DBBasicTest, LastSstFileNotInManifest) {
@ -3512,6 +3552,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
}
}
// Param 0: whether to enable blob DB.
// Param 1: when blob DB is enabled, whether to also delete the missing L0
// file's associated blob file.
class BestEffortsRecoverIncompleteVersionTest
: public DBTestBase,
public testing::WithParamInterface<std::tuple<bool, bool>> {
public:
BestEffortsRecoverIncompleteVersionTest()
: DBTestBase("best_efforts_recover_incomplete_version_test",
/*env_do_fsync=*/false) {}
};
TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
Options options = CurrentOptions();
options.enable_blob_files = std::get<0>(GetParam());
bool delete_blob_file_too = std::get<1>(GetParam());
DestroyAndReopen(options);
FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
// Disable auto compaction to simplify SST file name tracking.
options.disable_auto_compactions = true;
options.listeners.emplace_back(flush_table_listener);
options.listeners.emplace_back(flush_blob_listener);
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
"eevee"};
int num_cfs = static_cast<int>(handles_.size());
ASSERT_EQ(3, num_cfs);
std::string start = "a";
Slice start_slice = start;
std::string end = "d";
Slice end_slice = end;
for (int cf = 0; cf != num_cfs; ++cf) {
ASSERT_OK(Put(cf, "a", "a_value"));
ASSERT_OK(Flush(cf));
// Compact file to L1 to avoid trivial file move in the next compaction
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
ASSERT_OK(Put(cf, "a", "a_value_new"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "b", "b_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "f", "f_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
}
dbfull()->TEST_DeleteObsoleteFiles();
// Delete the most recent L0 file which is before a compaction.
for (int i = 0; i < num_cfs; ++i) {
std::vector<std::string>& files =
flush_table_listener->GetFlushedFiles(all_cf_names[i]);
ASSERT_EQ(4, files.size());
ASSERT_OK(env_->DeleteFile(files[files.size() - 1]));
if (options.enable_blob_files) {
std::vector<std::string>& blob_files =
flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
ASSERT_EQ(4, blob_files.size());
if (delete_blob_file_too) {
ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1]));
}
}
}
options.best_efforts_recovery = true;
ReopenWithColumnFamilies(all_cf_names, options);
for (int i = 0; i < num_cfs; ++i) {
auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
ColumnFamilyData* cfd = cfh->cfd();
VersionStorageInfo* vstorage = cfd->current()->storage_info();
// The L0 file flushed right before the last compaction is missing.
ASSERT_EQ(0, vstorage->LevelFiles(0).size());
// Only the output of the last compaction is available.
ASSERT_EQ(1, vstorage->LevelFiles(1).size());
}
// Verify data
ReadOptions read_opts;
read_opts.total_order_seek = true;
for (int i = 0; i < num_cfs; ++i) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("a", iter->key());
ASSERT_EQ("a_value_new", iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("b", iter->key());
ASSERT_EQ("b_value", iter->value());
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
}
// Write more data.
for (int cf = 0; cf < num_cfs; ++cf) {
ASSERT_OK(Put(cf, "g", "g_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
nullptr));
std::string value;
ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
ASSERT_EQ("g_value", value);
}
}
INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
BestEffortsRecoverIncompleteVersionTest,
testing::Values(std::make_tuple(false, false),
std::make_tuple(true, false),
std::make_tuple(true, true)));
TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
Options options = CurrentOptions();
options.env = env_;

File diff suppressed because it is too large Load Diff

View File

@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"DBImpl::BackgroundCompaction:Start",
"DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"DBImpl::BackgroundCompaction:BeforeCompaction"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
});
SyncPoint::GetInstance()->EnableProcessing();
@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
SyncPoint::GetInstance()->LoadDependency({
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"Leader::Done"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End",
"Follower::WaitForCatchup:1"},
});

View File

@ -17,6 +17,7 @@
#include <cstdio>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <sstream>
#include <stdexcept>
@ -2475,7 +2476,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
RecordTick(stats_, MEMTABLE_HIT);
}
}
if (!done && !s.ok() && !s.IsMergeInProgress()) {
if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) {
assert(done);
ReturnAndCleanupSuperVersion(cfd, sv);
return s;
}
@ -3141,10 +3143,11 @@ Status DBImpl::MultiGetImpl(
StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
assert(sorted_keys);
assert(start_key + num_keys <= sorted_keys->size());
// Clear the timestamps for returning results so that we can distinguish
// between tombstone or key that has never been written
for (auto* kctx : *sorted_keys) {
assert(kctx);
for (size_t i = start_key; i < start_key + num_keys; ++i) {
KeyContext* kctx = (*sorted_keys)[i];
if (kctx->timestamp) {
kctx->timestamp->clear();
}
@ -5240,6 +5243,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
Env* env = soptions.env;
std::vector<std::string> filenames;
bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
auto sfm = static_cast_with_check<SstFileManagerImpl>(
options.sst_file_manager.get());
// Allocate a separate trash bucket to be used by all the to be deleted
// files, so we can later wait for this bucket to be empty before return.
std::optional<int32_t> bucket;
if (sfm) {
bucket = sfm->NewTrashBucket();
}
// Reset the logger because it holds a handle to the
// log file and prevents cleanup and directory removal
@ -5251,6 +5262,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
/*IODebugContext*=*/nullptr)
.PermitUncheckedError();
std::set<std::string> paths_to_delete;
FileLock* lock;
const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock);
@ -5267,10 +5279,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
del = DestroyDB(path_to_delete, options);
} else if (type == kTableFile || type == kWalFile ||
type == kBlobFile) {
del = DeleteDBFile(
&soptions, path_to_delete, dbname,
/*force_bg=*/false,
/*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
/*force_bg=*/false,
/*force_fg=*/false, bucket);
} else {
del = env->DeleteFile(path_to_delete);
}
@ -5279,6 +5290,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
}
}
}
paths_to_delete.insert(dbname);
std::set<std::string> paths;
for (const DbPath& db_path : options.db_paths) {
@ -5300,18 +5312,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
(type == kTableFile ||
type == kBlobFile)) { // Lock file will be deleted at end
std::string file_path = path + "/" + fname;
Status del = DeleteDBFile(&soptions, file_path, dbname,
/*force_bg=*/false, /*force_fg=*/false);
Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
/*force_bg=*/false,
/*force_fg=*/false, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// TODO: Should we return an error if we cannot delete the directory?
env->DeleteDir(path).PermitUncheckedError();
}
}
paths_to_delete.merge(paths);
std::vector<std::string> walDirFiles;
std::string archivedir = ArchivalDirectory(dbname);
bool wal_dir_exists = false;
@ -5335,46 +5348,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
// Delete archival files.
for (const auto& file : archiveFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del =
DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
Status del = DeleteUnaccountedDBFile(
&soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// Ignore error in case dir contains other files
env->DeleteDir(archivedir).PermitUncheckedError();
paths_to_delete.insert(archivedir);
}
// Delete log files in the WAL dir
if (wal_dir_exists) {
for (const auto& file : walDirFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del =
DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path);
Status del = DeleteUnaccountedDBFile(
&soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// Ignore error in case dir contains other files
env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
paths_to_delete.insert(soptions.wal_dir);
}
// Ignore error since state is already gone
env->UnlockFile(lock).PermitUncheckedError();
env->DeleteFile(lockname).PermitUncheckedError();
// Make sure trash files are all cleared before return.
if (sfm && bucket.has_value()) {
sfm->WaitForEmptyTrashBucket(bucket.value());
}
// sst_file_manager holds a ref to the logger. Make sure the logger is
// gone before trying to remove the directory.
soptions.sst_file_manager.reset();
// Ignore error in case dir contains other files
env->DeleteDir(dbname).PermitUncheckedError();
;
for (const auto& path_to_delete : paths_to_delete) {
env->DeleteDir(path_to_delete).PermitUncheckedError();
}
}
return result;
}
@ -5820,11 +5836,6 @@ Status DBImpl::IngestExternalFiles(
"write_global_seqno is deprecated and does not work with "
"allow_db_generated_files.");
}
if (ingest_opts.move_files) {
return Status::NotSupported(
"Options move_files and allow_db_generated_files are not "
"compatible.");
}
}
}

View File

@ -1226,6 +1226,8 @@ class DBImpl : public DB {
return logs_.back().number;
}
void TEST_DeleteObsoleteFiles();
const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
return files_grabbed_for_purge_;
}

View File

@ -314,6 +314,11 @@ const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
return error_handler_.GetFilesToQuarantine();
}
void DBImpl::TEST_DeleteObsoleteFiles() {
InstrumentedMutexLock l(&mutex_);
DeleteObsoleteFiles();
}
size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
return EstimateInMemoryStatsHistorySize();

View File

@ -970,7 +970,9 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
}
// Persist it to IDENTITY file if allowed
if (!read_only) {
s = SetIdentityFile(write_options, env_, dbname_, db_id_);
s = SetIdentityFile(write_options, env_, dbname_,
immutable_db_options_.metadata_write_temperature,
db_id_);
}
return s;
}

View File

@ -295,7 +295,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
VersionEdit new_db;
const WriteOptions write_options(Env::IOActivity::kDBOpen);
Status s = SetIdentityFile(write_options, env_, dbname_);
Status s = SetIdentityFile(write_options, env_, dbname_,
immutable_db_options_.metadata_write_temperature);
if (!s.ok()) {
return s;
}
@ -319,6 +320,12 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
}
std::unique_ptr<FSWritableFile> file;
FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.metadata_write_temperature !=
Temperature::kUnknown) {
file_options.temperature =
immutable_db_options_.metadata_write_temperature;
}
s = NewWritableFile(fs_.get(), manifest, &file, file_options);
if (!s.ok()) {
return s;
@ -344,6 +351,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
if (s.ok()) {
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
immutable_db_options_.metadata_write_temperature,
directories_.GetDbDir());
if (new_filenames) {
new_filenames->emplace_back(
@ -530,6 +538,12 @@ Status DBImpl::Recover(
/*no_error_if_files_missing=*/false, is_retry,
&desc_status);
desc_status.PermitUncheckedError();
if (is_retry) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
if (desc_status.ok()) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
}
}
if (can_retry) {
// If we're opening for the first time and the failure is likely due to
// a corrupt MANIFEST file (could result in either the log::Reader
@ -1930,6 +1944,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
BuildDBOptions(immutable_db_options_, mutable_db_options_);
FileOptions opt_file_options =
fs_->OptimizeForLogWrite(file_options_, db_options);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
}
std::string wal_dir = immutable_db_options_.GetWalDir();
std::string log_fname = LogFileName(wal_dir, log_file_num);

View File

@ -969,21 +969,17 @@ Status DBImpl::WriteImplWALOnly(
assert(w.state == WriteThread::STATE_GROUP_LEADER);
if (publish_last_seq == kDoPublishLastSeq) {
Status status;
// Currently we only use kDoPublishLastSeq in unordered_write
assert(immutable_db_options_.unordered_write);
WriteContext write_context;
if (error_handler_.IsDBStopped()) {
status = error_handler_.GetBGError();
}
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
// without paying the cost of obtaining the mutex.
if (status.ok()) {
LogContext log_context;
status = PreprocessWrite(write_options, &log_context, &write_context);
WriteStatusCheckOnLocked(status);
}
LogContext log_context;
WriteContext write_context;
Status status =
PreprocessWrite(write_options, &log_context, &write_context);
WriteStatusCheckOnLocked(status);
if (!status.ok()) {
WriteThread::WriteGroup write_group;
write_thread->EnterAsBatchGroupLeader(&w, &write_group);

View File

@ -705,6 +705,7 @@ class DBIOCorruptionTest
DBIOCorruptionTest() : DBIOFailureTest() {
BlockBasedTableOptions bbto;
options_ = CurrentOptions();
options_.statistics = CreateDBStatistics();
base_env_ = env_;
EXPECT_NE(base_env_, nullptr);
@ -727,6 +728,8 @@ class DBIOCorruptionTest
Status ReopenDB() { return TryReopen(options_); }
Statistics* stats() { return options_.statistics.get(); }
protected:
std::unique_ptr<Env> env_guard_;
std::shared_ptr<CorruptionFS> fs_;
@ -749,8 +752,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(val, "val1");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -773,8 +780,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
}
if (std::get<2>(GetParam())) {
ASSERT_OK(iter->status());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
delete iter;
}
@ -799,9 +810,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_EQ(values[0].ToString(), "val1");
ASSERT_EQ(values[1].ToString(), "val2");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_TRUE(statuses[1].IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -818,6 +833,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
@ -826,6 +844,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
ASSERT_EQ(val, "val1");
} else {
ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -838,6 +857,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
@ -846,6 +868,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -862,8 +885,12 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_OK(ReopenDB());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_EQ(ReopenDB(), Status::Corruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
SyncPoint::GetInstance()->DisableProcessing();
}

View File

@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest {
DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
protected:
const size_t kValueLenOffset = 12;
// Indices in the memtable entry that we will not corrupt.
// For memtable entry format, see comments in MemTable::Add().
// We do not corrupt key length and value length fields in this test
// case since it causes segfault and ASAN will complain.
// For this test case, key and value are all of length 3, so
// key length field is at index 0 and value length field is at index 12.
const std::set<size_t> index_not_to_corrupt{0, 12};
const std::set<size_t> index_not_to_corrupt{0, kValueLenOffset};
void SkipNotToCorruptEntry() {
if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_;
});
// Corrupt value only so that MultiGet below can find the key.
corrupt_byte_offset_ = kValueLenOffset + 1;
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
std::string key = "key";
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr));
std::string val;
ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption());
std::vector<std::string> vals = {val};
std::vector<Status> statuses = db_->MultiGet(
ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
Destroy(options);
SkipNotToCorruptEntry();
}

View File

@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
}
}
TEST_F(DBMemTableTest, IntegrityChecks) {
  // We insert keys key000000, key000001 and key000002 into skiplist at fixed
  // height 1 (smallest height). Then we corrupt the second key to aey000001 to
  // make it smaller. With `paranoid_memory_checks` set to true, if the
  // skip list sees key000000 and then aey000001, then it will report out of
  // order keys with corruption status. With `paranoid_memory_checks` set
  // to false, read/scan may return wrong results.
  for (bool allow_data_in_error : {false, true}) {
    Options options = CurrentOptions();
    options.allow_data_in_errors = allow_data_in_error;
    options.paranoid_memory_checks = true;
    DestroyAndReopen(options);
    // Force every skiplist node to height 1 so that a scan visits every key
    // in order and is guaranteed to observe the out-of-order neighbor pair.
    SyncPoint::GetInstance()->SetCallBack(
        "InlineSkipList::RandomHeight::height", [](void* h) {
          auto height_ptr = static_cast<int*>(h);
          *height_ptr = 1;
        });
    SyncPoint::GetInstance()->EnableProcessing();
    ASSERT_OK(Put(Key(0), "val0"));
    ASSERT_OK(Put(Key(2), "val2"));
    // p will point to the buffer for encoded key000001
    char* p = nullptr;
    SyncPoint::GetInstance()->SetCallBack(
        "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
          p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
        });
    ASSERT_OK(Put(Key(1), "val1"));
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
    ASSERT_TRUE(p);
    // Offset 0 is key size, key bytes start at offset 1.
    // "key000001 -> aey000001"
    p[1] = 'a';
    ReadOptions rops;
    std::string val;
    Status s = db_->Get(rops, Key(1), &val);
    ASSERT_TRUE(s.IsCorruption());
    // The corruption message should embed key data iff allow_data_in_errors.
    std::string key0 = Slice(Key(0)).ToString(true);
    ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    // Without `paranoid_memory_checks`, NotFound will be returned.
    // This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
    // If we remove the assertion, this passes.
    // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
    // MultiGet must surface the same corruption as point Get.
    std::vector<std::string> vals;
    std::vector<Status> statuses = db_->MultiGet(
        rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
    ASSERT_TRUE(statuses[0].IsCorruption());
    ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
    ASSERT_OK(iter->status());
    iter->Seek(Key(1));
    ASSERT_TRUE(iter->status().IsCorruption());
    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    iter->Seek(Key(0));
    ASSERT_TRUE(iter->Valid());
    ASSERT_OK(iter->status());
    // iterating through skip list at height at 1 should catch out-of-order keys
    iter->Next();
    ASSERT_TRUE(iter->status().IsCorruption());
    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    ASSERT_FALSE(iter->Valid());
    iter->SeekForPrev(Key(2));
    ASSERT_TRUE(iter->status().IsCorruption());
    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    // Internally DB Iter will iterate backwards (call Prev()) after
    // SeekToLast() to find the correct internal key with the last user key.
    // Prev() will do integrity checks and catch corruption.
    iter->SeekToLast();
    ASSERT_TRUE(iter->status().IsCorruption());
    ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
              allow_data_in_error);
    ASSERT_FALSE(iter->Valid());
  }
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -507,6 +507,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ASSERT_EQ(files_deleted, 0);
ASSERT_EQ(files_scheduled_to_delete, 0);
Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_deleted, blob_files.size());
ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
@ -649,6 +666,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
}
Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
sfm->WaitForEmptyTrash();
ASSERT_EQ(files_deleted, 5);
@ -883,8 +917,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
// Create 4 files in L0
for (char v = 'a'; v <= 'd'; v++) {
if (v == 'c') {
// Maximize the change that the last log file will be preserved in trash
// before restarting the DB.
// Maximize the chance that the last log file will be preserved in trash
// before restarting the DB. (Enable slow deletion but at a very slow
// deletion rate)
// We have to set this on the 2nd to last file for it to delay deletion
// on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
@ -1902,6 +1937,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
ASSERT_EQ(files_deleted, 1);
Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_scheduled_to_delete, 4);

View File

@ -10,6 +10,7 @@
#include <atomic>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include "db/db_test_util.h"
@ -26,6 +27,7 @@
#include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h"
#include "test_util/testutil.h"
#include "util/defer.h"
#include "util/random.h"
#include "utilities/fault_injection_env.h"
@ -6544,6 +6546,235 @@ TEST_P(RenameCurrentTest, Compaction) {
ASSERT_EQ("d_value", Get("d"));
}
TEST_F(DBTest2, VariousFileTemperatures) {
  // Verifies that WAL / MANIFEST / other metadata files are opened with the
  // expected Temperature, whether that comes from the FileSystem's
  // OptimizeForLogWrite()/OptimizeForManifestWrite() hooks or from DB options
  // (metadata_write_temperature, wal_write_temperature), and that SST file
  // temperatures follow default_write_temperature / last_level_temperature.
  constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;

  // FileSystem wrapper that asserts on the temperature of each newly opened
  // non-table file and tallies file creations/renames by FileType.
  struct MyTestFS : public FileTemperatureTestFS {
    explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
        : FileTemperatureTestFS(fs) {
      Reset();
    }

    IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
                             std::unique_ptr<FSWritableFile>* result,
                             IODebugContext* dbg) override {
      IOStatus ios =
          FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
      if (ios.ok()) {
        uint64_t number;
        FileType type;
        if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
          if (type == kTableFile) {
            // Not checked here
          } else if (type == kWalFile) {
            if (opts.temperature != expected_wal_temperature) {
              std::cerr << "Attempt to open " << fname << " with temperature "
                        << temperature_to_string[opts.temperature]
                        << " rather than "
                        << temperature_to_string[expected_wal_temperature]
                        << std::endl;
              assert(false);
            }
          } else if (type == kDescriptorFile) {
            if (opts.temperature != expected_manifest_temperature) {
              std::cerr << "Attempt to open " << fname << " with temperature "
                        << temperature_to_string[opts.temperature]
                        << " rather than "
                        // Fixed copy-paste bug: report the manifest
                        // expectation, not the WAL expectation.
                        << temperature_to_string[expected_manifest_temperature]
                        << std::endl;
              assert(false);
            }
          } else if (opts.temperature != expected_other_metadata_temperature) {
            std::cerr << "Attempt to open " << fname << " with temperature "
                      << temperature_to_string[opts.temperature]
                      << " rather than "
                      // Fixed copy-paste bug: report the other-metadata
                      // expectation, not the WAL expectation.
                      << temperature_to_string
                             [expected_other_metadata_temperature]
                      << std::endl;
            assert(false);
          }
          UpdateCount(type, 1);
        }
      }
      return ios;
    }

    IOStatus RenameFile(const std::string& src, const std::string& dst,
                        const IOOptions& options,
                        IODebugContext* dbg) override {
      IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
      if (ios.ok()) {
        uint64_t number;
        FileType src_type;
        FileType dst_type;
        assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
        assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
        // A rename moves a count from the source type to the destination type
        // (e.g. temp CURRENT file -> CURRENT).
        UpdateCount(src_type, -1);
        UpdateCount(dst_type, 1);
      }
      return ios;
    }

    void UpdateCount(FileType type, int delta) {
      size_t i = static_cast<size_t>(type);
      assert(i < kNumberFileTypes);
      counts[i].FetchAddRelaxed(delta);
    }

    // Returns the per-FileType creation counts accumulated since the last
    // call, resetting them to zero. Only non-zero entries are included.
    std::map<FileType, size_t> PopCounts() {
      std::map<FileType, size_t> ret;
      for (size_t i = 0; i < kNumberFileTypes; ++i) {
        int c = counts[i].ExchangeRelaxed(0);
        if (c > 0) {
          ret[static_cast<FileType>(i)] = c;
        }
      }
      return ret;
    }

    FileOptions OptimizeForLogWrite(
        const FileOptions& file_options,
        const DBOptions& /*db_options*/) const override {
      FileOptions opts = file_options;
      if (optimize_wal_temperature != Temperature::kUnknown) {
        opts.temperature = optimize_wal_temperature;
      }
      return opts;
    }

    FileOptions OptimizeForManifestWrite(
        const FileOptions& file_options) const override {
      FileOptions opts = file_options;
      if (optimize_manifest_temperature != Temperature::kUnknown) {
        opts.temperature = optimize_manifest_temperature;
      }
      return opts;
    }

    void Reset() {
      optimize_manifest_temperature = Temperature::kUnknown;
      optimize_wal_temperature = Temperature::kUnknown;
      expected_manifest_temperature = Temperature::kUnknown;
      expected_other_metadata_temperature = Temperature::kUnknown;
      expected_wal_temperature = Temperature::kUnknown;
      for (auto& c : counts) {
        c.StoreRelaxed(0);
      }
    }

    Temperature optimize_manifest_temperature;
    Temperature optimize_wal_temperature;
    Temperature expected_manifest_temperature;
    Temperature expected_other_metadata_temperature;
    Temperature expected_wal_temperature;
    std::array<RelaxedAtomic<int>, kNumberFileTypes> counts;
  };

  // We don't have enough non-unknown temps to confidently distinguish that
  // a specific setting caused a specific outcome, in a single run. This is a
  // reasonable work-around without blowing up test time. Only returns
  // non-unknown temperatures.
  auto RandomTemp = [] {
    static std::vector<Temperature> temps = {
        Temperature::kHot, Temperature::kWarm, Temperature::kCold};
    return temps[Random::GetTLSInstance()->Uniform(
        static_cast<int>(temps.size()))];
  };

  auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
  for (bool use_optimize : {false, true}) {
    std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl;
    for (bool use_temp_options : {false, true}) {
      std::cerr << "use_temp_options: " << std::to_string(use_temp_options)
                << std::endl;
      Options options = CurrentOptions();
      // Currently require for last level temperature
      options.compaction_style = kCompactionStyleUniversal;
      options.env = env.get();
      test_fs->Reset();
      if (use_optimize) {
        test_fs->optimize_manifest_temperature = RandomTemp();
        test_fs->expected_manifest_temperature =
            test_fs->optimize_manifest_temperature;
        test_fs->optimize_wal_temperature = RandomTemp();
        test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
      }
      if (use_temp_options) {
        // DB options take precedence over the OptimizeFor* hooks.
        options.metadata_write_temperature = RandomTemp();
        test_fs->expected_manifest_temperature =
            options.metadata_write_temperature;
        test_fs->expected_other_metadata_temperature =
            options.metadata_write_temperature;
        options.wal_write_temperature = RandomTemp();
        test_fs->expected_wal_temperature = options.wal_write_temperature;
        options.last_level_temperature = RandomTemp();
        options.default_write_temperature = RandomTemp();
      }

      DestroyAndReopen(options);
      Defer closer([&] { Close(); });

      using FTC = std::map<FileType, size_t>;
      // Files on DB startup
      ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
                                           {kDescriptorFile, 2},
                                           {kCurrentFile, 2},
                                           {kIdentityFile, 1},
                                           {kOptionsFile, 1}}));
      // Temperature count map
      using TCM = std::map<Temperature, size_t>;
      ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({}));

      ASSERT_OK(Put("foo", "1"));
      ASSERT_OK(Put("bar", "1"));
      ASSERT_OK(Flush());
      ASSERT_OK(Put("foo", "2"));
      ASSERT_OK(Put("bar", "2"));
      ASSERT_OK(Flush());
      ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
                TCM({{options.default_write_temperature, 2}}));

      ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
      ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
      ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
                TCM({{options.last_level_temperature, 1}}));

      ASSERT_OK(Put("foo", "3"));
      ASSERT_OK(Put("bar", "3"));
      ASSERT_OK(Flush());
      // Just in memtable/WAL
      ASSERT_OK(Put("dog", "3"));

      {
        TCM expected;
        expected[options.default_write_temperature] += 1;
        expected[options.last_level_temperature] += 1;
        ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected);
      }

      // New files during operation
      ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}}));

      Reopen(options);

      // New files during re-open/recovery
      ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
                                           {kTableFile, 1},
                                           {kDescriptorFile, 1},
                                           {kCurrentFile, 1},
                                           {kOptionsFile, 1}}));

      Destroy(options);
    }
  }
}
TEST_F(DBTest2, LastLevelTemperature) {
class TestListener : public EventListener {
public:

View File

@ -366,6 +366,11 @@ Options DBTestBase::GetOptions(
table_options.block_cache = NewLRUCache(/* too small */ 1);
}
// Test anticipated new default as much as reasonably possible (and remove
// this code when obsolete)
assert(!table_options.decouple_partitioned_filters);
table_options.decouple_partitioned_filters = true;
bool can_allow_mmap = IsMemoryMappedAccessSupported();
switch (option_config) {
case kHashSkipList:

View File

@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper {
return count;
}
// Builds a histogram of the live SST files keyed by their current
// temperature. Thread-safe: reads the file map under the FS mutex.
std::map<Temperature, size_t> CountCurrentSstFilesByTemp() {
  MutexLock lock(&mu_);
  std::map<Temperature, size_t> histogram;
  for (const auto& [file_number, temp] : current_sst_file_temperatures_) {
    (void)file_number;  // only the temperature matters here
    ++histogram[temp];
  }
  return histogram;
}
void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
MutexLock lock(&mu_);
current_sst_file_temperatures_[number] = temp;
@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper {
requested_sst_file_temperatures_;
std::map<uint64_t, Temperature> current_sst_file_temperatures_;
std::string GetFileName(const std::string& fname) {
static std::string GetFileName(const std::string& fname) {
auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
// workaround only for Windows that the file path could contain both Windows
// FilePathSeparator and '/'

View File

@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
options.num_levels = num_levels_;
options.write_buffer_size = 105 << 10; // 105KB
options.arena_block_size = 4 << 10;
options.target_file_size_base = 32 << 10; // 32KB
// trigger compaction if there are >= 4 files
options.level0_file_num_compaction_trigger = 4;
KeepFilterFactory* filter = new KeepFilterFactory(true);

View File

@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
ASSERT_OK(dbfull()->SyncWAL());
}
// Reproduces a recovery consistency problem with recycled WALs after a
// simulated crash. NOTE(review): kept DISABLED_ because the final
// point-in-time check (key3) currently fails — see "ONLY FAILURE" below.
TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
  Options options = CurrentOptions();
  options.max_write_buffer_number = 5;
  options.track_and_verify_wals_in_manifest = true;
  options.max_bgerror_resume_count = 0;  // manual resume
  options.recycle_log_file_num = 3;
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  // Disable truncating recycled WALs to new size in posix env
  // (approximating a crash)
  SyncPoint::GetInstance()->SetCallBack(
      "PosixWritableFile::Close",
      [](void* arg) { *(static_cast<size_t*>(arg)) = 0; });
  SyncPoint::GetInstance()->EnableProcessing();
  // Re-open with desired options
  DestroyAndReopen(options);
  Defer closer([this]() { Close(); });
  // Ensure WAL recycling wasn't sanitized away
  ASSERT_EQ(db_->GetOptions().recycle_log_file_num,
            options.recycle_log_file_num);
  // Prepare external files for later ingestion
  std::string sst_files_dir = dbname_ + "/sst_files/";
  ASSERT_OK(DestroyDir(env_, sst_files_dir));
  ASSERT_OK(env_->CreateDir(sst_files_dir));
  std::string external_file1 = sst_files_dir + "file1.sst";
  {
    SstFileWriter sst_file_writer(EnvOptions(), options);
    ASSERT_OK(sst_file_writer.Open(external_file1));
    ASSERT_OK(sst_file_writer.Put("external1", "ex1"));
    ExternalSstFileInfo file_info;
    ASSERT_OK(sst_file_writer.Finish(&file_info));
  }
  std::string external_file2 = sst_files_dir + "file2.sst";
  {
    SstFileWriter sst_file_writer(EnvOptions(), options);
    ASSERT_OK(sst_file_writer.Open(external_file2));
    ASSERT_OK(sst_file_writer.Put("external2", "ex2"));
    ExternalSstFileInfo file_info;
    ASSERT_OK(sst_file_writer.Finish(&file_info));
  }
  // Populate some WALs to be recycled such that there will be extra data
  // from an old incarnation of the WAL on recovery
  ASSERT_OK(db_->PauseBackgroundWork());
  ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(db_->ContinueBackgroundWork());
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(Flush());
  // Verify expected log files (still there for recycling)
  std::vector<FileAttributes> files;
  int log_count = 0;
  ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
  for (const auto& f : files) {
    if (EndsWith(f.name, ".log")) {
      EXPECT_GT(f.size_bytes, 500);
      ++log_count;
    }
  }
  EXPECT_EQ(log_count, 3);
  // (Re-used recipe) Generate two inactive WALs and one active WAL, with a
  // gap in sequence numbers to interfere with recovery
  ASSERT_OK(db_->PauseBackgroundWork());
  ASSERT_OK(Put("key1", "val1"));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(Put("key2", "val2"));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  // Need a gap in sequence numbers, so e.g. ingest external file
  // with an open snapshot
  {
    ManagedSnapshot snapshot(db_);
    ASSERT_OK(
        db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
  }
  ASSERT_OK(Put("key3", "val3"));
  ASSERT_OK(db_->SyncWAL());
  // Need an SST file that is logically after that WAL, so that dropping WAL
  // data is not a valid point in time.
  {
    ManagedSnapshot snapshot(db_);
    ASSERT_OK(
        db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
  }
  // Approximate a crash, with respect to recycled WAL data extending past
  // the end of the current WAL data (see SyncPoint callback above)
  Close();
  // Verify recycled log files haven't been truncated
  files.clear();
  log_count = 0;
  ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
  for (const auto& f : files) {
    if (EndsWith(f.name, ".log")) {
      EXPECT_GT(f.size_bytes, 500);
      ++log_count;
    }
  }
  EXPECT_EQ(log_count, 3);
  // Verify no data loss after reopen.
  Reopen(options);
  EXPECT_EQ("val1", Get("key1"));
  EXPECT_EQ("val2", Get("key2"));  // Passes because of adjacent seqnos
  EXPECT_EQ("ex1", Get("external1"));
  EXPECT_EQ("val3", Get("key3"));  // <- ONLY FAILURE! (Not a point in time)
  EXPECT_EQ("ex2", Get("external2"));

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBWALTest, SyncWalPartialFailure) {
class MyTestFileSystem : public FileSystemWrapper {
public:
@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
// * one inactive WAL, not synced, and
// * one active WAL, not synced
// with a single thread, to exercise as much logic as we reasonably can.
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->SyncWAL());

View File

@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
Close();
}
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
  // MultiGet across the default CF and a second CF with user-defined
  // timestamps: the same key written to both CFs at ts=(1,0) must come back
  // from each handle with the matching value and timestamp.
  const size_t kTimestampSize = Timestamp(0, 0).size();
  TestComparator test_cmp(kTimestampSize);

  Options default_cf_options = CurrentOptions();
  default_cf_options.env = env_;
  default_cf_options.create_if_missing = true;
  default_cf_options.avoid_flush_during_shutdown = true;
  default_cf_options.comparator = &test_cmp;
  DestroyAndReopen(default_cf_options);

  Options second_cf_options = CurrentOptions();
  second_cf_options.env = env_;
  second_cf_options.comparator = &test_cmp;
  ColumnFamilyHandle* handle = nullptr;
  ASSERT_OK(db_->CreateColumnFamily(second_cf_options, "data", &handle));

  // Write "a" -> "value" to both column families in one batch, then stamp
  // every entry with ts=(1,0).
  std::string ts = Timestamp(1, 0);
  WriteBatch batch(0, 0, 0, kTimestampSize);
  ASSERT_OK(batch.Put("a", "value"));
  ASSERT_OK(batch.Put(handle, "a", "value"));
  ASSERT_OK(batch.UpdateTimestamps(
      ts, [kTimestampSize](uint32_t /*cf_id*/) { return kTimestampSize; }));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  constexpr int kNumKeys = 2;
  std::vector<Slice> keys(kNumKeys, Slice("a"));
  std::vector<std::string> expected_values(kNumKeys, "value");
  std::vector<ColumnFamilyHandle*> handles = {db_->DefaultColumnFamily(),
                                              handle};
  {
    Slice read_ts_slice(ts);
    ReadOptions read_opts;
    read_opts.timestamp = &read_ts_slice;
    std::vector<PinnableSlice> values(kNumKeys);
    std::vector<Status> statuses(kNumKeys);
    std::vector<std::string> timestamps(kNumKeys);
    db_->MultiGet(read_opts, kNumKeys, handles.data(), keys.data(),
                  values.data(), timestamps.data(), statuses.data());
    for (int i = 0; i < kNumKeys; i++) {
      ASSERT_OK(statuses[i]);
      ASSERT_EQ(expected_values[i], values[i].ToString());
      ASSERT_EQ(ts, timestamps[i]);
    }
  }
  delete handle;
  Close();
}
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
Options options = CurrentOptions();
options.env = env_;

View File

@ -330,17 +330,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
// output : <user_provided_key>
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
                                             size_t ts_sz) {
  // Drop the internal footer (seqno+type) plus the trailing timestamp.
  // Removed unreachable duplicate implementation left after a merge: the
  // statements after the first `return` could never execute.
  assert(internal_key.size() >= kNumInternalBytes + ts_sz);
  return Slice(internal_key.data(),
               internal_key.size() - (kNumInternalBytes + ts_sz));
}
// input [user key]: <user_provided_key | ts>
// output: <user_provided_key>
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
  // Drop the trailing ts_sz timestamp bytes from the user key.
  // Removed unreachable duplicate implementation left after a merge: the
  // statements after the first `return` could never execute.
  assert(user_key.size() >= ts_sz);
  return Slice(user_key.data(), user_key.size() - ts_sz);
}
// input [user key]: <user_provided_key | ts>

View File

@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
<< "comparator" << table_properties.comparator_name
<< "user_defined_timestamps_persisted"
<< table_properties.user_defined_timestamps_persisted
<< "key_largest_seqno" << table_properties.key_largest_seqno
<< "merge_operator" << table_properties.merge_operator_name
<< "prefix_extractor_name"
<< table_properties.prefix_extractor_name << "property_collectors"

View File

@ -114,7 +114,6 @@ Status ExternalSstFileIngestionJob::Prepare(
const std::string path_inside_db = TableFileName(
cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
if (ingestion_options_.move_files) {
assert(!ingestion_options_.allow_db_generated_files);
status =
fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
if (status.ok()) {
@ -627,7 +626,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
DeleteInternalFiles();
consumed_seqno_count_ = 0;
files_overlap_ = false;
} else if (status.ok() && ingestion_options_.move_files) {
} else if (status.ok() && ingestion_options_.move_files &&
!ingestion_options_.allow_db_generated_files) {
// The files were moved and added successfully, remove original file links
for (IngestedFileInfo& f : files_to_ingest_) {
Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
@ -914,9 +914,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
} else if (!iter->status().ok()) {
return iter->status();
}
if (ingestion_options_.allow_db_generated_files) {
// Verify that all keys have seqno zero.
// TODO: store largest seqno in table property and validate it instead.
SequenceNumber largest_seqno =
table_reader.get()->GetTableProperties()->key_largest_seqno;
// UINT64_MAX means unknown and the file is generated before table property
// `key_largest_seqno` is introduced.
if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
return Status::Corruption(
"External file has non zero largest sequence number " +
std::to_string(largest_seqno));
}
if (ingestion_options_.allow_db_generated_files &&
largest_seqno == UINT64_MAX) {
// Need to verify that all keys have seqno zero.
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Status pik_status =
ParseInternalKey(iter->key(), &key, allow_data_in_errors);

View File

@ -674,10 +674,8 @@ class SstFileWriterCollector : public TablePropertiesCollector {
Status Finish(UserCollectedProperties* properties) override {
  // Record this collector's marker and entry count. Use insert() so that
  // properties already present in the output map are preserved; the old
  // whole-map assignment clobbered them (and made the following inserts
  // no-ops for our own keys).
  std::string count = std::to_string(count_);
  properties->insert({prefix_ + "_SstFileWriterCollector", "YES"});
  properties->insert({prefix_ + "_Count", count});
  return Status::OK();
}
@ -3727,13 +3725,14 @@ INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
std::make_tuple(true, true),
std::make_tuple(false, false)));
class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<bool> {
class IngestDBGeneratedFileTest
: public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
IngestDBGeneratedFileTest() {
ingest_opts.allow_db_generated_files = true;
ingest_opts.move_files = false;
ingest_opts.verify_checksums_before_ingest = GetParam();
ingest_opts.move_files = std::get<0>(GetParam());
ingest_opts.verify_checksums_before_ingest = std::get<1>(GetParam());
ingest_opts.snapshot_consistency = false;
}
@ -3742,9 +3741,16 @@ class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase,
};
INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest,
testing::Bool());
testing::Combine(testing::Bool(), testing::Bool()));
TEST_P(IngestDBGeneratedFileTest, FailureCase) {
if (encrypted_env_ && ingest_opts.move_files) {
// FIXME: should fail ingestion or support this combination.
ROCKSDB_GTEST_SKIP(
"Encrypted env and move_files do not work together, as we reopen the "
"file after linking it which appends an extra encryption prefix.");
return;
}
// Ingesting overlapping data should always fail.
do {
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
@ -3778,6 +3784,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
live_meta[0].relative_filename);
// Ingesting a file whose boundary key has non-zero seqno.
Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
// This error msg is from checking seqno of boundary keys.
ASSERT_TRUE(
s.ToString().find("External file has non zero sequence number") !=
std::string::npos);
@ -3824,10 +3831,9 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
live_meta[0].directory + "/" + live_meta[0].relative_filename;
s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
ASSERT_NOK(s);
ASSERT_TRUE(
s.ToString().find(
"External file has a key with non zero sequence number") !=
std::string::npos);
// This error msg is from checking largest seqno in table property.
ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
std::string::npos);
db_->ReleaseSnapshot(snapshot);
}
@ -3897,14 +3903,6 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
ASSERT_NOK(s);
ingest_opts.move_files = true;
s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
ingest_opts.move_files = false;
ASSERT_TRUE(
s.ToString().find("Options move_files and allow_db_generated_files are "
"not compatible") != std::string::npos);
ASSERT_NOK(s);
ingest_opts.snapshot_consistency = false;
ASSERT_OK(db_->IngestExternalFile(to_ingest_files, ingest_opts));
db_->ReleaseSnapshot(snapshot);
@ -3924,14 +3922,16 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
class IngestDBGeneratedFileTest2
: public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> {
public ::testing::WithParamInterface<
std::tuple<bool, bool, bool, bool, bool>> {
public:
IngestDBGeneratedFileTest2() = default;
};
INSTANTIATE_TEST_CASE_P(VaryingOptions, IngestDBGeneratedFileTest2,
testing::Combine(testing::Bool(), testing::Bool(),
testing::Bool(), testing::Bool()));
testing::Bool(), testing::Bool(),
testing::Bool()));
TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
// Use a separate column family to sort some data, generate multiple SST
@ -3939,11 +3939,11 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
// to be ingested does not overlap with existing data.
IngestExternalFileOptions ingest_opts;
ingest_opts.allow_db_generated_files = true;
ingest_opts.move_files = false;
ingest_opts.snapshot_consistency = std::get<0>(GetParam());
ingest_opts.allow_global_seqno = std::get<1>(GetParam());
ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
ingest_opts.move_files = std::get<4>(GetParam());
do {
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));

View File

@ -1156,6 +1156,11 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
// Find the newest user-defined timestamps from all the flushed memtables.
for (MemTable* m : mems_) {
Slice table_newest_udt = m->GetNewestUDT();
// Empty memtables can be legitimately created and flushed, for example
// by error recovery flush attempts.
if (table_newest_udt.empty()) {
continue;
}
if (cutoff_udt_.empty() ||
ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) {
if (!cutoff_udt_.empty()) {

View File

@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test {
}
void NewDB() {
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
VersionEdit new_db;
new_db.SetLogNumber(0);
@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test {
}
ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown, nullptr);
ASSERT_OK(s);
}

View File

@ -354,13 +354,13 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
}
TEST_F(EventListenerTest, MultiCF) {
Options options;
options.env = CurrentOptions().env;
options.write_buffer_size = k110KB;
#ifdef ROCKSDB_USING_THREAD_STATUS
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
for (auto atomic_flush : {false, true}) {
Options options;
options.env = CurrentOptions().env;
options.write_buffer_size = k110KB;
#ifdef ROCKSDB_USING_THREAD_STATUS
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
options.atomic_flush = atomic_flush;
options.create_if_missing = true;
DestroyAndReopen(options);

View File

@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
statistics(ioptions.stats),
merge_operator(ioptions.merge_operator.get()),
info_log(ioptions.logger),
allow_data_in_errors(ioptions.allow_data_in_errors),
protection_bytes_per_key(
mutable_cf_options.memtable_protection_bytes_per_key) {}
mutable_cf_options.memtable_protection_bytes_per_key),
allow_data_in_errors(ioptions.allow_data_in_errors),
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableOptions& ioptions,
@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_),
valid_(false),
seqno_to_time_mapping_(seqno_to_time_mapping),
arena_mode_(arena != nullptr),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
status_(Status::OK()),
logger_(mem.moptions_.info_log),
ts_sz_(mem.ts_sz_) {
ts_sz_(mem.ts_sz_),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
valid_(false),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
} else {
delete iter_;
}
status_.PermitUncheckedError();
}
#ifndef NDEBUG
@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif
bool Valid() const override { return valid_ && status_.ok(); }
bool Valid() const override {
// If inner iter_ is not valid, then this iter should also not be valid.
assert(iter_->Valid() || !(valid_ && status_.ok()));
return valid_ && status_.ok();
}
void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
// iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekForPrev(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) {
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) {
@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
}
}
}
iter_->Seek(k, nullptr);
if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
if (!Valid() && status().ok()) {
@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
}
}
void SeekToFirst() override {
status_ = Status::OK();
iter_->SeekToFirst();
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
void SeekToLast() override {
status_ = Status::OK();
iter_->SeekToLast();
valid_ = iter_->Valid();
VerifyEntryChecksum();
@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
void Next() override {
PERF_COUNTER_ADD(next_on_memtable_count, 1);
assert(Valid());
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
if (paranoid_memory_checks_) {
status_ = iter_->NextAndValidate(allow_data_in_error);
} else {
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
void Prev() override {
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
assert(Valid());
iter_->Prev();
if (paranoid_memory_checks_) {
status_ = iter_->PrevAndValidate(allow_data_in_error);
} else {
iter_->Prev();
}
valid_ = iter_->Valid();
VerifyEntryChecksum();
}
@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
const SliceTransform* const prefix_extractor_;
const MemTable::KeyComparator comparator_;
MemTableRep::Iterator* iter_;
bool valid_;
// The seqno to time mapping is owned by the SuperVersion.
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
bool arena_mode_;
bool value_pinned_;
uint32_t protection_bytes_per_key_;
Status status_;
Logger* logger_;
size_t ts_sz_;
uint32_t protection_bytes_per_key_;
bool valid_;
bool value_pinned_;
bool arena_mode_;
const bool paranoid_memory_checks_;
const bool allow_data_in_error;
void VerifyEntryChecksum() {
if (protection_bytes_per_key_ > 0 && Valid()) {
@ -933,6 +964,8 @@ static bool SaveValue(void* arg, const char* entry) {
Saver* s = static_cast<Saver*>(arg);
assert(s != nullptr);
assert(!s->value || !s->columns);
assert(!*(s->found_final_value));
assert(s->status->ok() || s->status->IsMergeInProgress());
MergeContext* merge_context = s->merge_context;
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
@ -966,6 +999,7 @@ static bool SaveValue(void* arg, const char* entry) {
*(s->status) = MemTable::VerifyEntryChecksum(
entry, s->protection_bytes_per_key, s->allow_data_in_errors);
if (!s->status->ok()) {
*(s->found_final_value) = true;
ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
// Memtable entry corrupted
return false;
@ -1231,6 +1265,7 @@ static bool SaveValue(void* arg, const char* entry) {
". ");
msg.append("seq: " + std::to_string(seq) + ".");
}
*(s->found_final_value) = true;
*(s->status) = Status::Corruption(msg.c_str());
return false;
}
@ -1310,8 +1345,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
// No change to value, since we have not yet found a Put/Delete
// Propagate corruption error
if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
*s = Status::MergeInProgress();
if (!found_final_value && merge_in_progress) {
if (s->ok()) {
*s = Status::MergeInProgress();
} else {
assert(s->IsMergeInProgress());
}
}
PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value;
@ -1347,7 +1386,19 @@ void MemTable::GetFromTable(const LookupKey& key,
saver.do_merge = do_merge;
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
table_->Get(key, &saver, SaveValue);
if (!moptions_.paranoid_memory_checks) {
table_->Get(key, &saver, SaveValue);
} else {
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
moptions_.allow_data_in_errors);
if (check_s.IsCorruption()) {
*(saver.status) = check_s;
// Should stop searching the LSM.
*(saver.found_final_value) = true;
}
}
assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
*seq = saver.seq;
}
@ -1421,10 +1472,19 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
&found_final_value, &merge_in_progress);
if (!found_final_value && merge_in_progress) {
*(iter->s) = Status::MergeInProgress();
if (iter->s->ok()) {
*(iter->s) = Status::MergeInProgress();
} else {
assert(iter->s->IsMergeInProgress());
}
}
if (found_final_value) {
if (found_final_value ||
(!iter->s->ok() && !iter->s->IsMergeInProgress())) {
// `found_final_value` should be set if an error/corruption occurs.
// The check on iter->s is just there in case GetFromTable() did not
// set `found_final_value` properly.
assert(found_final_value);
if (iter->value) {
iter->value->PinSelf();
range->AddValueSize(iter->value->size());

View File

@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
Statistics* statistics;
MergeOperator* merge_operator;
Logger* info_log;
bool allow_data_in_errors;
uint32_t protection_bytes_per_key;
bool allow_data_in_errors;
bool paranoid_memory_checks;
};
// Batched counters to updated when inserting keys in one write batch.
@ -249,12 +250,14 @@ class MemTable {
// If do_merge = true the default behavior which is Get value for key is
// executed. Expected behavior is described right below.
// If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error
// in *status and return true.
// If memtable contains a deletion for key, store NotFound() in *status and
// return true.
// If memtable contains Merge operation as the most recent entry for a key,
// and the merge process does not stop (not reaching a value or delete),
// prepend the current merge operand to *operands.
// store MergeInProgress in s, and return false.
// If an unexpected error or corruption occurs, store Corruption() or other
// error in *status and return true.
// Else, return false.
// If any operation was found, its most recent sequence number
// will be stored in *seq on success (regardless of whether true/false is
@ -264,6 +267,11 @@ class MemTable {
// If do_merge = false then any Merge Operands encountered for key are simply
// stored in merge_context.operands_list and never actually merged to get a
// final value. The raw Merge Operands are eventually returned to the user.
// @param value If not null and memtable contains a value for key, `value`
// will be set to the result value.
// @param column If not null and memtable contains a value/WideColumn for key,
// `column` will be set to the result value/WideColumn.
// Note: only one of `value` and `column` can be non-nullptr.
// @param immutable_memtable Whether this memtable is immutable. Used
// internally by NewRangeTombstoneIterator(). See comment above
// NewRangeTombstoneIterator() for more detail.

View File

@ -181,7 +181,8 @@ bool MemTableListVersion::GetFromList(
}
if (done) {
assert(*seq != kMaxSequenceNumber || s->IsNotFound());
assert(*seq != kMaxSequenceNumber ||
(!s->ok() && !s->IsMergeInProgress()));
return true;
}
if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {

View File

@ -287,6 +287,7 @@ TEST_F(MemTableListTest, GetTest) {
// Fetch the newly written keys
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -295,6 +296,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value1");
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -303,6 +305,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -311,6 +314,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value2.2");
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -350,6 +354,7 @@ TEST_F(MemTableListTest, GetTest) {
// Fetch keys via MemTableList
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -357,6 +362,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found = list.current()->Get(LookupKey("key1", saved_seq), &value,
/*columns=*/nullptr, /*timestamp=*/nullptr, &s,
&merge_context, &max_covering_tombstone_seq,
@ -365,6 +371,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ("value1", value);
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -373,12 +380,14 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value2.3");
merge_context.Clear();
s = Status::OK();
found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions());
ASSERT_FALSE(found);
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -438,6 +447,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Fetch the newly written keys
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -446,6 +456,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(),
@ -462,6 +473,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Fetch keys via MemTableList
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -469,6 +481,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -508,6 +521,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify keys are present in history
merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -515,6 +529,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -568,6 +583,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify keys are no longer in MemTableList
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -575,6 +591,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_FALSE(found);
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -582,6 +599,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_FALSE(found);
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,
@ -590,6 +608,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify that the second memtable's keys are in the history
merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -597,6 +616,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -606,6 +626,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify that key2 from the first memtable is no longer in the history
merge_context.Clear();
s = Status::OK();
found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context,

View File

@ -29,6 +29,7 @@
#include "db/internal_stats.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/version_edit_handler.h"
#include "db/version_set.h"
#include "port/port.h"
#include "table/table_reader.h"
@ -37,6 +38,25 @@
namespace ROCKSDB_NAMESPACE {
class VersionBuilder::Rep {
class NewestFirstBySeqNo {
public:
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
assert(lhs);
assert(rhs);
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
}
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
}
// Break ties by file number
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
}
};
class NewestFirstByEpochNumber {
private:
inline static const NewestFirstBySeqNo seqno_cmp;
@ -249,9 +269,10 @@ class VersionBuilder::Rep {
std::unordered_map<uint64_t, int> table_file_levels_;
// Current compact cursors that should be changed after the last compaction
std::unordered_map<int, InternalKey> updated_compact_cursors_;
NewestFirstByEpochNumber level_zero_cmp_by_epochno_;
NewestFirstBySeqNo level_zero_cmp_by_seqno_;
BySmallestKey level_nonzero_cmp_;
const std::shared_ptr<const NewestFirstByEpochNumber>
level_zero_cmp_by_epochno_;
const std::shared_ptr<const NewestFirstBySeqNo> level_zero_cmp_by_seqno_;
const std::shared_ptr<const BySmallestKey> level_nonzero_cmp_;
// Mutable metadata objects for all blob files affected by the series of
// version edits.
@ -259,11 +280,56 @@ class VersionBuilder::Rep {
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
ColumnFamilyData* cfd_;
VersionEditHandler* version_edit_handler_;
bool track_found_and_missing_files_;
// If false, only a complete Version with all files consisting it found is
// considered valid. If true, besides complete Version, if the Version is
// never edited in an atomic group, an incomplete Version with only a suffix
// of L0 files missing is also considered valid.
bool allow_incomplete_valid_version_;
// These are only tracked if `track_found_and_missing_files_` is enabled.
// The SST files that are found (blob files not included yet).
std::unordered_set<uint64_t> found_files_;
// Missing SST files for L0
std::unordered_set<uint64_t> l0_missing_files_;
// Missing SST files for non L0 levels
std::unordered_set<uint64_t> non_l0_missing_files_;
// Intermediate SST files (blob files not included yet)
std::vector<std::string> intermediate_files_;
// The highest file number for all the missing blob files, useful to check
// if a complete Version is available.
uint64_t missing_blob_files_high_ = kInvalidBlobFileNumber;
// Missing blob files, useful to check if only the missing L0 files'
// associated blob files are missing.
std::unordered_set<uint64_t> missing_blob_files_;
// True if all files consisting the Version can be found. Or if
// `allow_incomplete_valid_version_` is true and the version history is not
// ever edited in an atomic group, this will be true if only a
// suffix of L0 SST files and their associated blob files are missing.
bool valid_version_available_;
// True if version is ever edited in an atomic group.
bool edited_in_atomic_group_;
// Flag to indicate if the Version is updated since last validity check. If no
// `Apply` call is made between a `Rep`'s construction and a
// `ValidVersionAvailable` check or between two `ValidVersionAvailable` calls.
// This flag will be true to indicate the cached validity value can be
// directly used without a recheck.
bool version_updated_since_last_check_;
// End of fields that are only tracked when `track_found_and_missing_files_`
// is enabled.
public:
Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
TableCache* table_cache, VersionStorageInfo* base_vstorage,
VersionSet* version_set,
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: file_options_(file_options),
ioptions_(ioptions),
table_cache_(table_cache),
@ -271,11 +337,76 @@ class VersionBuilder::Rep {
version_set_(version_set),
num_levels_(base_vstorage->num_levels()),
has_invalid_levels_(false),
level_nonzero_cmp_(base_vstorage_->InternalComparator()),
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
level_zero_cmp_by_epochno_(
std::make_shared<NewestFirstByEpochNumber>()),
level_zero_cmp_by_seqno_(std::make_shared<NewestFirstBySeqNo>()),
level_nonzero_cmp_(std::make_shared<BySmallestKey>(
base_vstorage_->InternalComparator())),
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr),
cfd_(cfd),
version_edit_handler_(version_edit_handler),
track_found_and_missing_files_(track_found_and_missing_files),
allow_incomplete_valid_version_(allow_incomplete_valid_version) {
assert(ioptions_);
levels_ = new LevelState[num_levels_];
if (track_found_and_missing_files_) {
assert(cfd_);
assert(version_edit_handler_);
// `track_found_and_missing_files_` mode used by VersionEditHandlerPIT
// assumes the initial base version is valid. For best efforts recovery,
// base will be empty. For manifest tailing usage like secondary instance,
// they do not allow incomplete version, so the base version in subsequent
// catch up attempts should be valid too.
valid_version_available_ = true;
edited_in_atomic_group_ = false;
version_updated_since_last_check_ = false;
}
}
Rep(const Rep& other)
: file_options_(other.file_options_),
ioptions_(other.ioptions_),
table_cache_(other.table_cache_),
base_vstorage_(other.base_vstorage_),
version_set_(other.version_set_),
num_levels_(other.num_levels_),
invalid_level_sizes_(other.invalid_level_sizes_),
has_invalid_levels_(other.has_invalid_levels_),
table_file_levels_(other.table_file_levels_),
updated_compact_cursors_(other.updated_compact_cursors_),
level_zero_cmp_by_epochno_(other.level_zero_cmp_by_epochno_),
level_zero_cmp_by_seqno_(other.level_zero_cmp_by_seqno_),
level_nonzero_cmp_(other.level_nonzero_cmp_),
mutable_blob_file_metas_(other.mutable_blob_file_metas_),
file_metadata_cache_res_mgr_(other.file_metadata_cache_res_mgr_),
cfd_(other.cfd_),
version_edit_handler_(other.version_edit_handler_),
track_found_and_missing_files_(other.track_found_and_missing_files_),
allow_incomplete_valid_version_(other.allow_incomplete_valid_version_),
found_files_(other.found_files_),
l0_missing_files_(other.l0_missing_files_),
non_l0_missing_files_(other.non_l0_missing_files_),
intermediate_files_(other.intermediate_files_),
missing_blob_files_high_(other.missing_blob_files_high_),
missing_blob_files_(other.missing_blob_files_),
valid_version_available_(other.valid_version_available_),
edited_in_atomic_group_(other.edited_in_atomic_group_),
version_updated_since_last_check_(
other.version_updated_since_last_check_) {
assert(ioptions_);
levels_ = new LevelState[num_levels_];
for (int level = 0; level < num_levels_; level++) {
levels_[level] = other.levels_[level];
const auto& added = levels_[level].added_files;
for (auto& pair : added) {
RefFile(pair.second);
}
}
if (track_found_and_missing_files_) {
assert(cfd_);
assert(version_edit_handler_);
}
}
~Rep() {
@ -289,6 +420,12 @@ class VersionBuilder::Rep {
delete[] levels_;
}
void RefFile(FileMetaData* f) {
assert(f);
assert(f->refs > 0);
f->refs++;
}
void UnrefFile(FileMetaData* f) {
f->refs--;
if (f->refs <= 0) {
@ -397,7 +534,7 @@ class VersionBuilder::Rep {
if (epoch_number_requirement ==
EpochNumberRequirement::kMightMissing) {
if (!level_zero_cmp_by_seqno_(lhs, rhs)) {
if (!level_zero_cmp_by_seqno_->operator()(lhs, rhs)) {
std::ostringstream oss;
oss << "L0 files are not sorted properly: files #"
<< lhs->fd.GetNumber() << " with seqnos (largest, smallest) "
@ -429,7 +566,7 @@ class VersionBuilder::Rep {
}
}
if (!level_zero_cmp_by_epochno_(lhs, rhs)) {
if (!level_zero_cmp_by_epochno_->operator()(lhs, rhs)) {
std::ostringstream oss;
oss << "L0 files are not sorted properly: files #"
<< lhs->fd.GetNumber() << " with epoch number "
@ -458,7 +595,7 @@ class VersionBuilder::Rep {
assert(lhs);
assert(rhs);
if (!level_nonzero_cmp_(lhs, rhs)) {
if (!level_nonzero_cmp_->operator()(lhs, rhs)) {
std::ostringstream oss;
oss << 'L' << level << " files are not sorted properly: files #"
<< lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
@ -634,7 +771,22 @@ class VersionBuilder::Rep {
mutable_blob_file_metas_.emplace(
blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
return Status::OK();
Status s;
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
s = version_edit_handler_->VerifyBlobFile(cfd_, blob_file_number,
blob_file_addition);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_blob_files_high_ =
std::max(missing_blob_files_high_, blob_file_number);
missing_blob_files_.insert(blob_file_number);
s = Status::OK();
} else if (!s.ok()) {
return s;
}
}
return s;
}
Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
@ -752,6 +904,29 @@ class VersionBuilder::Rep {
table_file_levels_[file_number] =
VersionStorageInfo::FileLocation::Invalid().GetLevel();
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
l0_missing_files_.erase(file_number);
} else if (non_l0_missing_files_.find(file_number) !=
non_l0_missing_files_.end()) {
non_l0_missing_files_.erase(file_number);
} else {
auto fiter = found_files_.find(file_number);
// Only mark new files added during this catchup attempt for deletion.
// These files were never installed in VersionStorageInfo.
// Already referenced files that are deleted by a VersionEdit will
// be added to the VersionStorageInfo's obsolete files when the old
// version is dereferenced.
if (fiter != found_files_.end()) {
assert(!ioptions_->cf_paths.empty());
intermediate_files_.emplace_back(
MakeTableFileName(ioptions_->cf_paths[0].path, file_number));
found_files_.erase(fiter);
}
}
}
return Status::OK();
}
@ -824,7 +999,31 @@ class VersionBuilder::Rep {
table_file_levels_[file_number] = level;
return Status::OK();
Status s;
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
assert(!ioptions_->cf_paths.empty());
const std::string fpath =
MakeTableFileName(ioptions_->cf_paths[0].path, file_number);
s = version_edit_handler_->VerifyFile(cfd_, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
if (0 == level) {
l0_missing_files_.insert(file_number);
} else {
non_l0_missing_files_.insert(file_number);
}
if (s.IsCorruption()) {
found_files_.insert(file_number);
}
s = Status::OK();
} else if (!s.ok()) {
return s;
} else {
found_files_.insert(file_number);
}
}
return s;
}
Status ApplyCompactCursors(int level,
@ -845,6 +1044,7 @@ class VersionBuilder::Rep {
// Apply all of the edits in *edit to the current state.
Status Apply(const VersionEdit* edit) {
bool version_updated = false;
{
const Status s = CheckConsistency(base_vstorage_);
if (!s.ok()) {
@ -862,6 +1062,7 @@ class VersionBuilder::Rep {
if (!s.ok()) {
return s;
}
version_updated = true;
}
// Increase the amount of garbage for blob files affected by GC
@ -870,6 +1071,7 @@ class VersionBuilder::Rep {
if (!s.ok()) {
return s;
}
version_updated = true;
}
// Delete table files
@ -881,6 +1083,7 @@ class VersionBuilder::Rep {
if (!s.ok()) {
return s;
}
version_updated = true;
}
// Add new table files
@ -892,6 +1095,7 @@ class VersionBuilder::Rep {
if (!s.ok()) {
return s;
}
version_updated = true;
}
// Populate compact cursors for round-robin compaction, leave
@ -904,6 +1108,13 @@ class VersionBuilder::Rep {
return s;
}
}
if (track_found_and_missing_files_ && version_updated) {
version_updated_since_last_check_ = true;
if (!edited_in_atomic_group_ && edit->IsInAtomicGroup()) {
edited_in_atomic_group_ = true;
}
}
return Status::OK();
}
@ -1046,14 +1257,35 @@ class VersionBuilder::Rep {
mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
}
bool OnlyLinkedToMissingL0Files(
const std::unordered_set<uint64_t>& linked_ssts) const {
return std::all_of(
linked_ssts.begin(), linked_ssts.end(), [&](const uint64_t& element) {
return l0_missing_files_.find(element) != l0_missing_files_.end();
});
}
// Add the blob file specified by meta to *vstorage if it is determined to
// contain valid data (blobs).
template <typename Meta>
static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta,
uint64_t blob_file_number) const {
assert(vstorage);
assert(meta);
if (meta->GetLinkedSsts().empty() &&
const auto& linked_ssts = meta->GetLinkedSsts();
if (track_found_and_missing_files_) {
if (missing_blob_files_.find(blob_file_number) !=
missing_blob_files_.end()) {
return;
}
// Leave the empty case for the below blob garbage collection logic.
if (!linked_ssts.empty() && OnlyLinkedToMissingL0Files(linked_ssts)) {
return;
}
}
if (linked_ssts.empty() &&
meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
return;
}
@ -1065,6 +1297,7 @@ class VersionBuilder::Rep {
// applied, and save the result into *vstorage.
void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
assert(vstorage);
assert(!track_found_and_missing_files_ || valid_version_available_);
assert(base_vstorage_);
vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
@ -1080,22 +1313,24 @@ class VersionBuilder::Rep {
}
auto process_base =
[vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
[this, vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
assert(base_meta);
AddBlobFileIfNeeded(vstorage, base_meta);
AddBlobFileIfNeeded(vstorage, base_meta,
base_meta->GetBlobFileNumber());
return true;
};
auto process_mutable =
[vstorage](const MutableBlobFileMetaData& mutable_meta) {
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
[this, vstorage](const MutableBlobFileMetaData& mutable_meta) {
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
mutable_meta.GetBlobFileNumber());
return true;
};
auto process_both = [vstorage](
auto process_both = [this, vstorage](
const std::shared_ptr<BlobFileMetaData>& base_meta,
const MutableBlobFileMetaData& mutable_meta) {
assert(base_meta);
@ -1108,12 +1343,14 @@ class VersionBuilder::Rep {
mutable_meta.GetGarbageBlobBytes());
assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
AddBlobFileIfNeeded(vstorage, base_meta);
AddBlobFileIfNeeded(vstorage, base_meta,
base_meta->GetBlobFileNumber());
return true;
}
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
mutable_meta.GetBlobFileNumber());
return true;
};
@ -1125,6 +1362,10 @@ class VersionBuilder::Rep {
void MaybeAddFile(VersionStorageInfo* vstorage, int level,
FileMetaData* f) const {
const uint64_t file_number = f->fd.GetNumber();
if (track_found_and_missing_files_ && level == 0 &&
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
return;
}
const auto& level_state = levels_[level];
@ -1148,6 +1389,29 @@ class VersionBuilder::Rep {
}
}
bool ContainsCompleteVersion() const {
assert(track_found_and_missing_files_);
return l0_missing_files_.empty() && non_l0_missing_files_.empty() &&
(missing_blob_files_high_ == kInvalidBlobFileNumber ||
missing_blob_files_high_ < GetMinOldestBlobFileNumber());
}
bool HasMissingFiles() const {
assert(track_found_and_missing_files_);
return !l0_missing_files_.empty() || !non_l0_missing_files_.empty() ||
missing_blob_files_high_ != kInvalidBlobFileNumber;
}
std::vector<std::string>& GetAndClearIntermediateFiles() {
assert(track_found_and_missing_files_);
return intermediate_files_;
}
void ClearFoundFiles() {
assert(track_found_and_missing_files_);
found_files_.clear();
}
template <typename Cmp>
void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
// Merge the set of added files with the set of pre-existing files.
@ -1156,6 +1420,16 @@ class VersionBuilder::Rep {
const auto& unordered_added_files = levels_[level].added_files;
vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, cmp,
[&](FileMetaData* file) { MaybeAddFile(vstorage, level, file); });
}
template <typename Cmp, typename AddFileFunc>
void MergeUnorderdAddedFilesWithBase(
const std::vector<FileMetaData*>& base_files,
const std::unordered_map<uint64_t, FileMetaData*>& unordered_added_files,
Cmp cmp, AddFileFunc add_file_func) const {
// Sort added files for the level.
std::vector<FileMetaData*> added_files;
added_files.reserve(unordered_added_files.size());
@ -1171,9 +1445,9 @@ class VersionBuilder::Rep {
while (added_iter != added_end || base_iter != base_end) {
if (base_iter == base_end ||
(added_iter != added_end && cmp(*added_iter, *base_iter))) {
MaybeAddFile(vstorage, level, *added_iter++);
add_file_func(*added_iter++);
} else {
MaybeAddFile(vstorage, level, *base_iter++);
add_file_func(*base_iter++);
}
}
}
@ -1215,13 +1489,13 @@ class VersionBuilder::Rep {
}
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_);
SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_seqno_);
} else {
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_);
SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_epochno_);
}
for (int level = 1; level < num_levels_; ++level) {
SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
SaveSSTFilesTo(vstorage, level, *level_nonzero_cmp_);
}
}
@ -1232,8 +1506,111 @@ class VersionBuilder::Rep {
}
}
bool ValidVersionAvailable() {
assert(track_found_and_missing_files_);
if (version_updated_since_last_check_) {
valid_version_available_ = ContainsCompleteVersion();
if (!valid_version_available_ && !edited_in_atomic_group_ &&
allow_incomplete_valid_version_) {
valid_version_available_ = OnlyMissingL0Suffix();
}
version_updated_since_last_check_ = false;
}
return valid_version_available_;
}
bool OnlyMissingL0Suffix() const {
if (!non_l0_missing_files_.empty()) {
return false;
}
assert(!(l0_missing_files_.empty() && missing_blob_files_.empty()));
if (!l0_missing_files_.empty() && !MissingL0FilesAreL0Suffix()) {
return false;
}
if (!missing_blob_files_.empty() &&
!RemainingSstFilesNotMissingBlobFiles()) {
return false;
}
return true;
}
// Check missing L0 files are a suffix of expected sorted L0 files.
bool MissingL0FilesAreL0Suffix() const {
assert(non_l0_missing_files_.empty());
assert(!l0_missing_files_.empty());
std::vector<FileMetaData*> expected_sorted_l0_files;
const auto& base_files = base_vstorage_->LevelFiles(0);
const auto& unordered_added_files = levels_[0].added_files;
expected_sorted_l0_files.reserve(base_files.size() +
unordered_added_files.size());
EpochNumberRequirement epoch_number_requirement =
base_vstorage_->GetEpochNumberRequirement();
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, *level_zero_cmp_by_seqno_,
[&](FileMetaData* file) {
expected_sorted_l0_files.push_back(file);
});
} else {
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, *level_zero_cmp_by_epochno_,
[&](FileMetaData* file) {
expected_sorted_l0_files.push_back(file);
});
}
assert(expected_sorted_l0_files.size() >= l0_missing_files_.size());
std::unordered_set<uint64_t> unaddressed_missing_files = l0_missing_files_;
for (auto iter = expected_sorted_l0_files.begin();
iter != expected_sorted_l0_files.end(); iter++) {
uint64_t file_number = (*iter)->fd.GetNumber();
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
assert(unaddressed_missing_files.find(file_number) !=
unaddressed_missing_files.end());
unaddressed_missing_files.erase(file_number);
} else if (!unaddressed_missing_files.empty()) {
return false;
} else {
break;
}
}
return true;
}
// Check that each missing blob file either is older than the minimum oldest
// blob file required by this Version or is only linked to the missing L0
// files.
bool RemainingSstFilesNotMissingBlobFiles() const {
assert(non_l0_missing_files_.empty());
assert(!missing_blob_files_.empty());
bool no_l0_files_missing = l0_missing_files_.empty();
uint64_t min_oldest_blob_file_num = GetMinOldestBlobFileNumber();
for (const auto& missing_blob_file : missing_blob_files_) {
if (missing_blob_file < min_oldest_blob_file_num) {
continue;
}
auto iter = mutable_blob_file_metas_.find(missing_blob_file);
assert(iter != mutable_blob_file_metas_.end());
const std::unordered_set<uint64_t>& linked_ssts =
iter->second.GetLinkedSsts();
// TODO(yuzhangyu): In theory, if no L0 SST files are missing, and only
// blob files exclusively linked to an L0 suffix are missing, we can
// recover to a valid point in time too. We don't recover that type of
// incomplete Version yet.
if (!linked_ssts.empty() && no_l0_files_missing) {
return false;
}
if (!OnlyLinkedToMissingL0Files(linked_ssts)) {
return false;
}
}
return true;
}
// Save the current state in *vstorage.
Status SaveTo(VersionStorageInfo* vstorage) const {
assert(!track_found_and_missing_files_ || valid_version_available_);
Status s;
#ifndef NDEBUG
@ -1266,6 +1643,7 @@ class VersionBuilder::Rep {
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) {
assert(table_cache_ != nullptr);
assert(!track_found_and_missing_files_ || valid_version_available_);
size_t table_cache_capacity =
table_cache_->get_cache().get()->GetCapacity();
@ -1305,6 +1683,11 @@ class VersionBuilder::Rep {
for (int level = 0; level < num_levels_; level++) {
for (auto& file_meta_pair : levels_[level].added_files) {
auto* file_meta = file_meta_pair.second;
uint64_t file_number = file_meta->fd.GetNumber();
if (track_found_and_missing_files_ && level == 0 &&
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
continue;
}
// If the file has been opened before, just skip it.
if (!file_meta->table_reader_handle) {
files_meta.emplace_back(file_meta, level);
@ -1369,9 +1752,13 @@ VersionBuilder::VersionBuilder(
const FileOptions& file_options, const ImmutableCFOptions* ioptions,
TableCache* table_cache, VersionStorageInfo* base_vstorage,
VersionSet* version_set,
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
version_set, file_metadata_cache_res_mgr)) {}
version_set, file_metadata_cache_res_mgr, cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)) {}
VersionBuilder::~VersionBuilder() = default;
@ -1399,27 +1786,71 @@ Status VersionBuilder::LoadTableHandlers(
read_options, block_protection_bytes_per_key);
}
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
return rep_->GetMinOldestBlobFileNumber();
void VersionBuilder::CreateOrReplaceSavePoint() {
assert(rep_);
savepoint_ = std::move(rep_);
rep_ = std::make_unique<Rep>(*savepoint_);
}
bool VersionBuilder::ValidVersionAvailable() {
return rep_->ValidVersionAvailable();
}
bool VersionBuilder::HasMissingFiles() const { return rep_->HasMissingFiles(); }
std::vector<std::string>& VersionBuilder::GetAndClearIntermediateFiles() {
return rep_->GetAndClearIntermediateFiles();
}
void VersionBuilder::ClearFoundFiles() { return rep_->ClearFoundFiles(); }
Status VersionBuilder::SaveSavePointTo(VersionStorageInfo* vstorage) const {
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
return Status::InvalidArgument();
}
return savepoint_->SaveTo(vstorage);
}
Status VersionBuilder::LoadSavePointTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) {
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
return Status::InvalidArgument();
}
return savepoint_->LoadTableHandlers(
internal_stats, max_threads, prefetch_index_and_filter_in_cache,
is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
read_options, block_protection_bytes_per_key);
}
void VersionBuilder::ClearSavePoint() { savepoint_.reset(nullptr); }
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd)
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->ioptions(),
cfd->table_cache(), cfd->current()->storage_info(),
cfd->current()->version_set(),
cfd->GetFileMetadataCacheReservationManager())),
cfd->GetFileMetadataCacheReservationManager(), cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)),
version_(cfd->current()) {
version_->Ref();
}
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, Version* v)
ColumnFamilyData* cfd, Version* v, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->ioptions(),
cfd->table_cache(), v->storage_info(), v->version_set(),
cfd->GetFileMetadataCacheReservationManager())),
cfd->GetFileMetadataCacheReservationManager(), cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)),
version_(v) {
assert(version_ != cfd->current());
}

View File

@ -26,6 +26,7 @@ struct FileMetaData;
class InternalStats;
class Version;
class VersionSet;
class VersionEditHandler;
class ColumnFamilyData;
class CacheReservationManager;
@ -38,22 +39,80 @@ class VersionBuilder {
const ImmutableCFOptions* ioptions, TableCache* table_cache,
VersionStorageInfo* base_vstorage, VersionSet* version_set,
std::shared_ptr<CacheReservationManager>
file_metadata_cache_res_mgr = nullptr);
file_metadata_cache_res_mgr = nullptr,
ColumnFamilyData* cfd = nullptr,
VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
~VersionBuilder();
bool CheckConsistencyForNumLevels();
Status Apply(const VersionEdit* edit);
// Save the current Version to the provided `vstorage`.
Status SaveTo(VersionStorageInfo* vstorage) const;
// Load all the table handlers for the current Version in the builder.
Status LoadTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key);
uint64_t GetMinOldestBlobFileNumber() const;
//============APIs only used by VersionEditHandlerPointInTime ============//
// Creates a save point for the Version that has been built so far. Subsequent
// VersionEdits applied to the builder will not affect the Version in this
// save point. VersionBuilder currently only supports creating one save point,
// so when `CreateOrReplaceSavePoint` is called again, the previous save point
// is cleared. `ClearSavePoint` can be called explicitly to clear
// the save point too.
void CreateOrReplaceSavePoint();
// Returns true if the builder can find all the files needed to build a
// `Version`, or if `allow_incomplete_valid_version_` is true, the version
// history was never edited in an atomic group, and only a suffix of L0 SST
// files and their associated blob files are missing.
// From the users' perspective, missing a suffix of L0 files means missing the
// user's most recently written data. So the remaining available files still
// present a valid point-in-time view, although for some previous time.
// This validity check result will be cached and reused if the Version is not
// updated between two validity checks.
bool ValidVersionAvailable();
bool HasMissingFiles() const;
// When applying a sequence of VersionEdits, intermediate files are the ones
// that are added and then deleted. The caller should clear this
// intermediate-file tracking after calling this API, so that the tracking for
// subsequent VersionEdits can start over with a clean state.
std::vector<std::string>& GetAndClearIntermediateFiles();
// Clears all the found files in this Version.
void ClearFoundFiles();
// Save the Version in the save point to the provided `vstorage`.
// Non-OK status will be returned if there is not a valid save point.
Status SaveSavePointTo(VersionStorageInfo* vstorage) const;
// Load all the table handlers for the Version in the save point.
// Non-OK status will be returned if there is not a valid save point.
Status LoadSavePointTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key);
void ClearSavePoint();
//==== End of APIs only used by VersionEditHandlerPointInTime ====//
private:
class Rep;
std::unique_ptr<Rep> savepoint_;
std::unique_ptr<Rep> rep_;
};
@ -62,8 +121,15 @@ class VersionBuilder {
// Both of the constructor and destructor need to be called inside DB Mutex.
class BaseReferencedVersionBuilder {
public:
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
explicit BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, Version* v,
VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
~BaseReferencedVersionBuilder();
VersionBuilder* version_builder() const { return version_builder_.get(); }
@ -71,23 +137,4 @@ class BaseReferencedVersionBuilder {
std::unique_ptr<VersionBuilder> version_builder_;
Version* version_;
};
class NewestFirstBySeqNo {
public:
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
assert(lhs);
assert(rhs);
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
}
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
}
// Break ties by file number
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
}
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -155,6 +155,7 @@ VersionEditHandler::VersionEditHandler(
VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files,
bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement)
: VersionEditHandlerBase(read_options),
read_only_(read_only),
@ -165,6 +166,7 @@ VersionEditHandler::VersionEditHandler(
io_tracer_(io_tracer),
skip_load_table_files_(skip_load_table_files),
initialized_(false),
allow_incomplete_valid_version_(allow_incomplete_valid_version),
epoch_number_requirement_(epoch_number_requirement) {
assert(version_set_ != nullptr);
}
@ -218,15 +220,15 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
ColumnFamilyData** cfd) {
bool cf_in_not_found = false;
bool do_not_open_cf = false;
bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr);
*cfd = nullptr;
const std::string& cf_name = edit.GetColumnFamilyName();
Status s;
if (cf_in_builders || cf_in_not_found) {
if (cf_in_builders || do_not_open_cf) {
s = Status::Corruption("MANIFEST adding the same column family twice: " +
cf_name);
}
@ -239,7 +241,7 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
cf_name.compare(kPersistentStatsColumnFamilyName) == 0;
if (cf_options == name_to_options_.end() &&
!is_persistent_stats_column_family) {
column_families_not_found_.emplace(edit.GetColumnFamily(), cf_name);
do_not_open_column_families_.emplace(edit.GetColumnFamily(), cf_name);
} else {
if (is_persistent_stats_column_family) {
ColumnFamilyOptions cfo;
@ -256,9 +258,9 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
ColumnFamilyData** cfd) {
bool cf_in_not_found = false;
bool do_not_open_cf = false;
bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr);
*cfd = nullptr;
@ -266,8 +268,8 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
Status s;
if (cf_in_builders) {
tmp_cfd = DestroyCfAndCleanup(edit);
} else if (cf_in_not_found) {
column_families_not_found_.erase(edit.GetColumnFamily());
} else if (do_not_open_cf) {
do_not_open_column_families_.erase(edit.GetColumnFamily());
} else {
s = Status::Corruption("MANIFEST - dropping non-existing column family");
}
@ -288,22 +290,20 @@ Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
ColumnFamilyData** cfd) {
bool cf_in_not_found = false;
bool do_not_open_cf = false;
bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr);
*cfd = nullptr;
Status s;
if (!cf_in_not_found) {
if (!do_not_open_cf) {
if (!cf_in_builders) {
s = Status::Corruption(
"MANIFEST record referencing unknown column family");
}
ColumnFamilyData* tmp_cfd = nullptr;
if (s.ok()) {
auto builder_iter = builders_.find(edit.GetColumnFamily());
assert(builder_iter != builders_.end());
tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
edit.GetColumnFamily());
assert(tmp_cfd != nullptr);
@ -318,56 +318,33 @@ Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
if (!s.ok()) {
return s;
}
s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
if (s.ok()) {
s = builder_iter->second->version_builder()->Apply(&edit);
}
s = MaybeCreateVersionBeforeApplyEdit(edit, tmp_cfd,
/*force_create_version=*/false);
}
*cfd = tmp_cfd;
}
return s;
}
// TODO maybe cache the computation result
bool VersionEditHandler::HasMissingFiles() const {
bool ret = false;
for (const auto& elem : cf_to_missing_files_) {
const auto& missing_files = elem.second;
if (!missing_files.empty()) {
ret = true;
break;
}
}
if (!ret) {
for (const auto& elem : cf_to_missing_blob_files_high_) {
if (elem.second != kInvalidBlobFileNumber) {
ret = true;
break;
}
}
}
return ret;
}
void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
bool* cf_in_not_found,
bool* do_not_open_cf,
bool* cf_in_builders) const {
assert(cf_in_not_found != nullptr);
assert(do_not_open_cf != nullptr);
assert(cf_in_builders != nullptr);
// Not found means that user didn't supply that column
// family option AND we encountered column family add
// record. Once we encounter column family drop record,
// we will delete the column family from
// column_families_not_found.
// do_not_open_column_families_.
uint32_t cf_id = edit.GetColumnFamily();
bool in_not_found = column_families_not_found_.find(cf_id) !=
column_families_not_found_.end();
bool in_do_not_open = do_not_open_column_families_.find(cf_id) !=
do_not_open_column_families_.end();
// in builders means that user supplied that column family
// option AND that we encountered column family add record
bool in_builders = builders_.find(cf_id) != builders_.end();
// They cannot both be true
assert(!(in_not_found && in_builders));
*cf_in_not_found = in_not_found;
assert(!(in_do_not_open && in_builders));
*do_not_open_cf = in_do_not_open;
*cf_in_builders = in_builders;
}
@ -396,9 +373,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
// There were some column families in the MANIFEST that weren't specified
// in the argument. This is OK in read_only mode
if (s->ok() && MustOpenAllColumnFamilies() &&
!column_families_not_found_.empty()) {
!do_not_open_column_families_.empty()) {
std::string msg;
for (const auto& cf : column_families_not_found_) {
for (const auto& cf : do_not_open_column_families_) {
msg.append(", ");
msg.append(cf.second);
}
@ -453,7 +430,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
}
assert(cfd->initialized());
VersionEdit edit;
*s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
*s = MaybeCreateVersionBeforeApplyEdit(edit, cfd,
/*force_create_version=*/true);
if (!s->ok()) {
break;
}
@ -498,13 +476,9 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
assert(cfd != nullptr);
cfd->set_initialized();
assert(builders_.find(cf_id) == builders_.end());
builders_.emplace(cf_id,
VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
if (track_found_and_missing_files_) {
cf_to_found_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber);
}
builders_.emplace(cf_id, VersionBuilderUPtr(new BaseReferencedVersionBuilder(
cfd, this, track_found_and_missing_files_,
allow_incomplete_valid_version_)));
return cfd;
}
@ -514,21 +488,6 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
auto builder_iter = builders_.find(cf_id);
assert(builder_iter != builders_.end());
builders_.erase(builder_iter);
if (track_found_and_missing_files_) {
auto found_files_iter = cf_to_found_files_.find(cf_id);
assert(found_files_iter != cf_to_found_files_.end());
cf_to_found_files_.erase(found_files_iter);
auto missing_files_iter = cf_to_missing_files_.find(cf_id);
assert(missing_files_iter != cf_to_missing_files_.end());
cf_to_missing_files_.erase(missing_files_iter);
auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(cf_id);
assert(missing_blob_files_high_iter !=
cf_to_missing_blob_files_high_.end());
cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
}
ColumnFamilyData* ret =
version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id);
assert(ret != nullptr);
@ -538,15 +497,14 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
return ret;
}
Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
ColumnFamilyData* cfd,
bool force_create_version) {
Status VersionEditHandler::MaybeCreateVersionBeforeApplyEdit(
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
assert(cfd->initialized());
Status s;
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
auto* builder = builder_iter->second->version_builder();
if (force_create_version) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
auto* builder = builder_iter->second->version_builder();
auto* v = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
version_set_->current_version_number_++,
@ -562,6 +520,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
delete v;
}
}
s = builder->Apply(&edit);
return s;
}
@ -731,12 +690,13 @@ Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles(
VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options,
const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement)
: VersionEditHandler(read_only, column_families, version_set,
/*track_found_and_missing_files=*/true,
/*no_error_if_files_missing=*/true, io_tracer,
read_options, epoch_number_requirement) {}
read_options, allow_incomplete_valid_version,
epoch_number_requirement) {}
VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
for (const auto& cfid_and_version : atomic_update_versions_) {
@ -762,7 +722,8 @@ Status VersionEditHandlerPointInTime::OnAtomicGroupReplayBegin() {
assert(!cfd->IsDropped());
assert(cfd->initialized());
VersionEdit edit;
Status s = MaybeCreateVersion(edit, cfd, true /* force_create_version */);
Status s = MaybeCreateVersionBeforeApplyEdit(
edit, cfd, true /* force_create_version */);
if (!s.ok()) {
return s;
}
@ -824,17 +785,17 @@ void VersionEditHandlerPointInTime::CheckIterationResult(
}
assert(cfd->initialized());
auto v_iter = versions_.find(cfd->GetID());
auto builder_iter = builders_.find(cfd->GetID());
if (v_iter != versions_.end()) {
assert(v_iter->second != nullptr);
assert(builder_iter != builders_.end());
version_set_->AppendVersion(cfd, v_iter->second);
versions_.erase(v_iter);
// Let's clear found_files, since any files in that are part of the
// installed Version. Any files that got obsoleted would have already
// been moved to intermediate_files_
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
assert(found_files_iter != cf_to_found_files_.end());
found_files_iter->second.clear();
builder_iter->second->version_builder()->ClearFoundFiles();
}
}
} else {
@ -863,147 +824,50 @@ ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
return cfd;
}
Status VersionEditHandlerPointInTime::MaybeCreateVersion(
Status VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit(
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1");
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2");
TEST_SYNC_POINT(
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1");
TEST_SYNC_POINT(
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2");
assert(cfd != nullptr);
if (!force_create_version) {
assert(edit.GetColumnFamily() == cfd->GetID());
}
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
assert(found_files_iter != cf_to_found_files_.end());
std::unordered_set<uint64_t>& found_files = found_files_iter->second;
auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
assert(missing_files_iter != cf_to_missing_files_.end());
std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(cfd->GetID());
assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
const uint64_t prev_missing_blob_file_high =
missing_blob_files_high_iter->second;
VersionBuilder* builder = nullptr;
if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder != nullptr);
}
// At this point, we have not yet applied the new version edits read from the
// MANIFEST. We check whether we have any missing table and blob files.
const bool prev_has_missing_files =
!missing_files.empty() ||
(prev_missing_blob_file_high != kInvalidBlobFileNumber &&
prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
for (const auto& file : edit.GetDeletedFiles()) {
uint64_t file_num = file.second;
auto fiter = missing_files.find(file_num);
if (fiter != missing_files.end()) {
missing_files.erase(fiter);
} else {
fiter = found_files.find(file_num);
// Only mark new files added during this catchup attempt for deletion.
// These files were never installed in VersionStorageInfo.
// Already referenced files that are deleted by a VersionEdit will
// be added to the VersionStorageInfo's obsolete files when the old
// version is dereferenced.
if (fiter != found_files.end()) {
intermediate_files_.emplace_back(
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num));
found_files.erase(fiter);
}
}
}
assert(!cfd->ioptions()->cf_paths.empty());
Status s;
for (const auto& elem : edit.GetNewFiles()) {
int level = elem.first;
const FileMetaData& meta = elem.second;
const FileDescriptor& fd = meta.fd;
uint64_t file_num = fd.GetNumber();
const std::string fpath =
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
s = VerifyFile(cfd, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_files.insert(file_num);
if (s.IsCorruption()) {
found_files.insert(file_num);
}
s = Status::OK();
} else if (!s.ok()) {
break;
} else {
found_files.insert(file_num);
}
}
uint64_t missing_blob_file_num = prev_missing_blob_file_high;
for (const auto& elem : edit.GetBlobFileAdditions()) {
uint64_t file_num = elem.GetBlobFileNumber();
s = VerifyBlobFile(cfd, file_num, elem);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_blob_file_num = std::max(missing_blob_file_num, file_num);
s = Status::OK();
} else if (!s.ok()) {
break;
}
}
bool has_missing_blob_files = false;
if (missing_blob_file_num != kInvalidBlobFileNumber &&
missing_blob_file_num >= prev_missing_blob_file_high) {
missing_blob_files_high_iter->second = missing_blob_file_num;
has_missing_blob_files = true;
} else if (missing_blob_file_num < prev_missing_blob_file_high) {
assert(false);
}
// We still have not applied the new version edit, but have tried to add new
// table and blob files after verifying their presence and consistency.
// Therefore, we know whether we will see new missing table and blob files
// later after actually applying the version edit. We perform the check here
// and record the result.
const bool has_missing_files =
!missing_files.empty() || has_missing_blob_files;
bool missing_info = !version_edit_params_.HasLogNumber() ||
!version_edit_params_.HasNextFile() ||
!version_edit_params_.HasLastSequence();
// Create version before apply edit. The version will represent the state
// before applying the version edit.
Status s;
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
VersionBuilder* builder = builder_iter->second->version_builder();
const bool valid_pit_before_edit = builder->ValidVersionAvailable();
builder->CreateOrReplaceSavePoint();
s = builder->Apply(&edit);
const bool valid_pit_after_edit = builder->ValidVersionAvailable();
// A new version will be created if:
// 1) no error has occurred so far, and
// 2) log_number_, next_file_number_ and last_sequence_ are known, and
// 3) not in an AtomicGroup
// 4) any of the following:
// a) no missing file before, but will have missing file(s) after applying
// this version edit.
// b) no missing file after applying the version edit, and the caller
// explicitly request that a new version be created.
// a) a valid Version is available before applying the edit
// and a valid Version is not available after the edit.
// b) a valid Version is available after the edit and the
// caller explicitly request that a new version be created.
if (s.ok() && !missing_info && !in_atomic_group_ &&
((has_missing_files && !prev_has_missing_files) ||
(!has_missing_files && force_create_version))) {
if (!builder) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder);
}
((!valid_pit_after_edit && valid_pit_before_edit) ||
(valid_pit_after_edit && force_create_version))) {
const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions();
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cf_opts_ptr, io_tracer_,
version_set_->current_version_number_++,
epoch_number_requirement_);
s = builder->LoadTableHandlers(
s = builder->LoadSavePointTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr),
@ -1015,7 +879,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
}
return s;
}
s = builder->SaveTo(version->storage_info());
s = builder->SaveSavePointTo(version->storage_info());
if (s.ok()) {
if (AtomicUpdateVersionsContains(cfd->GetID())) {
AtomicUpdateVersionsPut(version);
@ -1038,6 +902,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
delete version;
}
}
builder->ClearSavePoint();
return s;
}
@ -1072,6 +938,15 @@ Status VersionEditHandlerPointInTime::LoadTables(
return Status::OK();
}
bool VersionEditHandlerPointInTime::HasMissingFiles() const {
for (const auto& builder : builders_) {
if (builder.second->version_builder()->HasMissingFiles()) {
return true;
}
}
return false;
}
bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() {
return atomic_update_versions_missing_ == 0;
}
@ -1145,8 +1020,9 @@ Status ManifestTailer::Initialize() {
Version* base_version = dummy_version->Next();
assert(base_version);
base_version->Ref();
VersionBuilderUPtr new_builder(
new BaseReferencedVersionBuilder(default_cfd, base_version));
VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
default_cfd, base_version, this, track_found_and_missing_files_,
allow_incomplete_valid_version_));
builder_iter->second = std::move(new_builder);
initialized_ = true;
@ -1189,8 +1065,8 @@ Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
Version* base_version = dummy_version->Next();
assert(base_version);
base_version->Ref();
VersionBuilderUPtr new_builder(
new BaseReferencedVersionBuilder(tmp_cfd, base_version));
VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
tmp_cfd, base_version, this, track_found_and_missing_files_));
builder_iter->second = std::move(new_builder);
#ifndef NDEBUG
@ -1213,6 +1089,18 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
}
}
std::vector<std::string> ManifestTailer::GetAndClearIntermediateFiles() {
std::vector<std::string> res;
for (const auto& builder : builders_) {
auto files =
builder.second->version_builder()->GetAndClearIntermediateFiles();
res.insert(res.end(), std::make_move_iterator(files.begin()),
std::make_move_iterator(files.end()));
files.erase(files.begin(), files.end());
}
return res;
}
Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath, int level,
const FileMetaData& fmeta) {

View File

@ -100,7 +100,9 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
// A class used for scanning MANIFEST file.
// VersionEditHandler reads a MANIFEST file, parses the version edits, and
// builds the version set's in-memory state, e.g. the version storage info for
// the versions of column families.
// the versions of column families. It replays all the version edits in one
// MANIFEST file to build the end version.
//
// To use this class and its subclasses,
// 1. Create an object of VersionEditHandler or its subclasses.
// VersionEditHandler handler(read_only, column_families, version_set,
@ -119,13 +121,14 @@ class VersionEditHandler : public VersionEditHandlerBase {
VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options,
const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent)
: VersionEditHandler(read_only, column_families, version_set,
track_found_and_missing_files,
no_error_if_files_missing, io_tracer, read_options,
/*skip_load_table_files=*/false,
allow_incomplete_valid_version,
epoch_number_requirement) {}
~VersionEditHandler() override {}
@ -134,14 +137,24 @@ class VersionEditHandler : public VersionEditHandlerBase {
return version_edit_params_;
}
bool HasMissingFiles() const;
void GetDbId(std::string* db_id) const {
if (db_id && version_edit_params_.HasDbId()) {
*db_id = version_edit_params_.GetDbId();
}
}
virtual Status VerifyFile(ColumnFamilyData* /*cfd*/,
const std::string& /*fpath*/, int /*level*/,
const FileMetaData& /*fmeta*/) {
return Status::OK();
}
virtual Status VerifyBlobFile(ColumnFamilyData* /*cfd*/,
uint64_t /*blob_file_num*/,
const BlobFileAddition& /*blob_addition*/) {
return Status::OK();
}
protected:
explicit VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
@ -149,6 +162,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files,
bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent);
@ -166,7 +180,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
Status Initialize() override;
void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
void CheckColumnFamilyId(const VersionEdit& edit, bool* do_not_open_cf,
bool* cf_in_builders) const;
void CheckIterationResult(const log::Reader& reader, Status* s) override;
@ -176,9 +190,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
virtual Status MaybeCreateVersion(const VersionEdit& edit,
ColumnFamilyData* cfd,
bool force_create_version);
virtual Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
ColumnFamilyData* cfd,
bool force_create_version);
virtual Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
@ -191,21 +205,23 @@ class VersionEditHandler : public VersionEditHandlerBase {
VersionSet* version_set_;
std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
// Keeps track of column families in manifest that were not found in
// column families parameters. if those column families are not dropped
// by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> column_families_not_found_;
VersionEditParams version_edit_params_;
const bool track_found_and_missing_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>> cf_to_found_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
cf_to_missing_files_;
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
// Keeps track of column families in manifest that were not found in
// column families parameters. Namely, the user asks to not open these column
// families. In non read only mode, if those column families are not dropped
// by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> do_not_open_column_families_;
VersionEditParams version_edit_params_;
bool no_error_if_files_missing_;
std::shared_ptr<IOTracer> io_tracer_;
bool skip_load_table_files_;
bool initialized_;
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
// If false, only a complete Version for which all files consisting it can be
// found is considered a valid Version. If true, besides complete Version, an
// incomplete Version with only a suffix of L0 files missing is also
// considered valid if the Version is never edited in an atomic group.
const bool allow_incomplete_valid_version_;
EpochNumberRequirement epoch_number_requirement_;
std::unordered_set<uint32_t> cfds_to_mark_no_udt_;
@ -226,8 +242,18 @@ class VersionEditHandler : public VersionEditHandlerBase {
};
// A class similar to its base class, i.e. VersionEditHandler.
// VersionEditHandlerPointInTime restores the versions to the most recent point
// in time such that at this point, the version does not have missing files.
// Unlike VersionEditHandler that only aims to build the end version, this class
// supports building the most recent point in time version. A point in time
// version is a version for which no files are missing, or if
// `allow_incomplete_valid_version` is true, only a suffix of L0 files (and
// their associated blob files) are missing.
//
// Building a point in time version when end version is not available can
// be useful for best efforts recovery (options.best_efforts_recovery), which
// uses this class and sets `allow_incomplete_valid_version` to true.
// It's also useful for secondary instances/follower instances for which end
// version could be transiently unavailable. These two cases use subclass
// `ManifestTailer` and sets `allow_incomplete_valid_version` to false.
//
// Not thread-safe, external synchronization is necessary if an object of
// VersionEditHandlerPointInTime is shared by multiple threads.
@ -236,28 +262,32 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options,
const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent);
~VersionEditHandlerPointInTime() override;
bool HasMissingFiles() const;
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& fmeta) override;
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition) override;
protected:
Status OnAtomicGroupReplayBegin() override;
Status OnAtomicGroupReplayEnd() override;
void CheckIterationResult(const log::Reader& reader, Status* s) override;
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
// `MaybeCreateVersion(..., false)` creates a version upon a negative edge
// trigger (transition from valid to invalid).
// `MaybeCreateVersionBeforeApplyEdit(..., false)` creates a version upon a
// negative edge trigger (transition from valid to invalid).
//
// `MaybeCreateVersion(..., true)` creates a version on a positive level
// trigger (state is valid).
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
bool force_create_version) override;
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition);
// `MaybeCreateVersionBeforeApplyEdit(..., true)` creates a version on a
// positive level trigger (state is valid).
Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
ColumnFamilyData* cfd,
bool force_create_version) override;
Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
@ -275,8 +305,6 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
bool in_atomic_group_ = false;
std::vector<std::string> intermediate_files_;
private:
bool AtomicUpdateVersionsCompleted();
bool AtomicUpdateVersionsContains(uint32_t cfid);
@ -292,6 +320,12 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
void AtomicUpdateVersionsApply();
};
// A class similar to `VersionEditHandlerPointInTime` that parse MANIFEST and
// builds point in time version.
// `ManifestTailer` supports reading one MANIFEST file in multiple tailing
// attempts and supports switching to a different MANIFEST after
// `PrepareToReadNewManifest` is called. This class is used by secondary and
// follower instance.
class ManifestTailer : public VersionEditHandlerPointInTime {
public:
explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
@ -302,9 +336,13 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
EpochNumberRequirement::kMustPresent)
: VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
version_set, io_tracer, read_options,
/*allow_incomplete_valid_version=*/false,
epoch_number_requirement),
mode_(Mode::kRecovery) {}
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
void PrepareToReadNewManifest() {
initialized_ = false;
ClearReadBuffer();
@ -314,9 +352,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
return cfds_changed_;
}
std::vector<std::string>& GetIntermediateFiles() {
return intermediate_files_;
}
std::vector<std::string> GetAndClearIntermediateFiles();
protected:
Status Initialize() override;
@ -329,9 +365,6 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
void CheckIterationResult(const log::Reader& reader, Status* s) override;
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
enum Mode : uint8_t {
kRecovery = 0,
kCatchUp = 1,
@ -352,7 +385,9 @@ class DumpManifestHandler : public VersionEditHandler {
/*read_only=*/true, column_families, version_set,
/*track_found_and_missing_files=*/false,
/*no_error_if_files_missing=*/false, io_tracer, read_options,
/*skip_load_table_files=*/true),
/*skip_load_table_files=*/true,
/*allow_incomplete_valid_version=*/false,
/*epoch_number_requirement=*/EpochNumberRequirement::kMustPresent),
verbose_(verbose),
hex_(hex),
json_(json),

View File

@ -5511,6 +5511,10 @@ Status VersionSet::ProcessManifestWrites(
std::unique_ptr<log::Writer> new_desc_log_ptr;
{
FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
// DB option (in file_options_) takes precedence when not kUnknown
if (file_options_.temperature != Temperature::kUnknown) {
opt_file_opts.temperature = file_options_.temperature;
}
mu->Unlock();
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
@ -5637,9 +5641,9 @@ Status VersionSet::ProcessManifestWrites(
assert(manifest_io_status.ok());
}
if (s.ok() && new_descriptor_log) {
io_s = SetCurrentFile(write_options, fs_.get(), dbname_,
pending_manifest_file_number_,
dir_contains_current_file);
io_s = SetCurrentFile(
write_options, fs_.get(), dbname_, pending_manifest_file_number_,
file_options_.temperature, dir_contains_current_file);
if (!io_s.ok()) {
s = io_s;
// Quarantine old manifest file in case new manifest file's CURRENT file
@ -6080,7 +6084,8 @@ Status VersionSet::Recover(
VersionEditHandler handler(
read_only, column_families, const_cast<VersionSet*>(this),
/*track_found_and_missing_files=*/false, no_error_if_files_missing,
io_tracer_, read_options, EpochNumberRequirement::kMightMissing);
io_tracer_, read_options, /*allow_incomplete_valid_version=*/false,
EpochNumberRequirement::kMightMissing);
handler.Iterate(reader, &log_read_status);
s = handler.status();
if (s.ok()) {
@ -6256,7 +6261,8 @@ Status VersionSet::TryRecoverFromOneManifest(
/*checksum=*/true, /*log_num=*/0);
VersionEditHandlerPointInTime handler_pit(
read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
read_options, EpochNumberRequirement::kMightMissing);
read_options, /*allow_incomplete_valid_version=*/true,
EpochNumberRequirement::kMightMissing);
handler_pit.Iterate(reader, &s);
@ -7477,7 +7483,7 @@ Status ReactiveVersionSet::ReadAndApply(
*cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
}
if (files_to_delete) {
*files_to_delete = std::move(manifest_tailer_->GetIntermediateFiles());
*files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles();
}
return s;

View File

@ -1277,6 +1277,15 @@ class VersionSet {
bool no_error_if_files_missing = false, bool is_retry = false,
Status* log_status = nullptr);
// Do a best-efforts recovery (Options.best_efforts_recovery=true) from all
// available MANIFEST files. Similar to `Recover` with these differences:
// 1) not only the latest MANIFEST can be used, if it's not available or
// no successful recovery can be achieved with it, this function also tries
// to recover from previous MANIFEST files, in reverse chronological order
// until a successful recovery can be achieved.
// 2) this function doesn't just aim to recover to the latest version, if that
// is not available, the most recent point in time version will be saved in
// memory. Check doc for `VersionEditHandlerPointInTime` for more details.
Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only,
const std::vector<std::string>& files_in_dbname,

View File

@ -11,6 +11,7 @@
#include <algorithm>
#include "db/blob/blob_log_writer.h"
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "db/log_writer.h"
@ -1345,18 +1346,27 @@ class VersionSetTestBase {
std::string key; // the only key
int level = 0;
uint64_t epoch_number;
bool file_missing = false;
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
SstInfo(uint64_t file_num, const std::string& cf_name,
const std::string& _key,
uint64_t _epoch_number = kUnknownEpochNumber)
: SstInfo(file_num, cf_name, _key, 0, _epoch_number) {}
uint64_t _epoch_number = kUnknownEpochNumber,
bool _file_missing = false,
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
: SstInfo(file_num, cf_name, _key, 0, _epoch_number, _file_missing,
_oldest_blob_file_number) {}
SstInfo(uint64_t file_num, const std::string& cf_name,
const std::string& _key, int lvl,
uint64_t _epoch_number = kUnknownEpochNumber)
uint64_t _epoch_number = kUnknownEpochNumber,
bool _file_missing = false,
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
: file_number(file_num),
column_family(cf_name),
key(_key),
level(lvl),
epoch_number(_epoch_number) {}
epoch_number(_epoch_number),
file_missing(_file_missing),
oldest_blob_file_number(_oldest_blob_file_number) {}
};
// Create dummy sst, return their metadata. Note that only file name and size
@ -1395,22 +1405,32 @@ class VersionSetTestBase {
ASSERT_NE(0, file_size);
file_metas->emplace_back(
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
Temperature::kUnknown, 0, 0, 0, info.epoch_number,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
0, 0, /* user_defined_timestamps_persisted */ true);
Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kNullUniqueId64x2, 0, 0,
/* user_defined_timestamps_persisted */ true);
if (info.file_missing) {
ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
}
}
}
void CreateCurrentFile() {
// Make "CURRENT" file point to the new manifest file.
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown,
/* dir_contains_current_file */ nullptr));
}
// Create DB with 3 column families.
void NewDB() {
SequenceNumber last_seqno;
std::unique_ptr<log::Writer> log_writer;
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
PrepareManifest(&column_families_, &last_seqno, &log_writer);
log_writer.reset();
// Make "CURRENT" file point to the new manifest file.
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
CreateCurrentFile();
EXPECT_OK(versions_->Recover(column_families_, false));
EXPECT_EQ(column_families_.size(),
@ -2586,7 +2606,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
edits_[i].MarkAtomicGroup(--remaining);
edits_[i].SetLastSequence(last_seqno_++);
}
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
CreateCurrentFile();
}
void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
@ -2598,7 +2618,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
edits_[i].MarkAtomicGroup(--remaining);
edits_[i].SetLastSequence(last_seqno_++);
}
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
CreateCurrentFile();
}
void SetupCorruptedAtomicGroup(int atomic_group_size) {
@ -2612,7 +2632,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
}
edits_[i].SetLastSequence(last_seqno_++);
}
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
CreateCurrentFile();
}
void SetupIncorrectAtomicGroup(int atomic_group_size) {
@ -2628,7 +2648,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
}
edits_[i].SetLastSequence(last_seqno_++);
}
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
CreateCurrentFile();
}
void SetupTestSyncPoints() {
@ -3394,8 +3414,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
SequenceNumber last_seqno;
std::unique_ptr<log::Writer> log_writer;
PrepareManifest(&column_families, &last_seqno, &log_writer);
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
CreateCurrentFile();
EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
EXPECT_EQ(column_families.size(),
@ -3417,7 +3436,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
cfd_to_drop->Ref();
drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
mutex_.Lock();
s = versions_->LogAndApply(
Status s = versions_->LogAndApply(
cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options,
write_options, &drop_cf_edit, &mutex_, nullptr);
mutex_.Unlock();
@ -3527,9 +3546,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase,
TEST_F(EmptyDefaultCfNewManifest, Recover) {
PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
std::vector<ColumnFamilyDescriptor> column_families;
@ -3538,7 +3555,7 @@ TEST_F(EmptyDefaultCfNewManifest, Recover) {
cf_options_);
std::string db_id;
bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families, false, &db_id, &has_missing_table_file);
ASSERT_OK(s);
ASSERT_FALSE(has_missing_table_file);
@ -3559,7 +3576,8 @@ class VersionSetTestEmptyDb
assert(nullptr != log_writer);
VersionEdit new_db;
if (db_options_.write_dbid_to_manifest) {
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
Temperature::kUnknown));
DBOptions tmp_db_options;
tmp_db_options.env = env_;
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@ -3592,9 +3610,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
@ -3609,9 +3625,9 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
std::string db_id;
bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
read_only, &db_id,
&has_missing_table_file);
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families, read_only, &db_id,
&has_missing_table_file);
auto iter =
std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
if (iter == cf_names.end()) {
@ -3637,9 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
ASSERT_OK(s);
}
log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
@ -3685,9 +3699,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
ASSERT_OK(s);
}
log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
@ -3744,9 +3756,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
ASSERT_OK(s);
}
log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
@ -3802,9 +3812,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
ASSERT_OK(s);
}
log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
@ -3869,8 +3877,9 @@ INSTANTIATE_TEST_CASE_P(
class VersionSetTestMissingFiles : public VersionSetTestBase,
public testing::Test {
public:
VersionSetTestMissingFiles()
: VersionSetTestBase("version_set_test_missing_files"),
explicit VersionSetTestMissingFiles(
const std::string& test_name = "version_set_test_missing_files")
: VersionSetTestBase(test_name),
internal_comparator_(
std::make_shared<InternalKeyComparator>(options_.comparator)) {}
@ -3947,7 +3956,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
// This method updates last_sequence_.
void WriteFileAdditionAndDeletionToManifest(
uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
const std::vector<std::pair<int, uint64_t>>& deleted_files) {
const std::vector<std::pair<int, uint64_t>>& deleted_files,
const std::vector<BlobFileAddition>& blob_files = {}) {
VersionEdit edit;
edit.SetColumnFamily(cf);
for (const auto& elem : added_files) {
@ -3958,6 +3968,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
int level = elem.first;
edit.DeleteFile(level, elem.second);
}
for (const auto& elem : blob_files) {
edit.AddBlobFile(elem);
}
edit.SetLastSequence(last_seqno_);
++last_seqno_;
assert(log_writer_.get() != nullptr);
@ -4006,15 +4019,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
std::string db_id;
bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
/*read_only=*/false, &db_id,
&has_missing_table_file);
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families_,
/*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s);
ASSERT_TRUE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4064,15 +4076,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
std::string db_id;
bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
/*read_only=*/false, &db_id,
&has_missing_table_file);
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families_,
/*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s);
ASSERT_TRUE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4118,15 +4129,14 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
std::string db_id;
bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
/*read_only=*/false, &db_id,
&has_missing_table_file);
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families_,
/*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s);
ASSERT_FALSE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4171,6 +4181,250 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
}
}
// Test fixture for best-efforts recovery (TryRecoverFromOneManifest) when the
// MANIFEST references table and/or blob files that are missing on disk. The
// helpers below build a MANIFEST describing a mix of present and absent
// files, run recovery, and let each test assert which subset of files the
// recovered (possibly incomplete) version keeps.
class BestEffortsRecoverIncompleteVersionTest
    : public VersionSetTestMissingFiles {
 public:
  BestEffortsRecoverIncompleteVersionTest()
      : VersionSetTestMissingFiles("best_efforts_recover_incomplete_version") {}

  // Describes one blob file used by a test: its file number, whether the
  // physical file should be absent on disk, and the single key/blob record
  // the file holds when it is written.
  struct BlobInfo {
    uint64_t file_number;
    bool file_missing;
    std::string key;
    std::string blob;
    BlobInfo(uint64_t _file_number, bool _file_missing, std::string _key,
             std::string _blob)
        : file_number(_file_number),
          file_missing(_file_missing),
          key(_key),
          blob(_blob) {}
  };

  // For each entry in `infos`, writes a physical blob file unless
  // `file_missing` is set, and always appends a matching BlobFileAddition to
  // `blob_metas` — so the MANIFEST references the blob file whether or not it
  // exists on disk. Checksum method/value are left empty (not verified here).
  void CreateDummyBlobFiles(const std::vector<BlobInfo>& infos,
                            std::vector<BlobFileAddition>* blob_metas) {
    for (const auto& info : infos) {
      if (!info.file_missing) {
        WriteDummyBlobFile(info.file_number, info.key, info.blob);
      }
      blob_metas->emplace_back(
          info.file_number, 1 /*total_blob_count*/,
          info.key.size() + info.blob.size() /*total_blob_bytes*/,
          "" /*checksum_method*/, "" /*checksum_value*/);
    }
  }

  // Creates a test blob file that is valid so it can pass the
  // `VersionEditHandlerPointInTime::VerifyBlobFile` check.
  // Writes a well-formed blob log: header, then one key/blob record, then the
  // footer — the order BlobLogWriter requires.
  void WriteDummyBlobFile(uint64_t blob_file_number, const Slice& key,
                          const Slice& blob) {
    // Default-constructed options are only needed for their clock.
    ImmutableOptions options;
    std::string blob_file_path = BlobFileName(dbname_, blob_file_number);
    std::unique_ptr<FSWritableFile> file;
    ASSERT_OK(
        fs_->NewWritableFile(blob_file_path, FileOptions(), &file, nullptr));
    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
        std::move(file), blob_file_path, FileOptions(), options.clock));
    BlobLogWriter blob_log_writer(std::move(file_writer), options.clock,
                                  /*statistics*/ nullptr, blob_file_number,
                                  /*use_fsync*/ true,
                                  /*do_flush*/ false);
    constexpr ExpirationRange expiration_range;
    BlobLogHeader header(/*column_family_id*/ 0, kNoCompression,
                         /*has_ttl*/ false, expiration_range);
    ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
    // NOTE(review): `compressed_blob` is never used below — candidate for
    // removal.
    std::string compressed_blob;
    uint64_t key_offset = 0;
    uint64_t blob_offset = 0;
    ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset,
                                        &blob_offset));
    BlobLogFooter footer;
    footer.blob_count = 1;
    footer.expiration_range = expiration_range;
    std::string checksum_method;
    std::string checksum_value;
    ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer,
                                           &checksum_method, &checksum_value));
  }

  // Writes the given table/blob additions into a fresh MANIFEST, points
  // CURRENT at it, then runs best-efforts recovery. Asserts recovery itself
  // succeeds while reporting that at least one table file is missing; callers
  // then inspect the recovered version for which files survived.
  void RecoverFromManifestWithMissingFiles(
      const std::vector<std::pair<int, FileMetaData>>& added_files,
      const std::vector<BlobFileAddition>& blob_files) {
    PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
    WriteFileAdditionAndDeletionToManifest(
        /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>(),
        blob_files);
    log_writer_.reset();
    CreateCurrentFile();
    std::string manifest_path;
    VerifyManifest(&manifest_path);
    std::string db_id;
    bool has_missing_table_file = false;
    Status s = versions_->TryRecoverFromOneManifest(
        manifest_path, column_families_,
        /*read_only=*/false, &db_id, &has_missing_table_file);
    ASSERT_OK(s);
    ASSERT_TRUE(has_missing_table_file);
  }
};
// A table file missing on a non-zero level: after best-efforts recovery the
// version keeps no live table files at all.
TEST_F(BestEffortsRecoverIncompleteVersionTest, NonL0MissingFiles) {
  std::vector<SstInfo> sst_files = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, true /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, false /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(sst_files, &file_metas);
  // Pair each file's metadata with its target level for the MANIFEST.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  files_to_add.reserve(file_metas.size());
  for (size_t idx = 0; idx < file_metas.size(); ++idx) {
    files_to_add.emplace_back(sst_files[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add,
                                      std::vector<BlobFileAddition>());
  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_TRUE(live_table_files.empty());
}
// An L0 file (epoch 101) is missing while a newer L0 file (epoch 102) is
// present: the recovered version ends up with no live table files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingNonSuffixL0Files) {
  std::vector<SstInfo> sst_files = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, true /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, false /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(sst_files, &file_metas);
  // Pair each file's metadata with its target level for the MANIFEST.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  files_to_add.reserve(file_metas.size());
  for (size_t idx = 0; idx < file_metas.size(); ++idx) {
    files_to_add.emplace_back(sst_files[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add,
                                      std::vector<BlobFileAddition>());
  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_TRUE(live_table_files.empty());
}
// Both SSTs are present but every blob file they reference is missing: the
// recovered version keeps no live table files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingBlobFiles) {
  std::vector<SstInfo> sst_files = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
              100 /* epoch_number */, false /* file_missing */,
              102 /*oldest_blob_file_number*/),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */,
              103 /*oldest_blob_file_number*/),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(sst_files, &file_metas);
  std::vector<BlobInfo> blob_files = {
      BlobInfo(102, true /*file_missing*/, "a", "blob1"),
      BlobInfo(103, true /*file_missing*/, "a", "blob2"),
  };
  // Neither blob file is written to disk, but both get MANIFEST entries.
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blob_files, &blob_additions);
  // Pair each file's metadata with its target level for the MANIFEST.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  files_to_add.reserve(file_metas.size());
  for (size_t idx = 0; idx < file_metas.size(); ++idx) {
    files_to_add.emplace_back(sst_files[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add, blob_additions);
  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_TRUE(live_table_files.empty());
}
// Only the newest L0 file (epoch 102) is missing: recovery keeps the two
// surviving files, one on L0 and one on L1.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingL0SuffixOnly) {
  std::vector<SstInfo> sst_files = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, true /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(sst_files, &file_metas);
  // Pair each file's metadata with its target level for the MANIFEST.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  files_to_add.reserve(file_metas.size());
  for (size_t idx = 0; idx < file_metas.size(); ++idx) {
    files_to_add.emplace_back(sst_files[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add,
                                      std::vector<BlobFileAddition>());
  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_EQ(2, live_table_files.size());
  ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
  VersionStorageInfo* storage = default_cfd->current()->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
}
// The newest L0 file (epoch 102) and the blob file only it references (104)
// are both missing: recovery keeps the two surviving SSTs (one per level)
// plus the one blob file still referenced by a surviving SST.
TEST_F(BestEffortsRecoverIncompleteVersionTest,
       MissingL0SuffixAndTheirBlobFiles) {
  std::vector<SstInfo> sst_files = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */,
              103 /*oldest_blob_file_number*/),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, true /* file_missing */,
              104 /*oldest_blob_file_number*/),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(sst_files, &file_metas);
  std::vector<BlobInfo> blob_files = {
      BlobInfo(103, false /*file_missing*/, "a", "blob1"),
      BlobInfo(104, true /*file_missing*/, "a", "blob2"),
  };
  // Blob 103 exists on disk; blob 104 only gets a MANIFEST entry.
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blob_files, &blob_additions);
  // Pair each file's metadata with its target level for the MANIFEST.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  files_to_add.reserve(file_metas.size());
  for (size_t idx = 0; idx < file_metas.size(); ++idx) {
    files_to_add.emplace_back(sst_files[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add, blob_additions);
  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_EQ(2, live_table_files.size());
  ASSERT_EQ(1, live_blob_files.size());
  ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
  VersionStorageInfo* storage = default_cfd->current()->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
  ASSERT_EQ(1, storage->GetBlobFiles().size());
}
class ChargeFileMetadataTest : public DBTestBase {
public:
ChargeFileMetadataTest()

View File

@ -929,15 +929,19 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
}
if (0 == ts_sz) {
return WriteBatchInternal::Put(this, cf_id, key, value);
s = WriteBatchInternal::Put(this, cf_id, key, value);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
}
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
@ -962,7 +966,7 @@ Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts, const Slice& value) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) {
return s;
}
@ -970,8 +974,12 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
assert(column_family);
uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
}
Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
@ -1039,7 +1047,11 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
}
if (ts_sz == 0) {
return WriteBatchInternal::Put(this, cf_id, key, value);
s = WriteBatchInternal::Put(this, cf_id, key, value);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
return Status::InvalidArgument(
@ -1246,20 +1258,24 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
}
if (0 == ts_sz) {
return WriteBatchInternal::Delete(this, cf_id, key);
s = WriteBatchInternal::Delete(this, cf_id, key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
}
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) {
return s;
}
@ -1267,8 +1283,12 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
has_key_with_ts_ = true;
uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
s = WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
}
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
@ -1313,7 +1333,11 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::Delete(this, cf_id, key);
s = WriteBatchInternal::Delete(this, cf_id, key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
return Status::InvalidArgument(
@ -1361,20 +1385,24 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::SingleDelete(this, cf_id, key);
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
}
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
const Slice& key, const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) {
return s;
}
@ -1382,8 +1410,12 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
assert(column_family);
uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
s = WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
}
Status WriteBatchInternal::SingleDelete(WriteBatch* b,
@ -1430,7 +1462,11 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::SingleDelete(this, cf_id, key);
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
return Status::InvalidArgument(
@ -1480,23 +1516,27 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
s = WriteBatchInternal::DeleteRange(this, cf_id,
SliceParts(begin_key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
}
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
return WriteBatchInternal::DeleteRange(
this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
const Slice& begin_key, const Slice& end_key,
const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) {
return s;
}
@ -1505,9 +1545,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{begin_key, ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
return WriteBatchInternal::DeleteRange(this, cf_id,
SliceParts(key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
s = WriteBatchInternal::DeleteRange(this, cf_id,
SliceParts(key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
}
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
@ -1554,7 +1598,11 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
return Status::InvalidArgument(
@ -1608,21 +1656,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
}
if (0 == ts_sz) {
return WriteBatchInternal::Merge(this, cf_id, key, value);
s = WriteBatchInternal::Merge(this, cf_id, key, value);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Merge(
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
}
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Merge(
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts, const Slice& value) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) {
return s;
}
@ -1630,8 +1682,12 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
assert(column_family);
uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Merge(
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
}
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
@ -1680,7 +1736,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
}
if (0 == ts_sz) {
return WriteBatchInternal::Merge(this, cf_id, key, value);
s = WriteBatchInternal::Merge(this, cf_id, key, value);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
}
return Status::InvalidArgument(

View File

@ -125,7 +125,6 @@ DECLARE_int32(level0_stop_writes_trigger);
DECLARE_int32(block_size);
DECLARE_int32(format_version);
DECLARE_int32(index_block_restart_interval);
DECLARE_bool(disable_auto_compactions);
DECLARE_int32(max_background_compactions);
DECLARE_int32(num_bottom_pri_threads);
DECLARE_int32(compaction_thread_pool_adjust_interval);
@ -151,6 +150,7 @@ DECLARE_bool(charge_filter_construction);
DECLARE_bool(charge_table_reader);
DECLARE_bool(charge_file_metadata);
DECLARE_bool(charge_blob_cache);
DECLARE_bool(decouple_partitioned_filters);
DECLARE_int32(top_level_index_pinning);
DECLARE_int32(partition_pinning);
DECLARE_int32(unpartitioned_pinning);
@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
DECLARE_string(last_level_temperature);
DECLARE_string(default_write_temperature);
DECLARE_string(default_temperature);
DECLARE_bool(paranoid_memory_checks);
// Options for transaction dbs.
// Use TransactionDB (a.k.a. Pessimistic Transaction DB)
@ -318,7 +319,6 @@ DECLARE_int32(prepopulate_blob_cache);
DECLARE_int32(approximate_size_one_in);
DECLARE_bool(best_efforts_recovery);
DECLARE_bool(skip_verifydb);
DECLARE_bool(enable_compaction_filter);
DECLARE_bool(paranoid_file_checks);
DECLARE_bool(fail_if_options_file_error);
DECLARE_uint64(batch_protection_bytes_per_key);

View File

@ -49,7 +49,7 @@ class DbStressCompactionFilter : public CompactionFilter {
return Decision::kKeep;
}
// Reaching here means we acquired the lock.
key_mutex->AssertHeld();
bool key_exists = state_->Exists(cf_id_, key_num);
const bool allow_overwrite = state_->AllowsOverwrite(key_num);

View File

@ -167,7 +167,10 @@ bool RunStressTestImpl(SharedState* shared) {
{FileType::kWalFile});
}
}
now = clock->NowMicros();
if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
Status s = stress->EnableAutoCompaction();
assert(s.ok());
}
fprintf(stdout, "%s Starting database operations\n",
clock->TimeToString(now / 1000000).c_str());

View File

@ -380,6 +380,11 @@ DEFINE_bool(charge_blob_cache, false,
"CacheEntryRoleOptions::charged of "
"kBlobCache");
DEFINE_bool(
decouple_partitioned_filters,
ROCKSDB_NAMESPACE::BlockBasedTableOptions().decouple_partitioned_filters,
"Decouple filter partitioning from index partitioning.");
DEFINE_int32(
top_level_index_pinning,
static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
@ -1443,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
"obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
"normal max");
DEFINE_bool(paranoid_memory_checks,
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
"Sets CF option paranoid_memory_checks.");
#endif // GFLAGS

View File

@ -45,6 +45,8 @@ DECLARE_int32(open_write_fault_one_in);
DECLARE_int32(open_read_fault_one_in);
DECLARE_int32(inject_error_severity);
DECLARE_bool(disable_auto_compactions);
DECLARE_bool(enable_compaction_filter);
namespace ROCKSDB_NAMESPACE {
class StressTest;
@ -262,14 +264,10 @@ class SharedState {
// This is useful for crash-recovery testing when the process may crash
// before updating the corresponding expected value
//
// It can fail and `*prepared` will be set to false if the previous write or
// delete is still in pending state (e.g, still in recovery for retryable IO
// errors). If succeeds,`*prepared` will be set to true
//
// Requires external locking covering `key` in `cf` to prevent
// concurrent write or delete to the same `key`.
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
return expected_state_manager_->PreparePut(cf, key, prepared);
PendingExpectedValue PreparePut(int cf, int64_t key) {
return expected_state_manager_->PreparePut(cf, key);
}
// Does not requires external locking.
@ -281,31 +279,24 @@ class SharedState {
// This is useful for crash-recovery testing when the process may crash
// before updating the corresponding expected value
//
// It can fail and `*prepared` will be set to false if the previous write or
// delete is still in pending state (e.g, still in recovery for retryable IO
// errors). If succeeds,`*prepared` will be set to true
//
// Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`.
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
return expected_state_manager_->PrepareDelete(cf, key, prepared);
PendingExpectedValue PrepareDelete(int cf, int64_t key) {
return expected_state_manager_->PrepareDelete(cf, key);
}
// Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`.
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
bool* prepared) {
return expected_state_manager_->PrepareSingleDelete(cf, key, prepared);
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
return expected_state_manager_->PrepareSingleDelete(cf, key);
}
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
// to prevent concurrent write or delete to the same `key`.
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key,
int64_t end_key,
bool* prepared) {
return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key,
prepared);
int64_t end_key) {
return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key);
}
bool AllowsOverwrite(int64_t key) const {

View File

@ -632,10 +632,8 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
for (auto cfh : column_families_) {
for (int64_t k = 0; k != number_of_keys; ++k) {
const std::string key = Key(k);
bool prepare = false;
PendingExpectedValue pending_expected_value =
shared->PreparePut(cf_idx, k, &prepare);
assert(prepare);
shared->PreparePut(cf_idx, k);
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
const size_t sz = GenerateValue(value_base, value, sizeof(value));
@ -3676,7 +3674,7 @@ void StressTest::Reopen(ThreadState* thread) {
// crash-recovery verification does. Therefore it always expects no data loss
// and we should ensure no data loss in testing.
// TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below
if (!FLAGS_disable_wal && !FLAGS_avoid_flush_during_shutdown) {
if (!FLAGS_disable_wal && FLAGS_avoid_flush_during_shutdown) {
Status s;
if (FLAGS_manual_wal_flush_one_in > 0) {
s = db_->FlushWAL(/*sync=*/true);
@ -3834,6 +3832,10 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) {
FLAGS_persist_user_defined_timestamps;
}
// Returns true when auto compactions should start out disabled until DB
// verification is done: the compaction filter is enabled and the user did
// not already disable auto compactions via the flag.
bool ShouldDisableAutoCompactionsBeforeVerifyDb() {
  return FLAGS_enable_compaction_filter && !FLAGS_disable_auto_compactions;
}
bool InitializeOptionsFromFile(Options& options) {
DBOptions db_options;
ConfigOptions config_options;
@ -3861,6 +3863,8 @@ void InitializeOptionsFromFlags(
const std::shared_ptr<const FilterPolicy>& filter_policy,
Options& options) {
BlockBasedTableOptions block_based_options;
block_based_options.decouple_partitioned_filters =
FLAGS_decouple_partitioned_filters;
block_based_options.block_cache = cache;
block_based_options.cache_index_and_filter_blocks =
FLAGS_cache_index_and_filter_blocks;
@ -3947,7 +3951,11 @@ void InitializeOptionsFromFlags(
new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache));
}
options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
options.disable_auto_compactions = true;
} else {
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
}
options.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style =
@ -4047,6 +4055,7 @@ void InitializeOptionsFromFlags(
options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;
@ -4262,6 +4271,7 @@ void InitializeOptionsGeneral(
options.disable_auto_compactions = true;
}
options.table_properties_collector_factories.clear();
options.table_properties_collector_factories.emplace_back(
std::make_shared<DbStressTablePropertiesCollectorFactory>());

View File

@ -48,7 +48,11 @@ class StressTest {
return FLAGS_sync_fault_injection || FLAGS_disable_wal ||
FLAGS_manual_wal_flush_one_in > 0;
}
// Turns automatic compactions back on for all column families. Must only be
// called when the DB was opened with auto compactions disabled.
Status EnableAutoCompaction() {
  assert(options_.disable_auto_compactions);
  return db_->EnableAutoCompaction(column_families_);
}
void CleanUp();
protected:
@ -64,6 +68,42 @@ class StressTest {
}
}
// Records the very first failing write: its status, whether the WAL write
// may still have succeeded, and when the wait for recovery started. Later
// failures (or successes) leave the recorded state untouched.
void UpdateIfInitialWriteFails(Env* db_stress_env, const Status& write_s,
                               Status* initial_write_s,
                               bool* initial_wal_write_may_succeed,
                               uint64_t* wait_for_recover_start_time) {
  assert(db_stress_env && initial_write_s && initial_wal_write_may_succeed &&
         wait_for_recover_start_time);
  if (write_s.ok() || !initial_write_s->ok()) {
    // Either this write succeeded or an earlier failure was already
    // recorded; only the first failing write is captured.
    return;
  }
  *initial_write_s = write_s;
  *initial_wal_write_may_succeed =
      !FaultInjectionTestFS::IsFailedToWriteToWALError(*initial_write_s);
  *wait_for_recover_start_time = db_stress_env->NowMicros();
}
// If this thread stalled waiting for recovery from an injected, retryable
// write error (and the WAL write may have gone through), prints how long the
// wait lasted — but only when it exceeded 10 seconds.
void PrintWriteRecoveryWaitTimeIfNeeded(Env* db_stress_env,
                                        const Status& initial_write_s,
                                        bool initial_wal_write_may_succeed,
                                        uint64_t wait_for_recover_start_time,
                                        const std::string& thread_name) {
  assert(db_stress_env);
  const bool waited_for_recovery =
      !initial_write_s.ok() && IsErrorInjectedAndRetryable(initial_write_s) &&
      initial_wal_write_may_succeed;
  if (!waited_for_recovery) {
    return;
  }
  const uint64_t elapsed_sec =
      (db_stress_env->NowMicros() - wait_for_recover_start_time) / 1000000;
  if (elapsed_sec > 10) {
    fprintf(stdout,
            "%s thread slept to wait for write recovery for "
            "%" PRIu64 " seconds\n",
            thread_name.c_str(), elapsed_sec);
  }
}
void GetDeleteRangeKeyLocks(
ThreadState* thread, int rand_column_family, int64_t rand_key,
std::vector<std::unique_ptr<MutexLock>>* range_locks) {
@ -411,5 +451,6 @@ void InitializeOptionsGeneral(
// user-defined timestamp.
void CheckAndSetOptionsForUserTimestamp(Options& options);
bool ShouldDisableAutoCompactionsBeforeVerifyDb();
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS

View File

@ -32,41 +32,29 @@ void ExpectedState::Precommit(int cf, int64_t key, const ExpectedValue& value) {
std::atomic_thread_fence(std::memory_order_release);
}
PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key,
bool* prepared) {
assert(prepared);
PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key) {
ExpectedValue expected_value = Load(cf, key);
// Calculate the original expected value
const ExpectedValue orig_expected_value = expected_value;
// Calculate the pending expected value
bool res = expected_value.Put(true /* pending */);
if (!res) {
PendingExpectedValue ret = PendingExpectedValue(
&Value(cf, key), orig_expected_value, orig_expected_value);
*prepared = false;
return ret;
}
expected_value.Put(true /* pending */);
const ExpectedValue pending_expected_value = expected_value;
// Calculate the final expected value
res = expected_value.Put(false /* pending */);
assert(res);
expected_value.Put(false /* pending */);
const ExpectedValue final_expected_value = expected_value;
// Precommit
Precommit(cf, key, pending_expected_value);
*prepared = true;
return PendingExpectedValue(&Value(cf, key), orig_expected_value,
final_expected_value);
}
ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); }
PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
bool* prepared) {
assert(prepared);
PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key) {
ExpectedValue expected_value = Load(cf, key);
// Calculate the original expected value
@ -77,47 +65,32 @@ PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
if (!res) {
PendingExpectedValue ret = PendingExpectedValue(
&Value(cf, key), orig_expected_value, orig_expected_value);
*prepared = false;
return ret;
}
const ExpectedValue pending_expected_value = expected_value;
// Calculate the final expected value
res = expected_value.Delete(false /* pending */);
assert(res);
expected_value.Delete(false /* pending */);
const ExpectedValue final_expected_value = expected_value;
// Precommit
Precommit(cf, key, pending_expected_value);
*prepared = true;
return PendingExpectedValue(&Value(cf, key), orig_expected_value,
final_expected_value);
}
PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key,
bool* prepared) {
return PrepareDelete(cf, key, prepared);
PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key) {
return PrepareDelete(cf, key);
}
std::vector<PendingExpectedValue> ExpectedState::PrepareDeleteRange(
int cf, int64_t begin_key, int64_t end_key, bool* prepared) {
int cf, int64_t begin_key, int64_t end_key) {
std::vector<PendingExpectedValue> pending_expected_values;
bool has_prepared_failed = false;
for (int64_t key = begin_key; key < end_key; ++key) {
bool each_prepared = false;
PendingExpectedValue pending_expected_value =
PrepareDelete(cf, key, &each_prepared);
if (each_prepared) {
pending_expected_values.push_back(pending_expected_value);
} else {
has_prepared_failed = true;
pending_expected_value.PermitUnclosedPendingState();
break;
}
pending_expected_values.push_back(PrepareDelete(cf, key));
}
*prepared = !has_prepared_failed;
return pending_expected_values;
}
@ -759,8 +732,31 @@ Status FileExpectedStateManager::Restore(DB* db) {
s = Env::Default()->DeleteFile(state_file_path);
}
if (s.ok()) {
saved_seqno_ = kMaxSequenceNumber;
s = Env::Default()->DeleteFile(trace_file_path);
std::vector<std::string> expected_state_dir_children;
s = Env::Default()->GetChildren(expected_state_dir_path_,
&expected_state_dir_children);
if (s.ok()) {
for (size_t i = 0; i < expected_state_dir_children.size(); ++i) {
const auto& filename = expected_state_dir_children[i];
if (filename.size() >= kTraceFilenameSuffix.size() &&
filename.rfind(kTraceFilenameSuffix) ==
filename.size() - kTraceFilenameSuffix.size()) {
SequenceNumber found_seqno = ParseUint64(filename.substr(
0, filename.size() - kTraceFilenameSuffix.size()));
// Delete older trace files, but keep the one we just replayed for
// debugging purposes
if (found_seqno < saved_seqno_) {
s = Env::Default()->DeleteFile(GetPathForFilename(filename));
}
}
if (!s.ok()) {
break;
}
}
}
if (s.ok()) {
saved_seqno_ = kMaxSequenceNumber;
}
}
return s;
}

View File

@ -44,7 +44,7 @@ class ExpectedState {
//
// Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`.
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared);
PendingExpectedValue PreparePut(int cf, int64_t key);
// Does not requires external locking.
ExpectedValue Get(int cf, int64_t key);
@ -55,18 +55,17 @@ class ExpectedState {
//
// Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`.
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared);
PendingExpectedValue PrepareDelete(int cf, int64_t key);
// Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`.
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, bool* prepared);
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key);
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
// to prevent concurrent write or delete to the same `key`.
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key,
int64_t end_key,
bool* prepared);
int64_t end_key);
// Update the expected value for start of an incomplete write or delete
// operation on the key associated with this expected value
@ -197,30 +196,28 @@ class ExpectedStateManager {
void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
// See ExpectedState::PreparePut()
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
return latest_->PreparePut(cf, key, prepared);
PendingExpectedValue PreparePut(int cf, int64_t key) {
return latest_->PreparePut(cf, key);
}
// See ExpectedState::Get()
ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); }
// See ExpectedState::PrepareDelete()
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
return latest_->PrepareDelete(cf, key, prepared);
PendingExpectedValue PrepareDelete(int cf, int64_t key) {
return latest_->PrepareDelete(cf, key);
}
// See ExpectedState::PrepareSingleDelete()
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
bool* prepared) {
return latest_->PrepareSingleDelete(cf, key, prepared);
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
return latest_->PrepareSingleDelete(cf, key);
}
// See ExpectedState::PrepareDeleteRange()
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key,
int64_t end_key,
bool* prepared) {
return latest_->PrepareDeleteRange(cf, begin_key, end_key, prepared);
int64_t end_key) {
return latest_->PrepareDeleteRange(cf, begin_key, end_key);
}
// See ExpectedState::Exists()

View File

@ -10,11 +10,7 @@
#include <atomic>
namespace ROCKSDB_NAMESPACE {
bool ExpectedValue::Put(bool pending) {
if (pending && (PendingWrite() || PendingDelete())) {
return false;
}
void ExpectedValue::Put(bool pending) {
if (pending) {
SetPendingWrite();
} else {
@ -22,15 +18,10 @@ bool ExpectedValue::Put(bool pending) {
ClearDeleted();
ClearPendingWrite();
}
return true;
}
bool ExpectedValue::Delete(bool pending) {
if (pending && (PendingWrite() || PendingDelete())) {
return false;
}
if (!Exists()) {
if (pending && !Exists()) {
return false;
}
if (pending) {

View File

@ -37,11 +37,14 @@ class ExpectedValue {
explicit ExpectedValue(uint32_t expected_value)
: expected_value_(expected_value) {}
bool Exists() const { return PendingWrite() || !IsDeleted(); }
bool Exists() const {
assert(!PendingWrite() && !PendingDelete());
return !IsDeleted();
}
uint32_t Read() const { return expected_value_; }
bool Put(bool pending);
void Put(bool pending);
bool Delete(bool pending);

View File

@ -1619,28 +1619,21 @@ class NonBatchedOpsStressTest : public StressTest {
// write
bool initial_wal_write_may_succeed = true;
bool prepared = false;
PendingExpectedValue pending_expected_value =
shared->PreparePut(rand_column_family, rand_key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
shared->PreparePut(rand_column_family, rand_key);
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
const size_t sz = GenerateValue(value_base, value, sizeof(value));
const Slice v(value, sz);
uint64_t wait_for_recover_start_time = 0;
do {
// In order to commit the expected state for the initial write failed with
// injected retryable error and successful WAL write, retry the write
// until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
}
if (FLAGS_use_put_entity_one_in > 0 &&
(value_base % FLAGS_use_put_entity_one_in) == 0) {
@ -1691,13 +1684,10 @@ class NonBatchedOpsStressTest : public StressTest {
});
}
}
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
// first write fails
if (!s.ok() && initial_write_s.ok()) {
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
&initial_wal_write_may_succeed,
&wait_for_recover_start_time);
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed);
@ -1719,6 +1709,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate();
}
} else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestPut");
pending_expected_value.Commit();
thread->stats.AddBytesForWrites(1, sz);
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
@ -1756,25 +1749,18 @@ class NonBatchedOpsStressTest : public StressTest {
// Use delete if the key may be overwritten and a single deletion
// otherwise.
if (shared->AllowsOverwrite(rand_key)) {
bool prepared = false;
PendingExpectedValue pending_expected_value =
shared->PrepareDelete(rand_column_family, rand_key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
shared->PrepareDelete(rand_column_family, rand_key);
uint64_t wait_for_recover_start_time = 0;
do {
// In order to commit the expected state for the initial write failed
// with injected retryable error and successful WAL write, retry the
// write until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for(
std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
}
if (!FLAGS_use_txn) {
if (FLAGS_user_timestamp_size == 0) {
@ -1787,13 +1773,9 @@ class NonBatchedOpsStressTest : public StressTest {
return txn.Delete(cfh, key);
});
}
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when
// the first write fails
if (!s.ok() && initial_write_s.ok()) {
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
&initial_wal_write_may_succeed,
&wait_for_recover_start_time);
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed);
@ -1816,29 +1798,25 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate();
}
} else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDelete");
pending_expected_value.Commit();
thread->stats.AddDeletes(1);
}
} else {
bool prepared = false;
PendingExpectedValue pending_expected_value =
shared->PrepareSingleDelete(rand_column_family, rand_key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
shared->PrepareSingleDelete(rand_column_family, rand_key);
uint64_t wait_for_recover_start_time = 0;
do {
// In order to commit the expected state for the initial write failed
// with injected retryable error and successful WAL write, retry the
// write until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for(
std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
}
if (!FLAGS_use_txn) {
if (FLAGS_user_timestamp_size == 0) {
@ -1851,13 +1829,9 @@ class NonBatchedOpsStressTest : public StressTest {
return txn.SingleDelete(cfh, key);
});
}
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when
// the first write fails
if (!s.ok() && initial_write_s.ok()) {
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
&initial_wal_write_may_succeed,
&wait_for_recover_start_time);
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed);
@ -1880,6 +1854,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate();
}
} else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDelete");
pending_expected_value.Commit();
thread->stats.AddSingleDeletes(1);
}
@ -1914,18 +1891,9 @@ class NonBatchedOpsStressTest : public StressTest {
// write
bool initial_wal_write_may_succeed = true;
bool prepared = false;
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
rand_key + FLAGS_range_deletion_width);
const int covered = static_cast<int>(pending_expected_values.size());
std::string keystr = Key(rand_key);
@ -1935,6 +1903,7 @@ class NonBatchedOpsStressTest : public StressTest {
Slice end_key = end_keystr;
std::string write_ts_str;
Slice write_ts;
uint64_t wait_for_recover_start_time = 0;
do {
// In order to commit the expected state for the initial write failed with
@ -1942,10 +1911,7 @@ class NonBatchedOpsStressTest : public StressTest {
// until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) {
range_locks.clear();
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key,
&range_locks);
}
if (FLAGS_user_timestamp_size) {
write_ts_str = GetNowNanos();
@ -1954,13 +1920,9 @@ class NonBatchedOpsStressTest : public StressTest {
} else {
s = db_->DeleteRange(write_opts, cfh, key, end_key);
}
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
// first write fails
if (!s.ok() && initial_write_s.ok()) {
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
&initial_wal_write_may_succeed,
&wait_for_recover_start_time);
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed);
@ -1985,6 +1947,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate();
}
} else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDeleteRange");
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.Commit();
@ -2057,16 +2022,8 @@ class NonBatchedOpsStressTest : public StressTest {
}
keys.push_back(key);
bool prepared = false;
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
shared->PreparePut(column_family, key);
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
values.push_back(value_base);
@ -2630,6 +2587,8 @@ class NonBatchedOpsStressTest : public StressTest {
// Value doesn't exist in db, update state to reflect that
shared->SyncDelete(cf, key);
return true;
} else {
assert(false);
}
}
char expected_value_data[kValueMaxLen];
@ -2728,7 +2687,11 @@ class NonBatchedOpsStressTest : public StressTest {
SharedState* const shared = thread->shared;
assert(shared);
if (!shared->AllowsOverwrite(key) && shared->Exists(column_family, key)) {
const ExpectedValue expected_value =
thread->shared->Get(column_family, key);
bool may_exist = !ExpectedValueHelper::MustHaveNotExisted(expected_value,
expected_value);
if (!shared->AllowsOverwrite(key) && may_exist) {
// Just do read your write checks for keys that allow overwrites.
return;
}

6
env/file_system.cc vendored
View File

@ -181,10 +181,10 @@ FileOptions FileSystem::OptimizeForBlobFileRead(
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
const std::string& fname, bool should_sync,
const IOOptions& io_options) {
const IOOptions& io_options,
const FileOptions& file_options) {
std::unique_ptr<FSWritableFile> file;
EnvOptions soptions;
IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
IOStatus s = fs->NewWritableFile(fname, file_options, &file, nullptr);
if (!s.ok()) {
return s;
}

View File

@ -31,6 +31,7 @@ DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
total_trash_size_(0),
rate_bytes_per_sec_(rate_bytes_per_sec),
pending_files_(0),
next_trash_bucket_(0),
bytes_max_delete_chunk_(bytes_max_delete_chunk),
closing_(false),
cv_(&mu_),
@ -66,10 +67,8 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) {
// Rate limiting is disabled or trash size makes up more than
// max_trash_db_ratio_ (default 25%) of the total DB size
TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
Status s = DeleteFileImmediately(file_path, /*accounted=*/true);
if (s.ok()) {
s = sst_file_manager_->OnDeleteFile(file_path);
ROCKS_LOG_INFO(info_log_,
"Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
", total_trash_size %" PRIu64 ", total_size %" PRIi64
@ -77,15 +76,57 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
file_path.c_str(), rate_bytes_per_sec_.load(),
total_trash_size_.load(), total_size,
max_trash_db_ratio_.load());
InstrumentedMutexLock l(&mu_);
RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
}
return s;
}
return AddFileToDeletionQueue(file_path, dir_to_sync, /*bucket=*/std::nullopt,
/*accounted=*/true);
}
// Deletes a file that is NOT tracked by the SstFileManager. The file is
// removed immediately when slow deletion is disabled, or when it has extra
// hard links and background deletion is not forced; otherwise it is moved to
// trash and scheduled for background deletion, optionally under `bucket`.
Status DeleteScheduler::DeleteUnaccountedFile(const std::string& file_path,
                                              const std::string& dir_to_sync,
                                              const bool force_bg,
                                              std::optional<int32_t> bucket) {
  uint64_t hard_link_count = 1;
  fs_->NumFileLinks(file_path, IOOptions(), &hard_link_count, nullptr)
      .PermitUncheckedError();
  // We can tolerate rare races where we might immediately delete both links
  // to a file.
  const bool slow_deletion_disabled = rate_bytes_per_sec_.load() <= 0;
  const bool foreground_multi_link_delete = !force_bg && hard_link_count > 1;
  if (!slow_deletion_disabled && !foreground_multi_link_delete) {
    // Hand the file off to the background trash-emptying thread.
    return AddFileToDeletionQueue(file_path, dir_to_sync, bucket,
                                  /*accounted=*/false);
  }
  Status immediate_status =
      DeleteFileImmediately(file_path, /*accounted=*/false);
  if (immediate_status.ok()) {
    ROCKS_LOG_INFO(info_log_,
                   "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64,
                   file_path.c_str(), rate_bytes_per_sec_.load());
  }
  return immediate_status;
}
// Deletes `file_path` right away (no trash directory). On success, notifies
// the SstFileManager when `accounted` is set and bumps the
// FILES_DELETED_IMMEDIATELY ticker under the scheduler mutex.
Status DeleteScheduler::DeleteFileImmediately(const std::string& file_path,
                                              bool accounted) {
  TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
  TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteFile::cb",
                           const_cast<std::string*>(&file_path));
  Status delete_status = fs_->DeleteFile(file_path, IOOptions(), nullptr);
  if (!delete_status.ok()) {
    return delete_status;
  }
  delete_status = OnDeleteFile(file_path, accounted);
  {
    // Stats recording is guarded by the scheduler mutex.
    InstrumentedMutexLock lock(&mu_);
    RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
  }
  return delete_status;
}
Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
const std::string& dir_to_sync,
std::optional<int32_t> bucket,
bool accounted) {
// Move file to trash
std::string trash_file;
Status s = MarkAsTrash(file_path, &trash_file);
Status s = MarkAsTrash(file_path, accounted, &trash_file);
ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
s.ToString().c_str());
@ -94,7 +135,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
file_path.c_str(), s.ToString().c_str());
s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
if (s.ok()) {
s = sst_file_manager_->OnDeleteFile(file_path);
s = OnDeleteFile(file_path, accounted);
ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
trash_file.c_str());
InstrumentedMutexLock l(&mu_);
@ -104,11 +145,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
}
// Update the total trash size
uint64_t trash_file_size = 0;
IOStatus io_s =
fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
if (io_s.ok()) {
total_trash_size_.fetch_add(trash_file_size);
if (accounted) {
uint64_t trash_file_size = 0;
IOStatus io_s =
fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
if (io_s.ok()) {
total_trash_size_.fetch_add(trash_file_size);
}
}
//**TODO: What should we do if we failed to
// get the file size?
@ -117,8 +160,15 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
{
InstrumentedMutexLock l(&mu_);
RecordTick(stats_.get(), FILES_MARKED_TRASH);
queue_.emplace(trash_file, dir_to_sync);
queue_.emplace(trash_file, dir_to_sync, accounted, bucket);
pending_files_++;
if (bucket.has_value()) {
auto iter = pending_files_in_buckets_.find(bucket.value());
assert(iter != pending_files_in_buckets_.end());
if (iter != pending_files_in_buckets_.end()) {
iter->second++;
}
}
if (pending_files_ == 1) {
cv_.SignalAll();
}
@ -177,7 +227,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
}
Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
std::string* trash_file) {
bool accounted, std::string* trash_file) {
// Sanity check of the path
size_t idx = file_path.rfind('/');
if (idx == std::string::npos || idx == file_path.size() - 1) {
@ -211,7 +261,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
}
cnt++;
}
if (s.ok()) {
if (s.ok() && accounted) {
s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
}
return s;
@ -235,6 +285,8 @@ void DeleteScheduler::BackgroundEmptyTrash() {
uint64_t total_deleted_bytes = 0;
int64_t current_delete_rate = rate_bytes_per_sec_.load();
while (!queue_.empty() && !closing_) {
// Satisfy static analysis.
std::optional<int32_t> bucket = std::nullopt;
if (current_delete_rate != rate_bytes_per_sec_.load()) {
// User changed the delete rate
current_delete_rate = rate_bytes_per_sec_.load();
@ -247,14 +299,17 @@ void DeleteScheduler::BackgroundEmptyTrash() {
// Get new file to delete
const FileAndDir& fad = queue_.front();
std::string path_in_trash = fad.fname;
std::string dir_to_sync = fad.dir;
bool accounted = fad.accounted;
bucket = fad.bucket;
// We don't need to hold the lock while deleting the file
mu_.Unlock();
uint64_t deleted_bytes = 0;
bool is_complete = true;
// Delete file from trash and update total_penalty value
Status s =
DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
Status s = DeleteTrashFile(path_in_trash, dir_to_sync, accounted,
&deleted_bytes, &is_complete);
total_deleted_bytes += deleted_bytes;
mu_.Lock();
if (is_complete) {
@ -288,12 +343,20 @@ void DeleteScheduler::BackgroundEmptyTrash() {
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
&total_penalty);
int32_t pending_files_in_bucket = std::numeric_limits<int32_t>::max();
if (is_complete) {
pending_files_--;
if (bucket.has_value()) {
auto iter = pending_files_in_buckets_.find(bucket.value());
assert(iter != pending_files_in_buckets_.end());
if (iter != pending_files_in_buckets_.end()) {
pending_files_in_bucket = iter->second--;
}
}
}
if (pending_files_ == 0) {
// Unblock WaitForEmptyTrash since there are no more files waiting
// to be deleted
if (pending_files_ == 0 || pending_files_in_bucket == 0) {
// Unblock WaitForEmptyTrash or WaitForEmptyTrashBucket since there are
// no more files waiting to be deleted
cv_.SignalAll();
}
}
@ -302,12 +365,14 @@ void DeleteScheduler::BackgroundEmptyTrash() {
Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
const std::string& dir_to_sync,
uint64_t* deleted_bytes,
bool accounted, uint64_t* deleted_bytes,
bool* is_complete) {
uint64_t file_size;
Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
*is_complete = true;
TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteTrashFile::cb",
const_cast<std::string*>(&path_in_trash));
if (s.ok()) {
bool need_full_delete = true;
if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
@ -374,7 +439,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
}
if (s.ok()) {
*deleted_bytes = file_size;
s = sst_file_manager_->OnDeleteFile(path_in_trash);
s = OnDeleteFile(path_in_trash, accounted);
}
}
}
@ -384,12 +449,24 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
path_in_trash.c_str(), s.ToString().c_str());
*deleted_bytes = 0;
} else {
total_trash_size_.fetch_sub(*deleted_bytes);
if (accounted) {
total_trash_size_.fetch_sub(*deleted_bytes);
}
}
return s;
}
// Post-deletion hook: accounted files are reported to the SstFileManager so
// it can update its size tracking; unaccounted files only fire a test sync
// point and always succeed.
Status DeleteScheduler::OnDeleteFile(const std::string& file_path,
                                     bool accounted) {
  if (!accounted) {
    TEST_SYNC_POINT_CALLBACK("DeleteScheduler::OnDeleteFile",
                             const_cast<std::string*>(&file_path));
    return Status::OK();
  }
  return sst_file_manager_->OnDeleteFile(file_path);
}
void DeleteScheduler::WaitForEmptyTrash() {
InstrumentedMutexLock l(&mu_);
while (pending_files_ > 0 && !closing_) {
@ -397,6 +474,30 @@ void DeleteScheduler::WaitForEmptyTrash() {
}
}
// Allocates a new trash bucket number for grouping background deletions.
// Returns std::nullopt when slow (rate-limited) deletion is disabled, since
// no background trash processing happens in that case.
std::optional<int32_t> DeleteScheduler::NewTrashBucket() {
  const bool slow_deletion_enabled = rate_bytes_per_sec_.load() > 0;
  if (!slow_deletion_enabled) {
    return std::nullopt;
  }
  InstrumentedMutexLock lock(&mu_);
  const int32_t new_bucket = next_trash_bucket_++;
  // Start the bucket with zero pending files.
  pending_files_in_buckets_.emplace(new_bucket, 0);
  return new_bucket;
}
// Blocks until every file scheduled under `bucket` has been deleted by the
// background thread, or until the scheduler is closing. The bucket's entry is
// removed from the tracking map before returning.
void DeleteScheduler::WaitForEmptyTrashBucket(int32_t bucket) {
  InstrumentedMutexLock lock(&mu_);
  // A bucket number we never handed out has nothing to wait for.
  if (bucket >= next_trash_bucket_) {
    return;
  }
  // Re-find after each wakeup: the map entry may change while we sleep.
  for (auto it = pending_files_in_buckets_.find(bucket);
       it != pending_files_in_buckets_.end() && it->second > 0 && !closing_;
       it = pending_files_in_buckets_.find(bucket)) {
    cv_.Wait();
  }
  pending_files_in_buckets_.erase(bucket);
}
void DeleteScheduler::MaybeCreateBackgroundThread() {
if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
bg_thread_.reset(

View File

@ -7,6 +7,7 @@
#include <map>
#include <optional>
#include <queue>
#include <string>
#include <thread>
@ -48,16 +49,45 @@ class DeleteScheduler {
MaybeCreateBackgroundThread();
}
// Mark file as trash directory and schedule its deletion. If force_bg is
// set, it forces the file to always be deleted in the background thread,
// except when rate limiting is disabled
// Delete an accounted file that is tracked by `SstFileManager` and should be
// tracked by this `DeleteScheduler` when it's deleted.
// The file is deleted immediately if slow deletion is disabled. If force_bg
// is not set and trash to db size ratio exceeded the configured threshold,
// it is immediately deleted too. In all other cases, the file will be moved
// to a trash directory and scheduled for deletion by a background thread.
Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
const bool force_bg = false);
// Wait for all files being deleteing in the background to finish or for
// Delete an unaccounted file that is not tracked by `SstFileManager` and
// should not be tracked by this `DeleteScheduler` when it's deleted.
// The file is deleted immediately if slow deletion is disabled. If force_bg
// is not set and the file has more than 1 hard link, it is immediately
// deleted too. In all other cases, the file will be moved to a trash
// directory and scheduled for deletion by a background thread.
// This API also supports assigning a file to a specified bucket created by
// `NewTrashBucket` when delete files in the background. So the caller can
// wait for a specific bucket to be empty by checking the
// `WaitForEmptyTrashBucket` API.
Status DeleteUnaccountedFile(const std::string& file_path,
const std::string& dir_to_sync,
const bool force_bg = false,
std::optional<int32_t> bucket = std::nullopt);
// Wait for all files being deleted in the background to finish or for
// destructor to be called.
void WaitForEmptyTrash();
// Creates a new trash bucket. A bucket is only created and returned when slow
// deletion is enabled.
// For each bucket that is created, the user should also call
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure the
// trash files are all cleared.
std::optional<int32_t> NewTrashBucket();
// Wait for all the files in the specified bucket to be deleted in the
// background or for the destructor to be called.
void WaitForEmptyTrashBucket(int32_t bucket);
// Return a map containing errors that happened in BackgroundEmptyTrash
// file_path => error status
std::map<std::string, Status> GetBackgroundErrors();
@ -87,12 +117,21 @@ class DeleteScheduler {
}
private:
Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
Status DeleteFileImmediately(const std::string& file_path, bool accounted);
Status AddFileToDeletionQueue(const std::string& file_path,
const std::string& dir_to_sync,
std::optional<int32_t> bucket, bool accounted);
Status MarkAsTrash(const std::string& file_path, bool accounted,
std::string* path_in_trash);
Status DeleteTrashFile(const std::string& path_in_trash,
const std::string& dir_to_sync,
const std::string& dir_to_sync, bool accounted,
uint64_t* deleted_bytes, bool* is_complete);
Status OnDeleteFile(const std::string& file_path, bool accounted);
void BackgroundEmptyTrash();
void MaybeCreateBackgroundThread();
@ -104,19 +143,28 @@ class DeleteScheduler {
std::atomic<uint64_t> total_trash_size_;
// Maximum number of bytes that should be deleted per second
std::atomic<int64_t> rate_bytes_per_sec_;
// Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
// Mutex to protect queue_, pending_files_, next_trash_bucket_,
// pending_files_in_buckets_, bg_errors_, closing_, stats_
InstrumentedMutex mu_;
struct FileAndDir {
FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
FileAndDir(const std::string& _fname, const std::string& _dir,
bool _accounted, std::optional<int32_t> _bucket)
: fname(_fname), dir(_dir), accounted(_accounted), bucket(_bucket) {}
std::string fname;
std::string dir; // empty will be skipped.
bool accounted;
std::optional<int32_t> bucket;
};
// Queue of trash files that need to be deleted
std::queue<FileAndDir> queue_;
// Number of trash files that are waiting to be deleted
int32_t pending_files_;
// Next trash bucket that can be created
int32_t next_trash_bucket_;
// A mapping from trash bucket to number of pending files in the bucket
std::map<int32_t, int32_t> pending_files_in_buckets_;
uint64_t bytes_max_delete_chunk_;
// Errors that happened in BackgroundEmptyTrash (file_path => error)
std::map<std::string, Status> bg_errors_;
@ -127,6 +175,7 @@ class DeleteScheduler {
// Condition variable signaled in these conditions
// - pending_files_ value change from 0 => 1
// - pending_files_ value change from 1 => 0
// - a value in pending_files_in_buckets change from 1 => 0
// - closing_ value is set to true
InstrumentedCondVar cv_;
// Background thread running BackgroundEmptyTrash
@ -138,6 +187,10 @@ class DeleteScheduler {
// If the trash size constitutes for more than this fraction of the total DB
// size we will start deleting new files passed to DeleteScheduler
// immediately
// Unaccounted files passed for deletion will not cause change in
// total_trash_size_ or affect the DeleteScheduler::total_trash_size_ over
// SstFileManager::total_size_ ratio. Their slow deletion is not subject to
// this configured threshold either.
std::atomic<double> max_trash_db_ratio_;
static const uint64_t kMicrosInSecond = 1000 * 1000LL;
std::shared_ptr<Statistics> stats_;

View File

@ -78,7 +78,7 @@ class DeleteSchedulerTest : public testing::Test {
}
std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
size_t dummy_files_dirs_idx = 0) {
size_t dummy_files_dirs_idx = 0, bool track = true) {
std::string file_path =
dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
std::unique_ptr<WritableFile> f;
@ -86,7 +86,9 @@ class DeleteSchedulerTest : public testing::Test {
std::string data(size, 'A');
EXPECT_OK(f->Append(data));
EXPECT_OK(f->Close());
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
if (track) {
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
}
return file_path;
}
@ -353,6 +355,8 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
ASSERT_EQ(num_files,
stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
ASSERT_FALSE(delete_scheduler_->NewTrashBucket().has_value());
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
@ -718,6 +722,141 @@ TEST_F(DeleteSchedulerTest, IsTrashCheck) {
ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
}
TEST_F(DeleteSchedulerTest, DeleteAccountedAndUnaccountedFiles) {
  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
  NewDeleteScheduler();

  // Lay down 100 dummy files of 1 KB each. None of them are registered with
  // the SstFileManager at creation time (track=false).
  constexpr int kNumFiles = 100;
  constexpr uint64_t kFileSize = 1024;  // 1 KB per file
  std::vector<std::string> files;
  files.reserve(kNumFiles);
  for (int idx = 0; idx < kNumFiles; idx++) {
    files.push_back(NewDummyFile("file" + std::to_string(idx) + ".data",
                                 kFileSize,
                                 /*dummy_files_dirs_idx*/ 0,
                                 /*track=*/false));
  }

  // Odd-indexed files are first registered with the SstFileManager and then
  // deleted through the accounted path; even-indexed files go through the
  // unaccounted path.
  for (int idx = 0; idx < kNumFiles; idx++) {
    if (idx % 2 != 0) {
      ASSERT_OK(sst_file_mgr_->OnAddFile(files[idx], kFileSize));
      ASSERT_OK(delete_scheduler_->DeleteFile(files[idx], ""));
    } else {
      ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(files[idx], ""));
    }
  }
  delete_scheduler_->WaitForEmptyTrash();

  // After the trash is drained, nothing is left in the scheduler and the
  // SstFileManager no longer tracks any bytes.
  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, sst_file_mgr_->GetTotalSize());
}
// Schedules unaccounted deletions from 10 threads, each into its own trash
// bucket, and verifies that every bucket drains, that all deletions go through
// the background (slow-deletion) path, and that bucket ids are handed out
// sequentially.
TEST_F(DeleteSchedulerTest, ConcurrentlyDeleteUnaccountedFilesInBuckets) {
  // Counters bumped by sync-point callbacks for background vs. foreground
  // (immediate) deletions.
  int bg_delete_file = 0;
  int fg_delete_file = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteTrashFile:DeleteFile",
      [&](void* /*arg*/) { bg_delete_file++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
  NewDeleteScheduler();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // Create 1000 files, every file is 1 KB. They are not tracked by the
  // SstFileManager (track=false).
  int num_files = 1000;
  uint64_t file_size = 1024;  // 1 KB as a file size
  std::vector<std::string> generated_files;
  for (int i = 0; i < num_files; i++) {
    std::string file_name = "file" + std::to_string(i) + ".data";
    generated_files.push_back(NewDummyFile(file_name, file_size,
                                           /*dummy_files_dirs_idx*/ 0,
                                           /*track=*/false));
  }
  // Concurrently delete files in different buckets and check all the buckets
  // are empty.
  int thread_cnt = 10;
  int files_per_thread = 100;
  std::atomic<int> thread_num(0);
  std::vector<port::Thread> threads;
  // Each thread claims its own bucket and a disjoint 100-file slice of
  // generated_files, schedules those files for deletion, then waits for its
  // bucket to drain.
  std::function<void()> delete_thread = [&]() {
    std::optional<int32_t> bucket = delete_scheduler_->NewTrashBucket();
    ASSERT_TRUE(bucket.has_value());
    int idx = thread_num.fetch_add(1);
    int range_start = idx * files_per_thread;
    int range_end = range_start + files_per_thread;
    for (int j = range_start; j < range_end; j++) {
      ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(
          generated_files[j], "", /*force_bg=*/false, bucket));
    }
    delete_scheduler_->WaitForEmptyTrashBucket(bucket.value());
  };
  for (int i = 0; i < thread_cnt; i++) {
    threads.emplace_back(delete_thread);
  }
  for (size_t i = 0; i < threads.size(); i++) {
    threads[i].join();
  }
  // All 1000 files were rate-limit deleted in the background; none were
  // deleted immediately in the foreground.
  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
  ASSERT_EQ(1000, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
  ASSERT_EQ(0, fg_delete_file);
  ASSERT_EQ(1000, bg_delete_file);
  // It is OK to re-check an already-empty bucket.
  delete_scheduler_->WaitForEmptyTrashBucket(9);
  // Waiting on a bucket that was never created (invalid id) also returns.
  delete_scheduler_->WaitForEmptyTrashBucket(100);
  // Buckets 0-9 were consumed by the threads, so the next id handed out is 10.
  std::optional<int32_t> next_bucket = delete_scheduler_->NewTrashBucket();
  ASSERT_TRUE(next_bucket.has_value());
  ASSERT_EQ(10, next_bucket.value());
  delete_scheduler_->WaitForEmptyTrashBucket(10);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
// Verifies that unaccounted files with more than one remaining hard link are
// deleted immediately in the foreground (not rate-limited through the trash)
// when force_bg is false.
TEST_F(DeleteSchedulerTest,
       ImmediatelyDeleteUnaccountedFilesWithRemainingLinks) {
  // Counters bumped by sync-point callbacks for background vs. foreground
  // (immediate) deletions.
  int bg_delete_file = 0;
  int fg_delete_file = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteTrashFile:DeleteFile",
      [&](void* /*arg*/) { bg_delete_file++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
  NewDeleteScheduler();
  std::string file1 = NewDummyFile("data_1", 500 * 1024,
                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);
  std::string file2 = NewDummyFile("data_2", 100 * 1024,
                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);
  // Give each file a second hard link so a remaining link exists at deletion
  // time.
  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
  // Without the extra hard links these files would be deleted slowly in
  // multiple rate-limited batches; with the links present they are expected
  // to be deleted immediately instead.
  ASSERT_OK(
      delete_scheduler_->DeleteUnaccountedFile(file1, "", /*force_bg=*/false));
  ASSERT_OK(
      delete_scheduler_->DeleteUnaccountedFile(file2, "", /*force_bg=*/false));
  delete_scheduler_->WaitForEmptyTrash();
  // Both deletions took the immediate foreground path: nothing was marked as
  // trash and no background deletion happened.
  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, bg_delete_file);
  ASSERT_EQ(2, fg_delete_file);
  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
  ASSERT_EQ(2, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
  // Fix: this previously called EnableProcessing() again, leaking the
  // sync-point callbacks above into subsequently run tests. Disable
  // processing at the end of the test like the sibling tests do.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -125,8 +125,8 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination,
Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& dir_to_sync,
const bool force_bg, const bool force_fg) {
SstFileManagerImpl* sfm =
static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
db_options->sst_file_manager.get());
if (sfm && !force_fg) {
return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
} else {
@ -134,6 +134,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
}
}
// Delete a DB file whose size is not tracked by the SstFileManager. When an
// SstFileManager is installed and the caller did not demand an immediate
// foreground deletion (force_fg), the deletion is routed through its
// DeleteScheduler (optionally into the trash bucket `bucket`); otherwise the
// file is unlinked directly via the Env.
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
                               const std::string& fname,
                               const std::string& dir_to_sync,
                               const bool force_bg, const bool force_fg,
                               std::optional<int32_t> bucket) {
  auto* sfm = static_cast_with_check<SstFileManagerImpl>(
      db_options->sst_file_manager.get());
  if (sfm == nullptr || force_fg) {
    // No manager to schedule through, or the caller insists on a foreground
    // delete: remove the file right away.
    return db_options->env->DeleteFile(fname);
  }
  return sfm->ScheduleUnaccountedFileDeletion(fname, dir_to_sync, force_bg,
                                              bucket);
}
// requested_checksum_func_name brings the function name of the checksum
// generator in checksum_factory. Empty string is permitted, in which case the
// name of the generator created by the factory is unchecked. When

View File

@ -55,6 +55,16 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& path_to_sync,
const bool force_bg, const bool force_fg);
// Delete an unaccounted DB file that is not tracked by SstFileManager and will
// not be tracked by its DeleteScheduler when getting deleted.
// If a legitimate bucket is provided and this file is scheduled for slow
// deletion, it will be assigned to the specified trash bucket.
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
const std::string& fname,
const std::string& dir_to_sync,
const bool force_bg, const bool force_fg,
std::optional<int32_t> bucket);
// TODO(hx235): pass the whole DBOptions instead of its individual fields
IOStatus GenerateOneFileChecksum(
FileSystem* fs, const std::string& file_path,

View File

@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
const std::string& dbname, uint64_t descriptor_number,
Temperature temp,
FSDirectory* dir_contains_current_file) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
std::string tmp = TempFileName(dbname, descriptor_number);
IOOptions opts;
IOStatus s = PrepareIOFromWriteOptions(write_options, opts);
FileOptions file_opts;
file_opts.temperature = temp;
if (s.ok()) {
s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts);
s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts,
file_opts);
}
TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
if (s.ok()) {
@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
}
Status SetIdentityFile(const WriteOptions& write_options, Env* env,
const std::string& dbname, const std::string& db_id) {
const std::string& dbname, Temperature temp,
const std::string& db_id) {
std::string id;
if (db_id.empty()) {
id = env->GenerateUniqueId();
@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env,
Status s;
IOOptions opts;
s = PrepareIOFromWriteOptions(write_options, opts);
FileOptions file_opts;
file_opts.temperature = temp;
if (s.ok()) {
s = WriteStringToFile(env, id, tmp, true, &opts);
s = WriteStringToFile(env->GetFileSystem().get(), id, tmp,
/*should_sync=*/true, opts, file_opts);
}
if (s.ok()) {
s = env->RenameFile(tmp, identify_file_name);

View File

@ -161,11 +161,12 @@ bool ParseFileName(const std::string& filename, uint64_t* number,
// when
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
const std::string& dbname, uint64_t descriptor_number,
Temperature temp,
FSDirectory* dir_contains_current_file);
// Make the IDENTITY file for the db
Status SetIdentityFile(const WriteOptions& write_options, Env* env,
const std::string& dbname,
const std::string& dbname, Temperature temp,
const std::string& db_id = {});
// Sync manifest file `file`.

View File

@ -421,10 +421,28 @@ Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
}
// Schedule deletion of a file whose size is not tracked by this
// SstFileManager. Forwards directly to the DeleteScheduler's unaccounted
// path, optionally assigning the file to trash bucket `bucket`.
Status SstFileManagerImpl::ScheduleUnaccountedFileDeletion(
    const std::string& file_path, const std::string& dir_to_sync,
    const bool force_bg, std::optional<int32_t> bucket) {
  // Test hook that exposes the path about to be scheduled. The const_cast is
  // needed only because the sync-point callback takes a void*.
  TEST_SYNC_POINT_CALLBACK(
      "SstFileManagerImpl::ScheduleUnaccountedFileDeletion",
      const_cast<std::string*>(&file_path));
  return delete_scheduler_.DeleteUnaccountedFile(file_path, dir_to_sync,
                                                 force_bg, bucket);
}
// Block until the DeleteScheduler has finished deleting all files in the
// background, or its destructor is called.
void SstFileManagerImpl::WaitForEmptyTrash() {
  delete_scheduler_.WaitForEmptyTrash();
}
// Create a new trash bucket for grouping scheduled deletions. Delegates to
// the DeleteScheduler; returns std::nullopt when slow deletion is disabled.
std::optional<int32_t> SstFileManagerImpl::NewTrashBucket() {
  return delete_scheduler_.NewTrashBucket();
}
// Block until every file scheduled into `bucket` has been deleted in the
// background, or the destructor is called.
void SstFileManagerImpl::WaitForEmptyTrashBucket(int32_t bucket) {
  delete_scheduler_.WaitForEmptyTrashBucket(bucket);
}
void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
uint64_t file_size) {
auto tracked_file = tracked_files_.find(file_path);

View File

@ -5,7 +5,7 @@
#pragma once
#include <optional>
#include <string>
#include "db/compaction/compaction.h"
@ -118,17 +118,40 @@ class SstFileManagerImpl : public SstFileManager {
// not guaranteed
bool CancelErrorRecovery(ErrorHandler* db);
// Mark file as trash and schedule it's deletion. If force_bg is set, it
// Mark a file as trash and schedule its deletion. If force_bg is set, it
// forces the file to be deleted in the background regardless of DB size,
// except when rate limited delete is disabled
// except when rate limited delete is disabled.
virtual Status ScheduleFileDeletion(const std::string& file_path,
const std::string& dir_to_sync,
const bool force_bg = false);
// Wait for all files being deleteing in the background to finish or for
// Delete an unaccounted file. The file is deleted immediately if slow
// deletion is disabled. A file with more than 1 hard link will be deleted
// immediately unless force_bg is set. In other cases, files will be scheduled
// for slow deletion, and assigned to the specified bucket if a legitimate one
// is provided. A legitimate bucket is one that is created with the
// `NewTrashBucket` API, and for which `WaitForEmptyTrashBucket` hasn't been
// called yet.
virtual Status ScheduleUnaccountedFileDeletion(
const std::string& file_path, const std::string& dir_to_sync,
const bool force_bg = false,
std::optional<int32_t> bucket = std::nullopt);
// Wait for all files being deleted in the background to finish or for
// destructor to be called.
virtual void WaitForEmptyTrash();
// Creates a new trash bucket. A legitimate bucket is only created and
// returned when slow deletion is enabled.
// For each bucket that is created and used, the user should also call
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure all
// the trash files are cleared.
std::optional<int32_t> NewTrashBucket();
// Wait for all the files in the specified bucket to be deleted in the
// background or for destructor to be called.
virtual void WaitForEmptyTrashBucket(int32_t bucket);
DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
// Stop the error recovery background thread. This should be called only

View File

@ -61,18 +61,6 @@ enum CompactionPri : char {
kRoundRobin = 0x4,
};
// Temperature of a file. Used to pass to FileSystem for a different
// placement and/or coding.
// Reserve some numbers in the middle, in case we need to insert new tier
// there.
enum class Temperature : uint8_t {
kUnknown = 0,
kHot = 0x04,
kWarm = 0x08,
kCold = 0x0C,
kLastTemperature,
};
struct FileTemperatureAge {
Temperature temperature = Temperature::kUnknown;
uint64_t age = 0;
@ -813,7 +801,7 @@ struct AdvancedColumnFamilyOptions {
// If this option is set, when creating the last level files, pass this
// temperature to FileSystem used. Should be no-op for default FileSystem
// and users need to plug in their own FileSystem to take advantage of it.
// When using FIFO compaction, this option is ignored.
// Currently only compatible with universal compaction.
//
// Dynamically changeable through the SetOptions() API
Temperature last_level_temperature = Temperature::kUnknown;
@ -1090,6 +1078,13 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through the SetOptions() API.
uint32_t bottommost_file_compaction_delay = 0;
// Enables additional integrity checks during reads/scans.
// Specifically, for skiplist-based memtables, we verify that keys visited
// are in order. This is helpful to detect corrupted memtable keys during
// reads. Enabling this feature incurs a performance overhead due to an
// additional key comparison during memtable lookup.
bool paranoid_memory_checks = false;
// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options

View File

@ -30,6 +30,7 @@
#include "rocksdb/port_defs.h"
#include "rocksdb/status.h"
#include "rocksdb/thread_status.h"
#include "rocksdb/types.h"
#ifdef _WIN32
// Windows API macro interference
@ -159,6 +160,9 @@ class Env : public Customizable {
// Size of file in bytes
uint64_t size_bytes;
// EXPERIMENTAL - only provided by some implementations
Temperature temperature = Temperature::kUnknown;
};
Env();

View File

@ -195,7 +195,9 @@ struct FileOptions : EnvOptions {
FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
FileOptions(const DBOptions& opts)
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
: EnvOptions(opts),
temperature(opts.metadata_write_temperature),
handoff_checksum_type(ChecksumType::kCRC32c) {}
FileOptions(const EnvOptions& opts)
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
@ -1952,7 +1954,8 @@ class FSDirectoryWrapper : public FSDirectory {
// A utility routine: write "data" to the named file.
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
const std::string& fname, bool should_sync = false,
const IOOptions& io_options = IOOptions());
const IOOptions& io_options = IOOptions(),
const FileOptions& file_options = FileOptions());
// A utility routine: read contents of named file into *data
IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,

View File

@ -47,7 +47,8 @@ class FilterBitsReader;
// structs because this is expected to be a temporary, stack-allocated object.
struct FilterBuildingContext {
// This constructor is for internal use only and subject to change.
FilterBuildingContext(const BlockBasedTableOptions& table_options);
// Keeps a reference to table_options.
explicit FilterBuildingContext(const BlockBasedTableOptions& table_options);
// Options for the table being built
const BlockBasedTableOptions& table_options;

View File

@ -194,6 +194,15 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));
// Same as Get() but performs data integrity validation.
virtual Status GetAndValidate(const LookupKey& /* k */,
                              void* /* callback_args */,
                              bool (* /* callback_func */)(void* arg,
                                                           const char* entry),
                              bool /*allow_data_in_error*/) {
  // Default: validation is opt-in per MemTableRep implementation; reps that
  // do not override this report NotSupported.
  return Status::NotSupported("GetAndValidate() not implemented.");
}
virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
const Slice& /*end_key*/) {
return 0;
@ -235,13 +244,38 @@ class MemTableRep {
// REQUIRES: Valid()
virtual void Next() = 0;
// Advances to the next position and performs integrity validations on the
// skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
  // Default: validating iteration is opt-in per MemTableRep implementation.
  return Status::NotSupported("NextAndValidate() not implemented.");
}
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Advances to the previous position and performs integrity validations on
// the skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
  // Default: validating iteration is opt-in per MemTableRep implementation.
  return Status::NotSupported("PrevAndValidate() not implemented.");
}
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
// Seek and perform integrity validations on the skip list.
// Iterator becomes invalid and Corruption is returned if a
// corruption is found.
virtual Status SeekAndValidate(const Slice& /* internal_key */,
                               const char* /* memtable_key */,
                               bool /* allow_data_in_errors */) {
  // Default: validating seek is opt-in per MemTableRep implementation.
  return Status::NotSupported("SeekAndValidate() not implemented.");
}
// Retreat to the last entry with a key <= target
virtual void SeekForPrev(const Slice& internal_key,
const char* memtable_key) = 0;

View File

@ -512,6 +512,10 @@ class CompactionService : public Customizable {
return CompactionServiceJobStatus::kUseLocal;
}
// Optional callback function upon Installation.
virtual void OnInstallation(const std::string& /*scheduled_job_id*/,
CompactionServiceJobStatus /*status*/) {}
// Deprecated. Please implement Schedule() and Wait() API to handle remote
// compaction
@ -1434,7 +1438,17 @@ struct DBOptions {
// For example, if an SST or blob file referenced by the MANIFEST is missing,
// BER might be able to find a set of files corresponding to an old "point in
// time" version of the column family, possibly from an older MANIFEST
// file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
// file.
// Besides complete "point in time" version, an incomplete version with
// only a suffix of L0 files missing can also be recovered to if the
// versioning history doesn't include an atomic flush. From the users'
// perspective, missing a suffix of L0 files means missing the
// user's most recently written data. So the remaining available files still
// present a valid point in time view, although for some previous time. It's
// not done for atomic flush because that guarantees a consistent view across
// column families. We cannot guarantee that if recovering an incomplete
// version.
// Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
// either ignored or replaced with BER, or quietly fixed regardless of BER
// setting. BER does require at least one valid MANIFEST to recover to a
// non-trivial DB state, unlike `ldb repair`.
@ -1566,6 +1580,16 @@ struct DBOptions {
// Default 100ms
uint64_t follower_catchup_retry_wait_ms = 100;
// When DB files other than SST, blob and WAL files are created, use this
// filesystem temperature. (See also `wal_write_temperature` and various
// `*_temperature` CF options.) When not `kUnknown`, this overrides any
// temperature set by OptimizeForManifestWrite functions.
Temperature metadata_write_temperature = Temperature::kUnknown;
// Use this filesystem temperature when creating WAL files. When not
// `kUnknown`, this overrides any temperature set by OptimizeForLogWrite
// functions.
Temperature wal_write_temperature = Temperature::kUnknown;
// End EXPERIMENTAL
};
@ -2107,6 +2131,8 @@ struct CompactRangeOptions {
// IngestExternalFileOptions is used by IngestExternalFile()
struct IngestExternalFileOptions {
// Can be set to true to move the files instead of copying them.
// Note that original file links will be removed after successful ingestion,
// unless `allow_db_generated_files` is true.
bool move_files = false;
// If set to true, ingestion falls back to copy when move fails.
bool failed_move_fall_back_to_copy = true;
@ -2180,22 +2206,19 @@ struct IngestExternalFileOptions {
// XXX: "bottommost" is obsolete/confusing terminology to refer to last level
bool fail_if_not_bottommost_level = false;
// EXPERIMENTAL
// If set to true, ingestion will
// - allow the files to not be generated by SstFileWriter, and
// - ignore cf_id mismatch between cf_id in the files and the CF they are
// being ingested into.
//
// REQUIRES:
// - files to be ingested do not overlap with existing keys.
// - write_global_seqno = false
// - move_files = false
//
// Warning: This ONLY works for SST files where all keys have sequence number
// zero and with no duplicated user keys (this should be guaranteed if the
// file is generated by a DB with zero as the largest sequence number).
// We scan the entire SST files to validate sequence numbers.
// Warning: If a DB contains ingested files generated by another DB/CF,
// RepairDB() may not correctly recover these files. It may lose these files.
// Enables ingestion of files not generated by SstFileWriter. When true:
// - Allows files to be ingested when their cf_id doesn't match the CF they
// are being ingested into.
// - Preserves original file links after successful ingestion when
// `move_files = true`.
// REQUIREMENTS:
// - Ingested files must not overlap with existing keys.
// - `write_global_seqno` must be false.
// - All keys in ingested files should have sequence number 0. We fail
// ingestion if any sequence number is non-zero.
// WARNING: If a DB contains ingested files generated by another DB/CF,
// RepairDB() may not recover these files correctly, potentially leading to
// data loss.
bool allow_db_generated_files = false;
};

View File

@ -529,6 +529,11 @@ enum Tickers : uint32_t {
// Footer corruption detected when opening an SST file for reading
SST_FOOTER_CORRUPTION_COUNT,
// Counters for file read retries with the verify_and_reconstruct_read
// file system option after detecting a checksum mismatch
FILE_READ_CORRUPTION_RETRY_COUNT,
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
TICKER_ENUM_MAX
};

View File

@ -291,15 +291,11 @@ struct BlockBasedTableOptions {
// Same as block_restart_interval but used for the index block.
int index_block_restart_interval = 1;
// Block size for partitioned metadata. Currently applied to indexes when
// kTwoLevelIndexSearch is used and to filters when partition_filters is used.
// Note: Since in the current implementation the filters and index partitions
// are aligned, an index/filter block is created when either index or filter
// block size reaches the specified limit.
// Note: this limit is currently applied to only index blocks; a filter
// partition is cut right after an index block is cut
// TODO(myabandeh): remove the note above when filter partitions are cut
// separately
// Target block size for partitioned metadata. Currently applied to indexes
// when kTwoLevelIndexSearch is used and to filters when partition_filters is
// used. When decouple_partitioned_filters=false (original behavior), there is
// much more deviation from this target size. See the comment on
// decouple_partitioned_filters.
uint64_t metadata_block_size = 4096;
// `cache_usage_options` allows users to specify the default
@ -398,6 +394,23 @@ struct BlockBasedTableOptions {
// block cache even when cache_index_and_filter_blocks=false.
bool partition_filters = false;
// When both partitioned indexes and partitioned filters are enabled,
// this enables independent partitioning boundaries between the two. Most
// notably, this enables these metadata blocks to hit their target size much
// more accurately, as there is often a disparity between index sizes and
// filter sizes. This should reduce fragmentation and metadata overheads in
// the block cache, as well as treat blocks more fairly for cache eviction
// purposes.
//
// There are no SST format compatibility issues with this option. (All
// versions of RocksDB able to read partitioned filters are able to read
// decoupled partitioned filters.)
//
// decouple_partitioned_filters = false is the original behavior, because of
// limitations in the initial implementation, and the new behavior
// decouple_partitioned_filters = true is expected to become the new default.
bool decouple_partitioned_filters = false;
// Option to generate Bloom/Ribbon filters that minimize memory
// internal fragmentation.
//
@ -679,6 +692,11 @@ struct BlockBasedTablePropertyNames {
static const std::string kWholeKeyFiltering;
// value is "1" for true and "0" for false.
static const std::string kPrefixFiltering;
// Set to "1" when partitioned filters are decoupled from partitioned indexes.
// This metadata is recorded in case a read-time optimization for coupled
// filter+index partitioning is ever developed; that optimization/assumption
// would be disabled when this is set.
static const std::string kDecoupledPartitionedFilters;
};
// Create default block based table factory.

View File

@ -74,6 +74,7 @@ struct TablePropertiesNames {
static const std::string kSequenceNumberTimeMapping;
static const std::string kTailStartOffset;
static const std::string kUserDefinedTimestampsPersisted;
static const std::string kKeyLargestSeqno;
};
// `TablePropertiesCollector` provides the mechanism for users to collect
@ -125,6 +126,8 @@ class TablePropertiesCollector {
// Finish() will be called when a table has already been built and is ready
// for writing the properties block.
// It will be called only once by RocksDB internal.
// When the returned Status is not OK, the collected properties will not be
// written to the file's property block.
//
// @params properties User will add their collected statistics to
// `properties`.
@ -132,6 +135,7 @@ class TablePropertiesCollector {
// Return the human-readable properties, where the key is property name and
// the value is the human-readable form of value.
// Returned properties are used for logging.
// It will only be called after Finish() has been called by RocksDB internal.
virtual UserCollectedProperties GetReadableProperties() const = 0;
@ -290,6 +294,12 @@ struct TableProperties {
// it's explicitly written to meta properties block.
uint64_t user_defined_timestamps_persisted = 1;
// The largest sequence number of keys in this file.
// UINT64_MAX means unknown.
// Only written to properties block if known (should be known unless the
// table is empty).
uint64_t key_largest_seqno = UINT64_MAX;
// DB identity
// db_id is an identifier generated the first time the DB is created
// If DB identity is unset or unassigned, `db_id` will be an empty string.

View File

@ -110,4 +110,16 @@ enum class WriteStallCondition {
kNormal,
};
// Temperature of a file. Used to pass to FileSystem for a different
// placement and/or coding.
// Reserve some numbers in the middle, in case we need to insert new tier
// there.
enum class Temperature : uint8_t {
  kUnknown = 0,
  kHot = 0x04,
  kWarm = 0x08,
  kCold = 0x0C,
  // Not a real temperature; presumably a sentinel marking the end of the
  // enum's value range. Keep last.
  kLastTemperature,
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -323,6 +323,22 @@ struct TransactionOptions {
// description. If a negative value is specified, then the default value from
// TransactionDBOptions is used.
int64_t write_batch_flush_threshold = -1;
// DO NOT USE.
// This is only a temporary option dedicated for MyRocks that will soon be
// removed.
// In normal use cases, meta info like column family's timestamp size is
// tracked at the transaction layer, so it's not necessary and even
// detrimental to track such info inside the internal WriteBatch because it
// may let anti-patterns like bypassing Transaction write APIs and directly
// write to its internal `WriteBatch` retrieved like this:
// https://github.com/facebook/mysql-5.6/blob/fb-mysql-8.0.32/storage/rocksdb/ha_rocksdb.cc#L4949-L4950
// Setting this option to true will keep aforementioned use case continue to
// work before it's refactored out.
// When this flag is enabled, we also intentionally only track the timestamp
// size in APIs that MyRocks currently are using, including Put, Merge, Delete
// DeleteRange, SingleDelete.
bool write_batch_track_timestamp_size = false;
};
// The per-write optimizations that do not involve transactions. TransactionDB

View File

@ -12,7 +12,7 @@
// NOTE: in 'main' development branch, this should be the *next*
// minor or major version number planned for release.
#define ROCKSDB_MAJOR 9
#define ROCKSDB_MINOR 6
#define ROCKSDB_MINOR 7
#define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with

View File

@ -30,7 +30,7 @@
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
#include "rocksdb/status.h"
#include "rocksdb/write_batch_base.h"
@ -437,6 +437,30 @@ class WriteBatch : public WriteBatchBase {
Status UpdateTimestamps(const Slice& ts,
std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
// TODO: remove these internal APIs after MyRocks refactor to not directly
// write to a `WriteBatch` retrieved from `Transaction` via
// `Transaction::GetWriteBatch`.
// Enable or disable recording of per-column-family timestamp sizes for
// entries added to this batch (see MaybeTrackTimestampSize()). Temporary
// internal API; see the TODO above.
void SetTrackTimestampSize(bool track_timestamp_size) {
  track_timestamp_size_ = track_timestamp_size;
}
// Record the timestamp size for `column_family_id` the first time that
// column family appears in this batch. No-op unless tracking was enabled
// via SetTrackTimestampSize(). A size already recorded for the column
// family is kept unchanged.
inline void MaybeTrackTimestampSize(uint32_t column_family_id, size_t ts_sz) {
  if (!track_timestamp_size_) {
    return;
  }
  // try_emplace inserts only when the key is absent, replacing the previous
  // find-then-emplace with a single hash lookup; behavior is identical.
  cf_id_to_ts_sz_.try_emplace(column_family_id, ts_sz);
}
// Return a mapping from column family id to timestamp size of all the column
// families involved in this WriteBatch. Only populated for entries added
// while tracking was enabled via SetTrackTimestampSize(true).
const std::unordered_map<uint32_t, size_t>& GetColumnFamilyToTimestampSize() {
  return cf_id_to_ts_sz_;
}
// Verify the per-key-value checksums of this write batch.
// Corruption status will be returned if the verification fails.
// If this write batch does not have per-key-value checksum,
@ -511,6 +535,10 @@ class WriteBatch : public WriteBatchBase {
size_t default_cf_ts_sz_ = 0;
bool track_timestamp_size_ = false;
std::unordered_map<uint32_t, size_t> cf_id_to_ts_sz_;
protected:
std::string rep_; // See comment in write_batch.cc for the format of rep_
};

View File

@ -5317,6 +5317,10 @@ class TickerTypeJni {
return -0x53;
case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT:
return -0x55;
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT:
return -0x56;
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
return -0x57;
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
// -0x54 is the max value at this time. Since these values are exposed
// directly to Java clients, we'll keep the value the same till the next
@ -5774,6 +5778,11 @@ class TickerTypeJni {
return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS;
case -0x55:
return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT;
case -0x56:
return ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT;
case -0x57:
return ROCKSDB_NAMESPACE::Tickers::
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
case -0x54:
// -0x54 is the max value at this time. Since these values are exposed
// directly to Java clients, we'll keep the value the same till the next

View File

@ -878,6 +878,10 @@ public enum TickerType {
SST_FOOTER_CORRUPTION_COUNT((byte) -0x55),
FILE_READ_CORRUPTION_RETRY_COUNT((byte) -0x56),
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
TICKER_ENUM_MAX((byte) -0x54);
private final byte value;

View File

@ -52,6 +52,7 @@
#include "port/likely.h"
#include "port/port.h"
#include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/random.h"
@ -169,13 +170,20 @@ class InlineSkipList {
// REQUIRES: Valid()
void Next();
[[nodiscard]] Status NextAndValidate(bool allow_data_in_errors);
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
[[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors);
// Advance to the first entry with a key >= target
void Seek(const char* target);
[[nodiscard]] Status SeekAndValidate(const char* target,
bool allow_data_in_errors);
// Retreat to the last entry with a key <= target
void SeekForPrev(const char* target);
@ -237,21 +245,20 @@ class InlineSkipList {
bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
// Returns the earliest node with a key >= key.
// Return nullptr if there is no such node.
Node* FindGreaterOrEqual(const char* key) const;
// Returns nullptr if there is no such node.
// @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// returned and *out_of_order_node will be set to n2.
Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
// Returns the latest node with a key < key.
// Returns head_ if there is no such node.
// Fills prev[level] with pointer to previous node at "level" for every
// level in [0..max_height_-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev = nullptr) const;
// Return the latest node with a key < key on bottom_level. Start searching
// from root node on the level below top_level.
// Fills prev[level] with pointer to previous node at "level" for every
// level in [bottom_level..top_level-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level,
int bottom_level) const;
// @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// returned and *out_of_order_node will be set to n2.
Node* FindLessThan(const char* key, Node** out_of_order_node) const;
// Return the last node in the list.
// Return head_ if list is empty.
@ -274,6 +281,8 @@ class InlineSkipList {
// lowest_level (inclusive).
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
int recompute_level);
static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
};
// Implementation details follow
@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() {
node_ = node_->Next(0);
}
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
bool allow_data_in_errors) {
assert(Valid());
Node* prev_node = node_;
node_ = node_->Next(0);
// Verify that keys are increasing.
if (prev_node != list_->head_ && node_ != nullptr &&
list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
Node* node = node_;
// invalidates the iterator
node_ = nullptr;
return Corruption(prev_node, node, allow_data_in_errors);
}
return Status::OK();
}
template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->Key());
node_ = list_->FindLessThan(node_->Key(), nullptr);
if (node_ == list_->head_) {
node_ = nullptr;
}
}
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
const bool allow_data_in_errors) {
assert(Valid());
// Skip list validation is done in FindLessThan().
Node* out_of_order_node = nullptr;
node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
if (out_of_order_node) {
Node* node = node_;
node_ = nullptr;
return Corruption(node, out_of_order_node, allow_data_in_errors);
}
if (node_ == list_->head_) {
node_ = nullptr;
}
return Status::OK();
}
template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
node_ = list_->FindGreaterOrEqual(target);
node_ = list_->FindGreaterOrEqual(target, nullptr);
}
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
const char* target, const bool allow_data_in_errors) {
Node* out_of_order_node = nullptr;
node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
if (out_of_order_node) {
Node* node = node_;
node_ = nullptr;
return Corruption(node, out_of_order_node, allow_data_in_errors);
}
return Status::OK();
}
template <class Comparator>
@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() {
rnd->Next() < kScaledInverseBranching_) {
height++;
}
TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
assert(height > 0);
assert(height <= kMaxHeight_);
assert(height <= kMaxPossibleHeight);
@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
InlineSkipList<Comparator>::FindGreaterOrEqual(
const char* key, Node** const out_of_order_node) const {
// Note: It looks like we could reduce duplication by implementing
// this function as FindLessThan(key)->Next(0), but we wouldn't be able
// to exit early on equality and the result wouldn't even be correct.
@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
Node* next = x->Next(level);
if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
}
// Make sure the lists are sorted
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const {
return FindLessThan(key, prev, head_, GetMaxHeight(), 0);
}
template <class Comparator>
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* root, int top_level,
int bottom_level) const {
assert(top_level > bottom_level);
int level = top_level - 1;
Node* x = root;
InlineSkipList<Comparator>::FindLessThan(const char* key,
Node** const out_of_order_node) const {
int level = GetMaxHeight() - 1;
assert(level >= 0);
Node* x = head_;
// KeyIsAfter(key, last_not_after) is definitely false
Node* last_not_after = nullptr;
const DecodedKey key_decoded = compare_.decode_key(key);
@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* next = x->Next(level);
if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
}
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
assert(x == head_ || KeyIsAfterNode(key_decoded, x));
@ -537,10 +599,7 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
assert(next != nullptr);
x = next;
} else {
if (prev != nullptr) {
prev[level] = x;
}
if (level == bottom_level) {
if (level == 0) {
return x;
} else {
// Switch to next list, reuse KeyIsAfterNode() result
@ -910,12 +969,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
while (true) {
// Checking for duplicate keys on the level 0 is sufficient
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
// duplicate key
return false;
}
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
// duplicate key
return false;
}
@ -953,12 +1012,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
}
// Checking for duplicate keys on the level 0 is sufficient
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
// duplicate key
return false;
}
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
// duplicate key
return false;
}
@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
template <class Comparator>
bool InlineSkipList<Comparator>::Contains(const char* key) const {
Node* x = FindGreaterOrEqual(key);
Node* x = FindGreaterOrEqual(key, nullptr);
if (x != nullptr && Equal(key, x->Key())) {
return true;
} else {
@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const {
}
}
template <class Comparator>
Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
bool allow_data_in_errors) {
std::string msg = "Out-of-order keys found in skiplist.";
if (allow_data_in_errors) {
msg.append(" prev key: " + Slice(prev->Key()).ToString(true));
msg.append(" next key: " + Slice(next->Key()).ToString(true));
}
return Status::Corruption(msg);
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep {
}
}
Status GetAndValidate(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry),
bool allow_data_in_errors) override {
SkipListRep::Iterator iter(&skip_list_);
Slice dummy_slice;
Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
allow_data_in_errors);
for (; iter.Valid() && status.ok() &&
callback_func(callback_args, iter.key());
status = iter.NextAndValidate(allow_data_in_errors)) {
}
return status;
}
uint64_t ApproximateNumEntries(const Slice& start_ikey,
const Slice& end_ikey) override {
std::string tmp;
@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep {
// Returns the key at the current position.
// REQUIRES: Valid()
const char* key() const override { return iter_.key(); }
const char* key() const override {
assert(Valid());
return iter_.key();
}
// Advances to the next position.
// REQUIRES: Valid()
void Next() override { iter_.Next(); }
void Next() override {
assert(Valid());
iter_.Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
void Prev() override { iter_.Prev(); }
void Prev() override {
assert(Valid());
iter_.Prev();
}
// Advance to the first entry with a key >= target
void Seek(const Slice& user_key, const char* memtable_key) override {
@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep {
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast() override { iter_.SeekToLast(); }
Status NextAndValidate(bool allow_data_in_errors) override {
assert(Valid());
return iter_.NextAndValidate(allow_data_in_errors);
}
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
bool allow_data_in_errors) override {
if (memtable_key != nullptr) {
return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
} else {
return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
allow_data_in_errors);
}
}
Status PrevAndValidate(bool allow_data_in_error) override {
assert(Valid());
return iter_.PrevAndValidate(allow_data_in_error);
}
protected:
std::string tmp_; // For passing to EncodeKey
};

View File

@ -266,6 +266,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
{PREFETCH_HITS, "rocksdb.prefetch.hits"},
{SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"},
{FILE_READ_CORRUPTION_RETRY_COUNT,
"rocksdb.file.read.corruption.retry.count"},
{FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
"rocksdb.file.read.corruption.retry.success.count"},
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {

View File

@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
OptionType::kUInt8T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"paranoid_memory_checks",
{offsetof(struct MutableCFOptions, paranoid_memory_checks),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{kOptNameCompOpts,
OptionTypeInfo::Struct(
kOptNameCompOpts, &compression_options_type_info,
@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const {
ttl);
ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64,
periodic_compaction_seconds);
ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d",
paranoid_memory_checks);
std::string result;
char buf[10];
for (const auto m : max_bytes_for_level_multiplier_additional) {

View File

@ -168,6 +168,7 @@ struct MutableCFOptions {
memtable_protection_bytes_per_key(
options.memtable_protection_bytes_per_key),
block_protection_bytes_per_key(options.block_protection_bytes_per_key),
paranoid_memory_checks(options.paranoid_memory_checks),
sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level),
@ -317,6 +318,7 @@ struct MutableCFOptions {
Temperature default_write_temperature;
uint32_t memtable_protection_bytes_per_key;
uint8_t block_protection_bytes_per_key;
bool paranoid_memory_checks;
uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level;

View File

@ -576,6 +576,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct ImmutableDBOptions, follower_catchup_retry_wait_ms),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"metadata_write_temperature",
{offsetof(struct ImmutableDBOptions, metadata_write_temperature),
OptionType::kTemperature, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"wal_write_temperature",
{offsetof(struct ImmutableDBOptions, wal_write_temperature),
OptionType::kTemperature, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
};
const std::string OptionsHelper::kDBOptionsName = "DBOptions";
@ -778,7 +786,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
follower_refresh_catchup_period_ms(
options.follower_refresh_catchup_period_ms),
follower_catchup_retry_count(options.follower_catchup_retry_count),
follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms) {
follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms),
metadata_write_temperature(options.metadata_write_temperature),
wal_write_temperature(options.wal_write_temperature) {
fs = env->GetFileSystem();
clock = env->GetSystemClock().get();
logger = info_log.get();
@ -956,6 +966,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
db_host_id.c_str());
ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s",
enforce_single_del_contracts ? "true" : "false");
ROCKS_LOG_HEADER(log, " Options.metadata_write_temperature: %s",
temperature_to_string[metadata_write_temperature].c_str());
ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s",
temperature_to_string[wal_write_temperature].c_str());
}
bool ImmutableDBOptions::IsWalDirSameAsDBPath() const {

View File

@ -103,6 +103,8 @@ struct ImmutableDBOptions {
uint64_t follower_refresh_catchup_period_ms;
uint64_t follower_catchup_retry_count;
uint64_t follower_catchup_retry_wait_ms;
Temperature metadata_write_temperature;
Temperature wal_write_temperature;
// Beginning convenience/helper objects that are not part of the base
// DBOptions

View File

@ -180,6 +180,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
options.enforce_single_del_contracts =
immutable_db_options.enforce_single_del_contracts;
options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc;
options.follower_refresh_catchup_period_ms =
immutable_db_options.follower_refresh_catchup_period_ms;
options.follower_catchup_retry_count =
immutable_db_options.follower_catchup_retry_count;
options.follower_catchup_retry_wait_ms =
immutable_db_options.follower_catchup_retry_wait_ms;
options.metadata_write_temperature =
immutable_db_options.metadata_write_temperature;
options.wal_write_temperature = immutable_db_options.wal_write_temperature;
return options;
}
@ -213,6 +222,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
moptions.memtable_protection_bytes_per_key;
cf_opts->block_protection_bytes_per_key =
moptions.block_protection_bytes_per_key;
cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
cf_opts->bottommost_file_compaction_delay =
moptions.bottommost_file_compaction_delay;

View File

@ -69,8 +69,9 @@ Status PersistRocksDBOptions(const WriteOptions& write_options,
}
std::unique_ptr<FSWritableFile> wf;
Status s =
fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr);
FileOptions file_options;
file_options.temperature = db_opt.metadata_write_temperature;
Status s = fs->NewWritableFile(file_name, file_options, &wf, nullptr);
if (!s.ok()) {
return s;
}

View File

@ -188,6 +188,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
"block_size_deviation=8;block_restart_interval=4; "
"metadata_block_size=1024;"
"partition_filters=false;"
"decouple_partitioned_filters=true;"
"optimize_filters_for_memory=true;"
"use_delta_encoding=true;"
"index_block_restart_interval=4;"
@ -366,7 +367,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"lowest_used_cache_tier=kNonVolatileBlockTier;"
"allow_data_in_errors=false;"
"enforce_single_del_contracts=false;"
"daily_offpeak_time_utc=08:30-19:00;",
"daily_offpeak_time_utc=08:30-19:00;"
"follower_refresh_catchup_period_ms=123;"
"follower_catchup_retry_count=456;"
"follower_catchup_retry_wait_ms=789;"
"metadata_write_temperature=kCold;"
"wal_write_temperature=kHot;",
new_options));
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
@ -567,7 +573,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"block_protection_bytes_per_key=1;"
"memtable_max_range_deletions=999999;"
"bottommost_file_compaction_delay=7200;"
"uncache_aggressiveness=1234;",
"uncache_aggressiveness=1234;"
"paranoid_memory_checks=1;",
new_options));
ASSERT_NE(new_options->blob_cache.get(), nullptr);

View File

@ -96,7 +96,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
filter_bits_builder, table_opt.index_block_restart_interval,
use_delta_encoding_for_index_values, p_index_builder, partition_size,
ts_sz, persist_user_defined_timestamps);
ts_sz, persist_user_defined_timestamps,
table_opt.decouple_partitioned_filters);
} else {
return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
table_opt.whole_key_filtering,
@ -213,10 +214,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
public:
explicit BlockBasedTablePropertiesCollector(
BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
bool prefix_filtering)
bool prefix_filtering, bool decoupled_partitioned_filters)
: index_type_(index_type),
whole_key_filtering_(whole_key_filtering),
prefix_filtering_(prefix_filtering) {}
prefix_filtering_(prefix_filtering),
decoupled_partitioned_filters_(decoupled_partitioned_filters) {}
Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
uint64_t /*file_size*/) override {
@ -240,6 +242,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
whole_key_filtering_ ? kPropTrue : kPropFalse});
properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
prefix_filtering_ ? kPropTrue : kPropFalse});
if (decoupled_partitioned_filters_) {
properties->insert(
{BlockBasedTablePropertyNames::kDecoupledPartitionedFilters,
kPropTrue});
}
return Status::OK();
}
@ -257,6 +264,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
BlockBasedTableOptions::IndexType index_type_;
bool whole_key_filtering_;
bool prefix_filtering_;
bool decoupled_partitioned_filters_;
};
struct BlockBasedTableBuilder::Rep {
@ -296,7 +304,7 @@ struct BlockBasedTableBuilder::Rep {
std::string index_separator_scratch;
PartitionedIndexBuilder* p_index_builder_ = nullptr;
std::string last_key;
std::string last_ikey; // Internal key or empty (unset)
const Slice* first_key_in_next_block = nullptr;
CompressionType compression_type;
uint64_t sample_for_compression;
@ -594,7 +602,8 @@ struct BlockBasedTableBuilder::Rep {
table_properties_collectors.emplace_back(
new BlockBasedTablePropertiesCollector(
table_options.index_type, table_options.whole_key_filtering,
prefix_extractor != nullptr));
prefix_extractor != nullptr,
table_options.decouple_partitioned_filters));
if (ts_sz > 0 && persist_user_defined_timestamps) {
table_properties_collectors.emplace_back(
new TimestampTablePropertiesCollector(
@ -618,6 +627,9 @@ struct BlockBasedTableBuilder::Rep {
if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
}
// Default is UINT64_MAX for unknown. Setting it to 0 here
// to allow updating it by taking max in BlockBasedTableBuilder::Add().
props.key_largest_seqno = 0;
if (FormatVersionUsesContextChecksum(table_options.format_version)) {
// Must be non-zero and semi- or quasi-random
@ -654,6 +666,7 @@ struct BlockBasedTableBuilder::Rep {
};
struct BlockBasedTableBuilder::ParallelCompressionRep {
// TODO: consider replacing with autovector or similar
// Keys is a wrapper of vector of strings avoiding
// releasing string memories during vector clear()
// in order to save memory allocation overhead
@ -998,24 +1011,27 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() {
delete rep_;
}
void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
Rep* r = rep_;
assert(rep_->state != Rep::State::kClosed);
if (!ok()) {
return;
}
ValueType value_type = ExtractValueType(key);
ValueType value_type;
SequenceNumber seq;
UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
if (IsValueType(value_type)) {
#ifndef NDEBUG
if (r->props.num_entries > r->props.num_range_deletions) {
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
}
#endif // !NDEBUG
auto should_flush = r->flush_block_policy->Update(key, value);
auto should_flush = r->flush_block_policy->Update(ikey, value);
if (should_flush) {
assert(!r->data_block.empty());
r->first_key_in_next_block = &key;
r->first_key_in_next_block = &ikey;
Flush();
if (r->state == Rep::State::kBuffered) {
bool exceeds_buffer_limit =
@ -1050,7 +1066,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->IsParallelCompressionEnabled()) {
r->pc_rep->curr_block_keys->Clear();
} else {
r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle,
r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
r->pending_handle,
&r->index_separator_scratch);
}
}
@ -1060,27 +1077,31 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// builder after being added to index builder.
if (r->state == Rep::State::kUnbuffered) {
if (r->IsParallelCompressionEnabled()) {
r->pc_rep->curr_block_keys->PushBack(key);
r->pc_rep->curr_block_keys->PushBack(ikey);
} else {
if (r->filter_builder != nullptr) {
r->filter_builder->Add(
ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
r->filter_builder->AddWithPrevKey(
ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
r->last_ikey.empty()
? Slice{}
: ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
}
}
}
r->data_block.AddWithLastKey(key, value, r->last_key);
r->last_key.assign(key.data(), key.size());
r->data_block.AddWithLastKey(ikey, value, r->last_ikey);
r->last_ikey.assign(ikey.data(), ikey.size());
assert(!r->last_ikey.empty());
if (r->state == Rep::State::kBuffered) {
// Buffered keys will be replayed from data_block_buffers during
// `Finish()` once compression dictionary has been finalized.
} else {
if (!r->IsParallelCompressionEnabled()) {
r->index_builder->OnKeyAdded(key);
r->index_builder->OnKeyAdded(ikey);
}
}
// TODO offset passed in is not accurate for parallel compression case
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors,
r->ioptions.logger);
@ -1094,9 +1115,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) {
persisted_end = StripTimestampFromUserKey(value, r->ts_sz);
}
r->range_del_block.Add(key, persisted_end);
r->range_del_block.Add(ikey, persisted_end);
// TODO offset passed in is not accurate for parallel compression case
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors,
r->ioptions.logger);
} else {
@ -1108,7 +1129,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
}
r->props.num_entries++;
r->props.raw_key_size += key.size();
r->props.raw_key_size += ikey.size();
if (!r->persist_user_defined_timestamps) {
r->props.raw_key_size -= r->ts_sz;
}
@ -1452,6 +1473,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
Rep* r = rep_;
ParallelCompressionRep::BlockRepSlot* slot = nullptr;
ParallelCompressionRep::BlockRep* block_rep = nullptr;
// Starts empty; see FilterBlockBuilder::AddWithPrevKey
std::string prev_block_last_key_no_ts;
while (r->pc_rep->write_queue.pop(slot)) {
assert(slot != nullptr);
slot->Take(block_rep);
@ -1465,13 +1488,20 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
continue;
}
Slice prev_key_no_ts = prev_block_last_key_no_ts;
for (size_t i = 0; i < block_rep->keys->Size(); i++) {
auto& key = (*block_rep->keys)[i];
if (r->filter_builder != nullptr) {
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
prev_key_no_ts = key_no_ts;
}
r->index_builder->OnKeyAdded(key);
}
if (r->filter_builder != nullptr) {
prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
prev_key_no_ts.size());
}
r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
block_rep->data->size());
@ -1563,6 +1593,13 @@ void BlockBasedTableBuilder::WriteFilterBlock(
// No filter block needed
return;
}
if (!rep_->last_ikey.empty()) {
// We might have been using AddWithPrevKey, so need PrevKeyBeforeFinish
// to be safe. And because we are re-synchronized after buffered/parallel
// operation, rep_->last_ikey is accurate.
rep_->filter_builder->PrevKeyBeforeFinish(
ExtractUserKeyAndStripTimestamp(rep_->last_ikey, rep_->ts_sz));
}
BlockHandle filter_block_handle;
bool is_partitioned_filter = rep_->table_options.partition_filters;
if (ok()) {
@ -1578,9 +1615,10 @@ void BlockBasedTableBuilder::WriteFilterBlock(
// See FilterBlockBuilder::Finish() for more on the difference in
// transferred filter data payload among different FilterBlockBuilder
// subtypes.
std::unique_ptr<const char[]> filter_data;
Slice filter_content =
rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
std::unique_ptr<const char[]> filter_owner;
Slice filter_content;
s = rep_->filter_builder->Finish(filter_block_handle, &filter_content,
&filter_owner);
assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
if (s.IsCorruption()) {
@ -1749,6 +1787,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
rep_->props.user_defined_timestamps_persisted =
rep_->persist_user_defined_timestamps;
assert(IsEmpty() || rep_->props.key_largest_seqno != UINT64_MAX);
// Add basic properties
property_block_builder.AddTableProperty(rep_->props);
@ -1976,6 +2015,10 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (r->filter_builder != nullptr) {
// NOTE: AddWithPrevKey here would only save key copying if prev is
// pinned (iter->IsKeyPinned()), which is probably rare with delta
// encoding. OK to go from Add() here to AddWithPrevKey() in
// unbuffered operation.
r->filter_builder->Add(
ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
}
@ -1989,6 +2032,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
iter->SeekToLast();
assert(iter->Valid());
r->index_builder->AddIndexEntry(
iter->key(), first_key_in_next_block_ptr, r->pending_handle,
&r->index_separator_scratch);
@ -2027,7 +2071,7 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries first.
if (ok() && !empty_data_block) {
r->index_builder->AddIndexEntry(
r->last_key, nullptr /* no next data block */, r->pending_handle,
r->last_ikey, nullptr /* no next data block */, r->pending_handle,
&r->index_separator_scratch);
}
}

View File

@ -304,6 +304,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct BlockBasedTableOptions, partition_filters),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"decouple_partitioned_filters",
{offsetof(struct BlockBasedTableOptions, decouple_partitioned_filters),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"optimize_filters_for_memory",
{offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
OptionType::kBoolean, OptionVerificationType::kNormal,
@ -971,6 +975,8 @@ const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
"rocksdb.block.based.table.whole.key.filtering";
const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
"rocksdb.block.based.table.prefix.filtering";
const std::string BlockBasedTablePropertyNames::kDecoupledPartitionedFilters =
"rocksdb.block.based.table.decoupled.partitioned.filters";
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
const std::string kHashIndexPrefixesMetadataBlock =
"rocksdb.hashindex.metadata";

Some files were not shown because too many files have changed in this diff Show More