Merge branch 'main' into JniReaderForTableIterator

commit 21eca90d5e
Author: Swaminathan Balachandran, 2024-08-31 10:09:33 -07:00, committed by GitHub
131 changed files with 4763 additions and 1706 deletions

View File

@@ -1,13 +1,13 @@
 name: facebook/rocksdb/benchmark-linux
 on: workflow_dispatch
-jobs:
-# FIXME: when this job is fixed, it should be given a cron schedule like
+permissions: {}
+# FIXME: Disabled temporarily
 # schedule:
-# - cron: 0 * * * *
-# workflow_dispatch:
+# - cron: 7 */2 * * * # At minute 7 past every 2nd hour
+jobs:
 benchmark-linux:
 if: ${{ github.repository_owner == 'facebook' }}
-runs-on: ubuntu-latest
+runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
 steps:
 - uses: actions/checkout@v4.1.0
 - uses: "./.github/actions/build-for-benchmarks"

View File

@@ -1,5 +1,6 @@
 name: facebook/rocksdb/nightly
 on: workflow_dispatch
+permissions: {}
 jobs:
 # These jobs would be in nightly but are failing or otherwise broken for
 # some reason.

View File

@@ -3,6 +3,7 @@ on:
 schedule:
 - cron: 0 9 * * *
 workflow_dispatch:
+permissions: {}
 jobs:
 build-format-compatible:
 if: ${{ github.repository_owner == 'facebook' }}
@@ -59,12 +60,15 @@ jobs:
 container:
 image: zjay437/rocksdb:0.6
 options: --shm-size=16gb
+env:
+CC: clang-13
+CXX: clang++-13
 steps:
 - uses: actions/checkout@v4.1.0
 - uses: "./.github/actions/pre-steps"
 - uses: "./.github/actions/setup-folly"
 - uses: "./.github/actions/build-folly"
-- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
+- run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
 - uses: "./.github/actions/post-steps"
 build-linux-valgrind:
 if: ${{ github.repository_owner == 'facebook' }}

View File

@@ -1,5 +1,6 @@
 name: facebook/rocksdb/pr-jobs-candidate
 on: workflow_dispatch
+permissions: {}
 jobs:
 # These jobs would be in pr-jobs but are failing or otherwise broken for
 # some reason.

View File

@@ -1,5 +1,6 @@
 name: facebook/rocksdb/pr-jobs
 on: [push, pull_request]
+permissions: {}
 jobs:
 # NOTE: multiple workflows would be recommended, but the current GHA UI in
 # PRs doesn't make it clear when there's an overall error with a workflow,

View File

@@ -1,6 +1,28 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
+## 9.6.0 (08/19/2024)
+### New Features
+* *Best efforts recovery supports recovering to incomplete Version with a clean seqno cut that presents a valid point in time view from the user's perspective, if versioning history doesn't include atomic flush.
+* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
+* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans.
+### Public API Changes
+* Add ticker stats to count file read retries due to checksum mismatch
+* Adds optional installation callback function for remote compaction
+### Behavior Changes
+* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size and reduce the threshold for L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
+### Bug Fixes
+* *Make DestroyDB support slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
+* Fixed a bug where we set unprep_seqs_ even when WriteImpl() fails. This was caught by stress test write fault injection in WriteImpl(). This may have incorrectly caused iteration creation failure for unvalidated writes or returned wrong result for WriteUnpreparedTxn::GetUnpreparedSequenceNumbers().
+* Fixed a bug where a successful write right after error recovery for the last failed write finishes causes duplicate WAL entries
+* Fixed a data race involving the background error status in `unordered_write` mode.
+* *Fix a bug where file snapshot functions like backup, checkpoint may attempt to copy a non-existing manifest file. #12882
+* Fix a bug where per kv checksum corruption may be ignored in MultiGet().
+* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
 ## 9.5.0 (07/19/2024)
 ### Public API Changes
 * Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch

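To make the 9.6.0 entries above concrete, here is a minimal sketch of opting in to two of the listed features. The option names (`paranoid_memory_checks`, `decouple_partitioned_filters`) come from the changelog itself; the surrounding setup, path, and filter-policy choice are illustrative assumptions, not part of this commit.

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Mutable CF option from 9.6.0: extra key-ordering validation while
  // reading/scanning skip-list based memtables.
  options.paranoid_memory_checks = true;

  // Opt in to decoupling filter partitions from index partitions, so both
  // can target metadata_block_size independently.
  rocksdb::BlockBasedTableOptions bbto;
  bbto.partition_filters = true;
  bbto.index_type =
      rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  bbto.decouple_partitioned_filters = true;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_9_6_example", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```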
View File

@@ -1652,6 +1652,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
 }
 for (const Slice& table_newest_udt :
 imm()->GetTablesNewestUDT(max_memtable_id)) {
+if (table_newest_udt.empty()) {
+continue;
+}
 assert(table_newest_udt.size() == full_history_ts_low.size());
 // Checking the newest UDT contained in MemTable with ascending ID up to
 // `max_memtable_id`. Return immediately on finding the first MemTable that

View File

@@ -3067,12 +3067,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
 WaitForCompaction();
 AssertFilesPerLevel("0,1", 0 /* cf */);
+// We should calculate the limit by obtaining the number of env background
+// threads, because the current test case will share the same env
+// with another case that may have already increased the number of
+// background threads which is larger than kParallelismLimit
+const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
 // Block the compaction thread pool so marked files accumulate in L0.
-test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit];
-for (int i = 0; i < kParallelismLimit; i++) {
+std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
+for (int i = 0; i < limit; i++) {
+sleeping_tasks.emplace_back(
+std::make_shared<test::SleepingBackgroundTask>());
 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
-&sleeping_tasks[i], Env::Priority::LOW);
-sleeping_tasks[i].WaitUntilSleeping();
+sleeping_tasks[i].get(), Env::Priority::LOW);
+sleeping_tasks[i]->WaitUntilSleeping();
 }
 // Zero marked upper-level files. No speedup.
@@ -3091,9 +3099,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
 ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
 AssertFilesPerLevel("2,1", 0 /* cf */);
-for (int i = 0; i < kParallelismLimit; i++) {
-sleeping_tasks[i].WakeUp();
-sleeping_tasks[i].WaitUntilDone();
+for (int i = 0; i < limit; i++) {
+sleeping_tasks[i]->WakeUp();
+sleeping_tasks[i]->WaitUntilDone();
 }
 }

View File

@@ -552,7 +552,8 @@ class CompactionJobTestBase : public testing::Test {
 /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
 /*error_handler=*/nullptr, /*read_only=*/false));
 compaction_job_stats_.Reset();
-ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
+ASSERT_OK(
+SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
 VersionEdit new_db;
 new_db.SetLogNumber(0);
@@ -575,7 +576,8 @@ class CompactionJobTestBase : public testing::Test {
 }
 ASSERT_OK(s);
 // Make "CURRENT" file that points to the new manifest file.
-s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
+s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
+Temperature::kUnknown, nullptr);
 ASSERT_OK(s);

View File

@@ -925,11 +925,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
 }
 uint64_t l0_size = 0;
 for (const auto& file : l0_files) {
-l0_size += file->fd.GetFileSize();
+assert(file->compensated_file_size >= file->fd.GetFileSize());
+// Compact down L0s with more deletions.
+l0_size += file->compensated_file_size;
 }
-const uint64_t min_lbase_size =
-l0_size * static_cast<uint64_t>(std::max(
-10.0, mutable_cf_options_.max_bytes_for_level_multiplier));
+
+// Avoid L0->Lbase compactions that are inefficient for write-amp.
+const double kMultiplier =
+std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
+const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
 assert(min_lbase_size >= l0_size);
 const std::vector<FileMetaData*>& lbase_files =
 vstorage_->LevelFiles(/*level=*/base_level);

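Illustrative arithmetic for the new threshold above (numbers chosen for illustration, not taken from the diff): with the default max_bytes_for_level_multiplier of 10 and a compensated L0 size of 4,000 bytes, kMultiplier = max(10.0, 10) * 2 = 20, so min_lbase_size = 4,000 * 20 = 80,000. Intra-L0 compaction is only considered when Lbase already holds at least that much data; otherwise the regular L0->Lbase compaction is picked, which matches the updated test comment in the next file.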
View File

@@ -214,7 +214,10 @@ class CompactionPickerTest : public CompactionPickerTestBase {
 explicit CompactionPickerTest()
 : CompactionPickerTestBase(BytewiseComparator()) {}
-~CompactionPickerTest() override = default;
+~CompactionPickerTest() override {
+SyncPoint::GetInstance()->ClearAllCallBacks();
+SyncPoint::GetInstance()->DisableProcessing();
+}
 };
 class CompactionPickerU64TsTest : public CompactionPickerTestBase {
@@ -4284,27 +4287,28 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
 SCOPED_TRACE("lbase_size_multiplier=" +
 std::to_string(lbase_size_multiplier));
 NewVersionStorage(6, kCompactionStyleLevel);
-// When L0 size is <= Lbase size / max_bytes_for_level_multiplier,
+// When L0 size is <= Lbase size / max_bytes_for_level_multiplier / 2,
 // intra-L0 compaction is picked. Otherwise, L0->L1
 // compaction is picked.
+// compensated_file_size will be used to compute total l0 size.
 Add(/*level=*/0, /*file_number=*/1U, /*smallest=*/"100",
-/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
 /*smallest_seq=*/10, /*largest_seq=*/11,
 /*compensated_file_size=*/1000);
 Add(/*level=*/0, /*file_number=*/2U, /*smallest=*/"100",
-/*largest=*/"100", /*file_size=*/1000, /*path_id=*/0,
+/*largest=*/"100", /*file_size=*/10, /*path_id=*/0,
 /*smallest_seq=*/20, /*largest_seq=*/21,
 /*compensated_file_size=*/1000);
 Add(/*level=*/0, /*file_number=*/3U, /*smallest=*/"100",
-/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
 /*smallest_seq=*/30, /*largest_seq=*/31,
 /*compensated_file_size=*/1000);
 Add(/*level=*/0, /*file_number=*/4U, /*smallest=*/"100",
-/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
+/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
 /*smallest_seq=*/40, /*largest_seq=*/41,
 /*compensated_file_size=*/1000);
 const uint64_t l0_size = 4000;
-const uint64_t lbase_size = l0_size * lbase_size_multiplier;
+const uint64_t lbase_size = l0_size * lbase_size_multiplier * 2;
 Add(/*level=*/1, /*file_number=*/5U, /*smallest=*/"100",
 /*largest=*/"200", /*file_size=*/lbase_size, /*path_id=*/0,
 /*smallest_seq=*/0, /*largest_seq=*/0,

View File

@@ -140,9 +140,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
 return compaction_status;
 }
+// CompactionServiceJobStatus::kSuccess was returned, but somehow we failed to
+// read the result. Consider this as an installation failure
 if (!s.ok()) {
 sub_compact->status = s;
 compaction_result.status.PermitUncheckedError();
+db_options_.compaction_service->OnInstallation(
+response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
 return CompactionServiceJobStatus::kFailure;
 }
 sub_compact->status = compaction_result.status;
@@ -154,18 +158,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
 is_first_one = false;
 }
-ROCKS_LOG_INFO(db_options_.info_log,
-"[%s] [JOB %d] Receive remote compaction result, output path: "
-"%s, files: %s",
-compaction_input.column_family.name.c_str(), job_id_,
-compaction_result.output_path.c_str(),
-output_files_oss.str().c_str());
+ROCKS_LOG_INFO(
+db_options_.info_log,
+"[%s] [JOB %d] Received remote compaction result, output path: "
+"%s, files: %s",
+compaction_input.column_family.name.c_str(), job_id_,
+compaction_result.output_path.c_str(), output_files_oss.str().c_str());
-if (!s.ok()) {
-sub_compact->status = s;
-return CompactionServiceJobStatus::kFailure;
-}
+// Installation Starts
 for (const auto& file : compaction_result.output_files) {
 uint64_t file_num = versions_->NewFileNumber();
 auto src_file = compaction_result.output_path + "/" + file.file_name;
@@ -174,6 +174,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
 s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
 if (!s.ok()) {
 sub_compact->status = s;
+db_options_.compaction_service->OnInstallation(
+response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
 return CompactionServiceJobStatus::kFailure;
 }
@@ -182,6 +184,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
 s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
 if (!s.ok()) {
 sub_compact->status = s;
+db_options_.compaction_service->OnInstallation(
+response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
 return CompactionServiceJobStatus::kFailure;
 }
 meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
@@ -206,6 +210,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
 RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
 RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
 compaction_result.bytes_written);
+db_options_.compaction_service->OnInstallation(
+response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
 return CompactionServiceJobStatus::kSuccess;
 }

View File

@@ -108,6 +108,11 @@ class MyTestCompactionService : public CompactionService {
 }
 }
+void OnInstallation(const std::string& /*scheduled_job_id*/,
+CompactionServiceJobStatus status) override {
+final_updated_status_ = status;
+}
 int GetCompactionNum() { return compaction_num_.load(); }
 CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
@@ -136,6 +141,10 @@ class MyTestCompactionService : public CompactionService {
 void SetCanceled(bool canceled) { canceled_ = canceled; }
+CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
+return final_updated_status_.load();
+}
 private:
 InstrumentedMutex mutex_;
 std::atomic_int compaction_num_{0};
@@ -158,6 +167,8 @@ class MyTestCompactionService : public CompactionService {
 std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
 table_properties_collector_factories_;
 std::atomic_bool canceled_{false};
+std::atomic<CompactionServiceJobStatus> final_updated_status_{
+CompactionServiceJobStatus::kUseLocal};
 };
 class CompactionServiceTest : public DBTestBase {
@@ -255,6 +266,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
 auto my_cs = GetCompactionService();
 ASSERT_GE(my_cs->GetCompactionNum(), 1);
+ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
+my_cs->GetFinalCompactionServiceJobStatus());
 // make sure the compaction statistics is only recorded on the remote side
 ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
@@ -437,6 +450,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
 Slice end(end_str);
 Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
 ASSERT_FALSE(s.ok());
+ASSERT_EQ(CompactionServiceJobStatus::kFailure,
+my_cs->GetFinalCompactionServiceJobStatus());
 }
 TEST_F(CompactionServiceTest, SubCompaction) {

View File

@@ -3407,6 +3407,46 @@ class TableFileListener : public EventListener {
 InstrumentedMutex mutex_;
 std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
 };
+class FlushTableFileListener : public EventListener {
+ public:
+void OnTableFileCreated(const TableFileCreationInfo& info) override {
+InstrumentedMutexLock lock(&mutex_);
+if (info.reason != TableFileCreationReason::kFlush) {
+return;
+}
+cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
+}
+std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
+InstrumentedMutexLock lock(&mutex_);
+return cf_to_flushed_files_[cf_name];
+}
+ private:
+InstrumentedMutex mutex_;
+std::unordered_map<std::string, std::vector<std::string>>
+cf_to_flushed_files_;
+};
+class FlushBlobFileListener : public EventListener {
+ public:
+void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+InstrumentedMutexLock lock(&mutex_);
+if (info.reason != BlobFileCreationReason::kFlush) {
+return;
+}
+cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
+}
+std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
+InstrumentedMutexLock lock(&mutex_);
+return cf_to_flushed_blobs_files_[cf_name];
+}
+ private:
+InstrumentedMutex mutex_;
+std::unordered_map<std::string, std::vector<std::string>>
+cf_to_flushed_blobs_files_;
+};
 } // anonymous namespace
 TEST_F(DBBasicTest, LastSstFileNotInManifest) {
@@ -3512,6 +3552,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
 }
 }
+// Param 0: whether to enable blob DB.
+// Param 1: when blob DB is enabled, whether to also delete the missing L0
+// file's associated blob file.
+class BestEffortsRecoverIncompleteVersionTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+BestEffortsRecoverIncompleteVersionTest()
+    : DBTestBase("best_efforts_recover_incomplete_version_test",
+                 /*env_do_fsync=*/false) {}
+};
+TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
+Options options = CurrentOptions();
+options.enable_blob_files = std::get<0>(GetParam());
+bool delete_blob_file_too = std::get<1>(GetParam());
+DestroyAndReopen(options);
+FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
+FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
+// Disable auto compaction to simplify SST file name tracking.
+options.disable_auto_compactions = true;
+options.listeners.emplace_back(flush_table_listener);
+options.listeners.emplace_back(flush_blob_listener);
+CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
+"eevee"};
+int num_cfs = static_cast<int>(handles_.size());
+ASSERT_EQ(3, num_cfs);
+std::string start = "a";
+Slice start_slice = start;
+std::string end = "d";
+Slice end_slice = end;
+for (int cf = 0; cf != num_cfs; ++cf) {
+ASSERT_OK(Put(cf, "a", "a_value"));
+ASSERT_OK(Flush(cf));
+// Compact file to L1 to avoid trivial file move in the next compaction
+ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
+&start_slice, &end_slice));
+ASSERT_OK(Put(cf, "a", "a_value_new"));
+ASSERT_OK(Flush(cf));
+ASSERT_OK(Put(cf, "b", "b_value"));
+ASSERT_OK(Flush(cf));
+ASSERT_OK(Put(cf, "f", "f_value"));
+ASSERT_OK(Flush(cf));
+ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
+&start_slice, &end_slice));
+}
+dbfull()->TEST_DeleteObsoleteFiles();
+// Delete the most recent L0 file which is before a compaction.
+for (int i = 0; i < num_cfs; ++i) {
+std::vector<std::string>& files =
+flush_table_listener->GetFlushedFiles(all_cf_names[i]);
+ASSERT_EQ(4, files.size());
+ASSERT_OK(env_->DeleteFile(files[files.size() - 1]));
+if (options.enable_blob_files) {
+std::vector<std::string>& blob_files =
+flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
+ASSERT_EQ(4, blob_files.size());
+if (delete_blob_file_too) {
+ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1]));
+}
+}
+}
+options.best_efforts_recovery = true;
+ReopenWithColumnFamilies(all_cf_names, options);
+for (int i = 0; i < num_cfs; ++i) {
+auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ColumnFamilyData* cfd = cfh->cfd();
+VersionStorageInfo* vstorage = cfd->current()->storage_info();
+// The L0 file flushed right before the last compaction is missing.
+ASSERT_EQ(0, vstorage->LevelFiles(0).size());
+// Only the output of the last compaction is available.
+ASSERT_EQ(1, vstorage->LevelFiles(1).size());
+}
+// Verify data
+ReadOptions read_opts;
+read_opts.total_order_seek = true;
+for (int i = 0; i < num_cfs; ++i) {
+std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
+iter->SeekToFirst();
+ASSERT_TRUE(iter->Valid());
+ASSERT_OK(iter->status());
+ASSERT_EQ("a", iter->key());
+ASSERT_EQ("a_value_new", iter->value());
+iter->Next();
+ASSERT_TRUE(iter->Valid());
+ASSERT_OK(iter->status());
+ASSERT_EQ("b", iter->key());
+ASSERT_EQ("b_value", iter->value());
+iter->Next();
+ASSERT_FALSE(iter->Valid());
+ASSERT_OK(iter->status());
+}
+// Write more data.
+for (int cf = 0; cf < num_cfs; ++cf) {
+ASSERT_OK(Put(cf, "g", "g_value"));
+ASSERT_OK(Flush(cf));
+ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+nullptr));
+std::string value;
+ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
+ASSERT_EQ("g_value", value);
+}
+}
+INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
+BestEffortsRecoverIncompleteVersionTest,
+testing::Values(std::make_tuple(false, false),
+std::make_tuple(true, false),
+std::make_tuple(true, true)));
 TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
 Options options = CurrentOptions();
 options.env = env_;

File diff suppressed because it is too large

View File

@@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
 {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
 {"DBImpl::BackgroundCompaction:Start",
 "DBImplFollower::TryCatchupWithLeader:Begin2"},
-{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
+{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
+"Begin1",
 "DBImpl::BackgroundCompaction:BeforeCompaction"},
 {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
-"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
+"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
+"Begin2"},
 {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
 });
 SyncPoint::GetInstance()->EnableProcessing();
@@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
 SyncPoint::GetInstance()->LoadDependency({
 {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
 {"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
-{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
+{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
+"Begin1",
 "Leader::Done"},
 {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
-"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
+"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
+"Begin2"},
 {"DBImplFollower::TryCatchupWithLeader:End",
 "Follower::WaitForCatchup:1"},
 });

View File

@@ -17,6 +17,7 @@
 #include <cstdio>
 #include <map>
 #include <memory>
+#include <optional>
 #include <set>
 #include <sstream>
 #include <stdexcept>
@@ -2475,7 +2476,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
 RecordTick(stats_, MEMTABLE_HIT);
 }
 }
-if (!done && !s.ok() && !s.IsMergeInProgress()) {
+if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) {
+assert(done);
 ReturnAndCleanupSuperVersion(cfd, sv);
 return s;
 }
@@ -3141,10 +3143,11 @@ Status DBImpl::MultiGetImpl(
 StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
 assert(sorted_keys);
+assert(start_key + num_keys <= sorted_keys->size());
 // Clear the timestamps for returning results so that we can distinguish
 // between tombstone or key that has never been written
-for (auto* kctx : *sorted_keys) {
-assert(kctx);
+for (size_t i = start_key; i < start_key + num_keys; ++i) {
+KeyContext* kctx = (*sorted_keys)[i];
 if (kctx->timestamp) {
 kctx->timestamp->clear();
 }
@@ -5240,6 +5243,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 Env* env = soptions.env;
 std::vector<std::string> filenames;
 bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
+auto sfm = static_cast_with_check<SstFileManagerImpl>(
+options.sst_file_manager.get());
+// Allocate a separate trash bucket to be used by all the to be deleted
+// files, so we can later wait for this bucket to be empty before return.
+std::optional<int32_t> bucket;
+if (sfm) {
+bucket = sfm->NewTrashBucket();
+}
 // Reset the logger because it holds a handle to the
 // log file and prevents cleanup and directory removal
@@ -5251,6 +5262,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 /*IODebugContext*=*/nullptr)
 .PermitUncheckedError();
+std::set<std::string> paths_to_delete;
 FileLock* lock;
 const std::string lockname = LockFileName(dbname);
 Status result = env->LockFile(lockname, &lock);
@@ -5267,10 +5279,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 del = DestroyDB(path_to_delete, options);
 } else if (type == kTableFile || type == kWalFile ||
 type == kBlobFile) {
-del = DeleteDBFile(
-&soptions, path_to_delete, dbname,
-/*force_bg=*/false,
-/*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
+del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
+/*force_bg=*/false,
+/*force_fg=*/false, bucket);
 } else {
 del = env->DeleteFile(path_to_delete);
 }
@@ -5279,6 +5290,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 }
 }
 }
+paths_to_delete.insert(dbname);
 std::set<std::string> paths;
 for (const DbPath& db_path : options.db_paths) {
@@ -5300,18 +5312,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 (type == kTableFile ||
 type == kBlobFile)) { // Lock file will be deleted at end
 std::string file_path = path + "/" + fname;
-Status del = DeleteDBFile(&soptions, file_path, dbname,
-/*force_bg=*/false, /*force_fg=*/false);
+Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
+/*force_bg=*/false,
+/*force_fg=*/false, bucket);
 if (!del.ok() && result.ok()) {
 result = del;
 }
 }
 }
-// TODO: Should we return an error if we cannot delete the directory?
-env->DeleteDir(path).PermitUncheckedError();
 }
 }
+paths_to_delete.merge(paths);
 std::vector<std::string> walDirFiles;
 std::string archivedir = ArchivalDirectory(dbname);
 bool wal_dir_exists = false;
@@ -5335,46 +5348,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
 // Delete archival files.
 for (const auto& file : archiveFiles) {
 if (ParseFileName(file, &number, &type) && type == kWalFile) {
-Status del =
-DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
-/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+Status del = DeleteUnaccountedDBFile(
+&soptions, archivedir + "/" + file, archivedir,
+/*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
 if (!del.ok() && result.ok()) {
 result = del;
 }
 }
 }
-// Ignore error in case dir contains other files
-env->DeleteDir(archivedir).PermitUncheckedError();
+paths_to_delete.insert(archivedir);
 }
 // Delete log files in the WAL dir
 if (wal_dir_exists) {
 for (const auto& file : walDirFiles) {
 if (ParseFileName(file, &number, &type) && type == kWalFile) {
-Status del =
-DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
-soptions.wal_dir, /*force_bg=*/false,
-/*force_fg=*/!wal_in_db_path);
+Status del = DeleteUnaccountedDBFile(
+&soptions, LogFileName(soptions.wal_dir, number),
+soptions.wal_dir, /*force_bg=*/false,
+/*force_fg=*/!wal_in_db_path, bucket);
 if (!del.ok() && result.ok()) {
 result = del;
 }
 }
 }
-// Ignore error in case dir contains other files
-env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
+paths_to_delete.insert(soptions.wal_dir);
 }
 // Ignore error since state is already gone
 env->UnlockFile(lock).PermitUncheckedError();
 env->DeleteFile(lockname).PermitUncheckedError();
+// Make sure trash files are all cleared before return.
+if (sfm && bucket.has_value()) {
+sfm->WaitForEmptyTrashBucket(bucket.value());
+}
 // sst_file_manager holds a ref to the logger. Make sure the logger is
 // gone before trying to remove the directory.
 soptions.sst_file_manager.reset();
 // Ignore error in case dir contains other files
-env->DeleteDir(dbname).PermitUncheckedError();
-;
+for (const auto& path_to_delete : paths_to_delete) {
+env->DeleteDir(path_to_delete).PermitUncheckedError();
+}
 }
 return result;
 }
@@ -5820,11 +5836,6 @@ Status DBImpl::IngestExternalFiles(
 "write_global_seqno is deprecated and does not work with "
 "allow_db_generated_files.");
 }
-if (ingest_opts.move_files) {
-return Status::NotSupported(
-"Options move_files and allow_db_generated_files are not "
-"compatible.");
-}
 }
 }

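The DestroyDB changes above route SST/blob/WAL deletions through a per-call trash bucket on the SstFileManager and wait for that bucket to drain before returning. Here is a minimal sketch of exercising that path from application code; the path and deletion rate are illustrative assumptions, while NewSstFileManager, SetDeleteRateBytesPerSecond, and DestroyDB are existing public APIs.

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Options options;
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  // Rate-limit deletions to ~1MB/s. With this configured, DestroyDB deletes
  // table/blob/WAL files through the SstFileManager's trash mechanism and
  // returns only after its private trash bucket has emptied.
  options.sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024);

  rocksdb::Status s =
      rocksdb::DestroyDB("/tmp/rocksdb_destroydb_example", options);
  return s.ok() ? 0 : 1;
}
```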
View File

@@ -1226,6 +1226,8 @@ class DBImpl : public DB {
 return logs_.back().number;
 }
+void TEST_DeleteObsoleteFiles();
 const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
 return files_grabbed_for_purge_;
 }

View File

@@ -314,6 +314,11 @@ const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
 return error_handler_.GetFilesToQuarantine();
 }
+void DBImpl::TEST_DeleteObsoleteFiles() {
+InstrumentedMutexLock l(&mutex_);
+DeleteObsoleteFiles();
+}
 size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
 InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
 return EstimateInMemoryStatsHistorySize();

View File

@@ -970,7 +970,9 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
 }
 // Persist it to IDENTITY file if allowed
 if (!read_only) {
-s = SetIdentityFile(write_options, env_, dbname_, db_id_);
+s = SetIdentityFile(write_options, env_, dbname_,
+immutable_db_options_.metadata_write_temperature,
+db_id_);
 }
 return s;
 }

View File

@@ -295,7 +295,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
 Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
 VersionEdit new_db;
 const WriteOptions write_options(Env::IOActivity::kDBOpen);
-Status s = SetIdentityFile(write_options, env_, dbname_);
+Status s = SetIdentityFile(write_options, env_, dbname_,
+immutable_db_options_.metadata_write_temperature);
 if (!s.ok()) {
 return s;
 }
@@ -319,6 +320,12 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
 }
 std::unique_ptr<FSWritableFile> file;
 FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+// DB option takes precedence when not kUnknown
+if (immutable_db_options_.metadata_write_temperature !=
+Temperature::kUnknown) {
+file_options.temperature =
+immutable_db_options_.metadata_write_temperature;
+}
 s = NewWritableFile(fs_.get(), manifest, &file, file_options);
 if (!s.ok()) {
 return s;
@@ -344,6 +351,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
 if (s.ok()) {
 // Make "CURRENT" file that points to the new manifest file.
 s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
+immutable_db_options_.metadata_write_temperature,
 directories_.GetDbDir());
 if (new_filenames) {
 new_filenames->emplace_back(
@@ -530,6 +538,12 @@ Status DBImpl::Recover(
 /*no_error_if_files_missing=*/false, is_retry,
 &desc_status);
 desc_status.PermitUncheckedError();
+if (is_retry) {
+RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
+if (desc_status.ok()) {
+RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
+}
+}
 if (can_retry) {
 // If we're opening for the first time and the failure is likely due to
 // a corrupt MANIFEST file (could result in either the log::Reader
@@ -1930,6 +1944,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
 BuildDBOptions(immutable_db_options_, mutable_db_options_);
 FileOptions opt_file_options =
 fs_->OptimizeForLogWrite(file_options_, db_options);
+// DB option takes precedence when not kUnknown
+if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
+opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
+}
 std::string wal_dir = immutable_db_options_.GetWalDir();
 std::string log_fname = LogFileName(wal_dir, log_file_num);

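The hunks above thread a metadata write temperature into IDENTITY/CURRENT/MANIFEST creation and a WAL write temperature into WAL creation, with the DB option taking precedence when it is not kUnknown. A minimal sketch, assuming `DBOptions::metadata_write_temperature` and `DBOptions::wal_write_temperature` as the public counterparts of the immutable options referenced here; the path and temperature values are illustrative only.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // When not Temperature::kUnknown, these override the FileSystem-optimized
  // FileOptions for metadata files (MANIFEST/CURRENT/IDENTITY) and new WALs.
  options.metadata_write_temperature = rocksdb::Temperature::kCold;
  options.wal_write_temperature = rocksdb::Temperature::kWarm;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_temperature_example", &db);
  delete db;
  return s.ok() ? 0 : 1;
}
```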
View File

@@ -969,21 +969,17 @@ Status DBImpl::WriteImplWALOnly(
 assert(w.state == WriteThread::STATE_GROUP_LEADER);
 if (publish_last_seq == kDoPublishLastSeq) {
-Status status;
 // Currently we only use kDoPublishLastSeq in unordered_write
 assert(immutable_db_options_.unordered_write);
-WriteContext write_context;
-if (error_handler_.IsDBStopped()) {
-status = error_handler_.GetBGError();
-}
 // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
 // without paying the cost of obtaining the mutex.
-if (status.ok()) {
-LogContext log_context;
-status = PreprocessWrite(write_options, &log_context, &write_context);
-WriteStatusCheckOnLocked(status);
-}
+LogContext log_context;
+WriteContext write_context;
+Status status =
+PreprocessWrite(write_options, &log_context, &write_context);
+WriteStatusCheckOnLocked(status);
 if (!status.ok()) {
 WriteThread::WriteGroup write_group;
 write_thread->EnterAsBatchGroupLeader(&w, &write_group);

View File

@@ -705,6 +705,7 @@ class DBIOCorruptionTest
 DBIOCorruptionTest() : DBIOFailureTest() {
 BlockBasedTableOptions bbto;
 options_ = CurrentOptions();
+options_.statistics = CreateDBStatistics();
 base_env_ = env_;
 EXPECT_NE(base_env_, nullptr);
@@ -727,6 +728,8 @@ class DBIOCorruptionTest
 Status ReopenDB() { return TryReopen(options_); }
+Statistics* stats() { return options_.statistics.get(); }
 protected:
 std::unique_ptr<Env> env_guard_;
 std::shared_ptr<CorruptionFS> fs_;
@@ -749,8 +752,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
 if (std::get<2>(GetParam())) {
 ASSERT_OK(s);
 ASSERT_EQ(val, "val1");
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 } else {
 ASSERT_TRUE(s.IsCorruption());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 }
@@ -773,8 +780,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
 }
 if (std::get<2>(GetParam())) {
 ASSERT_OK(iter->status());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 } else {
 ASSERT_TRUE(iter->status().IsCorruption());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 delete iter;
 }
@@ -799,9 +810,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
 if (std::get<2>(GetParam())) {
 ASSERT_EQ(values[0].ToString(), "val1");
 ASSERT_EQ(values[1].ToString(), "val2");
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 } else {
 ASSERT_TRUE(statuses[0].IsCorruption());
 ASSERT_TRUE(statuses[1].IsCorruption());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 }
@@ -818,6 +833,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
 Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
 if (std::get<2>(GetParam())) {
 ASSERT_OK(s);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 std::string val;
 ReadOptions ro;
@@ -826,6 +844,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
 ASSERT_EQ(val, "val1");
 } else {
 ASSERT_TRUE(s.IsCorruption());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 }
@@ -838,6 +857,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
 Status s = Flush();
 if (std::get<2>(GetParam())) {
 ASSERT_OK(s);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 std::string val;
 ReadOptions ro;
@@ -846,6 +868,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
 ASSERT_EQ(val, "val1");
 } else {
 ASSERT_NOK(s);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 }
@@ -862,8 +885,12 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
 if (std::get<2>(GetParam())) {
 ASSERT_OK(ReopenDB());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
+1);
 } else {
 ASSERT_EQ(ReopenDB(), Status::Corruption());
+ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
 }
 SyncPoint::GetInstance()->DisableProcessing();
 }

View File

@@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest {
 DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
 protected:
+const size_t kValueLenOffset = 12;
 // Indices in the memtable entry that we will not corrupt.
 // For memtable entry format, see comments in MemTable::Add().
 // We do not corrupt key length and value length fields in this test
 // case since it causes segfault and ASAN will complain.
 // For this test case, key and value are all of length 3, so
 // key length field is at index 0 and value length field is at index 12.
-const std::set<size_t> index_not_to_corrupt{0, 12};
+const std::set<size_t> index_not_to_corrupt{0, kValueLenOffset};
 void SkipNotToCorruptEntry() {
 if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
@@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
 buf[corrupt_byte_offset_] += corrupt_byte_addend_;
 ++corrupt_byte_offset_;
 });
+// Corrupt value only so that MultiGet below can find the key.
+corrupt_byte_offset_ = kValueLenOffset + 1;
 SyncPoint::GetInstance()->EnableProcessing();
 Options options = CurrentOptions();
 options.memtable_protection_bytes_per_key =
@@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
 options.merge_operator = MergeOperators::CreateStringAppendOperator();
 }
+std::string key = "key";
 SkipNotToCorruptEntry();
 while (MoreBytesToCorrupt()) {
 Reopen(options);
 ASSERT_OK(ExecuteWrite(nullptr));
 std::string val;
-ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
+ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption());
+std::vector<std::string> vals = {val};
+std::vector<Status> statuses = db_->MultiGet(
+ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr);
+ASSERT_TRUE(statuses[0].IsCorruption());
 Destroy(options);
 SkipNotToCorruptEntry();
 }

View File

@@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
 }
 }
+TEST_F(DBMemTableTest, IntegrityChecks) {
+// We insert keys key000000, key000001 and key000002 into skiplist at fixed
+// height 1 (smallest height). Then we corrupt the second key to aey000001 to
+// make it smaller. With `paranoid_memory_checks` set to true, if the
+// skip list sees key000000 and then aey000001, then it will report out of
+// order keys with corruption status. With `paranoid_memory_checks` set
+// to false, read/scan may return wrong results.
+for (bool allow_data_in_error : {false, true}) {
+Options options = CurrentOptions();
+options.allow_data_in_errors = allow_data_in_error;
+options.paranoid_memory_checks = true;
+DestroyAndReopen(options);
+SyncPoint::GetInstance()->SetCallBack(
+"InlineSkipList::RandomHeight::height", [](void* h) {
+auto height_ptr = static_cast<int*>(h);
+*height_ptr = 1;
+});
+SyncPoint::GetInstance()->EnableProcessing();
+ASSERT_OK(Put(Key(0), "val0"));
+ASSERT_OK(Put(Key(2), "val2"));
+// p will point to the buffer for encoded key000001
+char* p = nullptr;
+SyncPoint::GetInstance()->SetCallBack(
+"MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
+p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
+});
+ASSERT_OK(Put(Key(1), "val1"));
+SyncPoint::GetInstance()->DisableProcessing();
+SyncPoint::GetInstance()->ClearAllCallBacks();
+ASSERT_TRUE(p);
+// Offset 0 is key size, key bytes start at offset 1.
+// "key000001 -> aey000001"
+p[1] = 'a';
+ReadOptions rops;
+std::string val;
+Status s = db_->Get(rops, Key(1), &val);
+ASSERT_TRUE(s.IsCorruption());
+std::string key0 = Slice(Key(0)).ToString(true);
+ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+// Without `paranoid_memory_checks`, NotFound will be returned.
+// This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
+// If we remove the assertion, this passes.
+// ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
+std::vector<std::string> vals;
+std::vector<Status> statuses = db_->MultiGet(
+rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
+ASSERT_TRUE(statuses[0].IsCorruption());
+ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+ASSERT_OK(iter->status());
+iter->Seek(Key(1));
+ASSERT_TRUE(iter->status().IsCorruption());
+ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+iter->Seek(Key(0));
+ASSERT_TRUE(iter->Valid());
+ASSERT_OK(iter->status());
+// iterating through skip list at height at 1 should catch out-of-order keys
+iter->Next();
+ASSERT_TRUE(iter->status().IsCorruption());
+ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+ASSERT_FALSE(iter->Valid());
+iter->SeekForPrev(Key(2));
+ASSERT_TRUE(iter->status().IsCorruption());
+ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+// Internally DB Iter will iterate backwards (call Prev()) after
+// SeekToLast() to find the correct internal key with the last user key.
+// Prev() will do integrity checks and catch corruption.
+iter->SeekToLast();
+ASSERT_TRUE(iter->status().IsCorruption());
+ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
+allow_data_in_error);
+ASSERT_FALSE(iter->Valid());
+}
+}
 } // namespace ROCKSDB_NAMESPACE
 int main(int argc, char** argv) {

View File

@ -507,6 +507,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ASSERT_EQ(files_deleted, 0); ASSERT_EQ(files_deleted, 0);
ASSERT_EQ(files_scheduled_to_delete, 0); ASSERT_EQ(files_scheduled_to_delete, 0);
Close(); Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_deleted, blob_files.size()); ASSERT_EQ(files_deleted, blob_files.size());
ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
@ -649,6 +666,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
} }
Close(); Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
sfm->WaitForEmptyTrash(); sfm->WaitForEmptyTrash();
ASSERT_EQ(files_deleted, 5); ASSERT_EQ(files_deleted, 5);
@ -883,8 +917,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
// Create 4 files in L0 // Create 4 files in L0
for (char v = 'a'; v <= 'd'; v++) { for (char v = 'a'; v <= 'd'; v++) {
if (v == 'c') { if (v == 'c') {
// Maximize the change that the last log file will be preserved in trash // Maximize the chance that the last log file will be preserved in trash
// before restarting the DB. // before restarting the DB. (Enable slow deletion but at a very slow
// deletion rate)
// We have to set this on the 2nd to last file for it to delay deletion // We have to set this on the 2nd to last file for it to delay deletion
// on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash()) // on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
options.sst_file_manager->SetDeleteRateBytesPerSecond(1); options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
@ -1902,6 +1937,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
ASSERT_EQ(files_deleted, 1); ASSERT_EQ(files_deleted, 1);
Close(); Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_scheduled_to_delete, 4); ASSERT_EQ(files_scheduled_to_delete, 4);
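The three hunks above install the same pair of SyncPoint callbacks so each test can count how many blob files get scheduled for deletion versus actually deleted when the DB is destroyed. A minimal sketch of that counting pattern in isolation follows; the sync-point names are taken from the hunks above, the suffix check stands in for the EndsWith helper, and the surrounding test scaffolding is assumed rather than shown.

#include <atomic>
#include <string>

#include "test_util/sync_point.h"

// Count ".blob" deletions reported through the two sync points used above.
void InstallBlobDeletionCounters(std::atomic<int>* scheduled,
                                 std::atomic<int>* deleted) {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  auto is_blob = [](const std::string& path) {
    return path.size() >= 5 && path.compare(path.size() - 5, 5, ".blob") == 0;
  };
  sp->SetCallBack("SstFileManagerImpl::ScheduleUnaccountedFileDeletion",
                  [=](void* arg) {
                    const auto* path = static_cast<const std::string*>(arg);
                    if (is_blob(*path)) {
                      scheduled->fetch_add(1);
                    }
                  });
  sp->SetCallBack("DeleteScheduler::OnDeleteFile", [=](void* arg) {
    const auto* path = static_cast<const std::string*>(arg);
    if (is_blob(*path)) {
      deleted->fetch_add(1);
    }
  });
  sp->EnableProcessing();  // callbacks only fire while processing is enabled
}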

View File

@ -10,6 +10,7 @@
#include <atomic> #include <atomic>
#include <cstdlib> #include <cstdlib>
#include <functional> #include <functional>
#include <iostream>
#include <memory> #include <memory>
#include "db/db_test_util.h" #include "db/db_test_util.h"
@ -26,6 +27,7 @@
#include "rocksdb/utilities/replayer.h" #include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h" #include "rocksdb/wal_filter.h"
#include "test_util/testutil.h" #include "test_util/testutil.h"
#include "util/defer.h"
#include "util/random.h" #include "util/random.h"
#include "utilities/fault_injection_env.h" #include "utilities/fault_injection_env.h"
@ -6544,6 +6546,235 @@ TEST_P(RenameCurrentTest, Compaction) {
ASSERT_EQ("d_value", Get("d")); ASSERT_EQ("d_value", Get("d"));
} }
TEST_F(DBTest2, VariousFileTemperatures) {
constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;
struct MyTestFS : public FileTemperatureTestFS {
explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
: FileTemperatureTestFS(fs) {
Reset();
}
IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg) override {
IOStatus ios =
FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
if (ios.ok()) {
uint64_t number;
FileType type;
if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
if (type == kTableFile) {
// Not checked here
} else if (type == kWalFile) {
if (opts.temperature != expected_wal_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_wal_temperature]
<< std::endl;
assert(false);
}
} else if (type == kDescriptorFile) {
if (opts.temperature != expected_manifest_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
                    << temperature_to_string[expected_manifest_temperature]
<< std::endl;
assert(false);
}
} else if (opts.temperature != expected_other_metadata_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
                  << temperature_to_string[expected_other_metadata_temperature]
<< std::endl;
assert(false);
}
UpdateCount(type, 1);
}
}
return ios;
}
IOStatus RenameFile(const std::string& src, const std::string& dst,
const IOOptions& options,
IODebugContext* dbg) override {
IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
if (ios.ok()) {
uint64_t number;
FileType src_type;
FileType dst_type;
assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
UpdateCount(src_type, -1);
UpdateCount(dst_type, 1);
}
return ios;
}
void UpdateCount(FileType type, int delta) {
size_t i = static_cast<size_t>(type);
assert(i < kNumberFileTypes);
counts[i].FetchAddRelaxed(delta);
}
std::map<FileType, size_t> PopCounts() {
std::map<FileType, size_t> ret;
for (size_t i = 0; i < kNumberFileTypes; ++i) {
int c = counts[i].ExchangeRelaxed(0);
if (c > 0) {
ret[static_cast<FileType>(i)] = c;
}
}
return ret;
}
FileOptions OptimizeForLogWrite(
const FileOptions& file_options,
const DBOptions& /*db_options*/) const override {
FileOptions opts = file_options;
if (optimize_wal_temperature != Temperature::kUnknown) {
opts.temperature = optimize_wal_temperature;
}
return opts;
}
FileOptions OptimizeForManifestWrite(
const FileOptions& file_options) const override {
FileOptions opts = file_options;
if (optimize_manifest_temperature != Temperature::kUnknown) {
opts.temperature = optimize_manifest_temperature;
}
return opts;
}
void Reset() {
optimize_manifest_temperature = Temperature::kUnknown;
optimize_wal_temperature = Temperature::kUnknown;
expected_manifest_temperature = Temperature::kUnknown;
expected_other_metadata_temperature = Temperature::kUnknown;
expected_wal_temperature = Temperature::kUnknown;
for (auto& c : counts) {
c.StoreRelaxed(0);
}
}
Temperature optimize_manifest_temperature;
Temperature optimize_wal_temperature;
Temperature expected_manifest_temperature;
Temperature expected_other_metadata_temperature;
Temperature expected_wal_temperature;
std::array<RelaxedAtomic<int>, kNumberFileTypes> counts;
};
  // We don't have enough non-unknown temps to confidently tell, in a single
  // run, which specific setting caused a specific outcome, so randomize
  // instead. This is a reasonable work-around that avoids blowing up test
  // time. Only returns non-unknown temperatures.
auto RandomTemp = [] {
static std::vector<Temperature> temps = {
Temperature::kHot, Temperature::kWarm, Temperature::kCold};
return temps[Random::GetTLSInstance()->Uniform(
static_cast<int>(temps.size()))];
};
auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
for (bool use_optimize : {false, true}) {
std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl;
for (bool use_temp_options : {false, true}) {
std::cerr << "use_temp_options: " << std::to_string(use_temp_options)
<< std::endl;
Options options = CurrentOptions();
      // Currently required for last level temperature
options.compaction_style = kCompactionStyleUniversal;
options.env = env.get();
test_fs->Reset();
if (use_optimize) {
test_fs->optimize_manifest_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
test_fs->optimize_manifest_temperature;
test_fs->optimize_wal_temperature = RandomTemp();
test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
}
if (use_temp_options) {
options.metadata_write_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
options.metadata_write_temperature;
test_fs->expected_other_metadata_temperature =
options.metadata_write_temperature;
options.wal_write_temperature = RandomTemp();
test_fs->expected_wal_temperature = options.wal_write_temperature;
options.last_level_temperature = RandomTemp();
options.default_write_temperature = RandomTemp();
}
DestroyAndReopen(options);
Defer closer([&] { Close(); });
using FTC = std::map<FileType, size_t>;
// Files on DB startup
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kDescriptorFile, 2},
{kCurrentFile, 2},
{kIdentityFile, 1},
{kOptionsFile, 1}}));
// Temperature count map
using TCM = std::map<Temperature, size_t>;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({}));
ASSERT_OK(Put("foo", "1"));
ASSERT_OK(Put("bar", "1"));
ASSERT_OK(Flush());
ASSERT_OK(Put("foo", "2"));
ASSERT_OK(Put("bar", "2"));
ASSERT_OK(Flush());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.default_write_temperature, 2}}));
ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.last_level_temperature, 1}}));
ASSERT_OK(Put("foo", "3"));
ASSERT_OK(Put("bar", "3"));
ASSERT_OK(Flush());
// Just in memtable/WAL
ASSERT_OK(Put("dog", "3"));
{
TCM expected;
expected[options.default_write_temperature] += 1;
expected[options.last_level_temperature] += 1;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected);
}
// New files during operation
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}}));
Reopen(options);
// New files during re-open/recovery
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kTableFile, 1},
{kDescriptorFile, 1},
{kCurrentFile, 1},
{kOptionsFile, 1}}));
Destroy(options);
}
}
}
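The new test above randomizes several temperature-related options and checks, through a FileSystem wrapper, that the corresponding FileOptions temperature reaches NewWritableFile. For orientation, here is a small sketch of how an application might set the options the test exercises; the option names are taken from the diff above, and whether a given temperature is honored ultimately depends on the FileSystem implementation, which is exactly what the wrapper above verifies.

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options MakeTieredOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // SST files written by flushes and non-last-level compactions.
  options.default_write_temperature = ROCKSDB_NAMESPACE::Temperature::kWarm;
  // SST files placed on the last level (universal compaction in the test).
  options.last_level_temperature = ROCKSDB_NAMESPACE::Temperature::kCold;
  // MANIFEST/CURRENT/IDENTITY/OPTIONS files, as exercised above.
  options.metadata_write_temperature = ROCKSDB_NAMESPACE::Temperature::kHot;
  // WAL files.
  options.wal_write_temperature = ROCKSDB_NAMESPACE::Temperature::kHot;
  return options;
}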
TEST_F(DBTest2, LastLevelTemperature) { TEST_F(DBTest2, LastLevelTemperature) {
class TestListener : public EventListener { class TestListener : public EventListener {
public: public:

View File

@ -366,6 +366,11 @@ Options DBTestBase::GetOptions(
table_options.block_cache = NewLRUCache(/* too small */ 1); table_options.block_cache = NewLRUCache(/* too small */ 1);
} }
// Test anticipated new default as much as reasonably possible (and remove
// this code when obsolete)
assert(!table_options.decouple_partitioned_filters);
table_options.decouple_partitioned_filters = true;
bool can_allow_mmap = IsMemoryMappedAccessSupported(); bool can_allow_mmap = IsMemoryMappedAccessSupported();
switch (option_config) { switch (option_config) {
case kHashSkipList: case kHashSkipList:

View File

@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper {
return count; return count;
} }
std::map<Temperature, size_t> CountCurrentSstFilesByTemp() {
MutexLock lock(&mu_);
std::map<Temperature, size_t> ret;
for (const auto& e : current_sst_file_temperatures_) {
ret[e.second]++;
}
return ret;
}
void OverrideSstFileTemperature(uint64_t number, Temperature temp) { void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
MutexLock lock(&mu_); MutexLock lock(&mu_);
current_sst_file_temperatures_[number] = temp; current_sst_file_temperatures_[number] = temp;
@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper {
requested_sst_file_temperatures_; requested_sst_file_temperatures_;
std::map<uint64_t, Temperature> current_sst_file_temperatures_; std::map<uint64_t, Temperature> current_sst_file_temperatures_;
std::string GetFileName(const std::string& fname) { static std::string GetFileName(const std::string& fname) {
auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
// workaround only for Windows that the file path could contain both Windows // workaround only for Windows that the file path could contain both Windows
// FilePathSeparator and '/' // FilePathSeparator and '/'

View File

@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
options.num_levels = num_levels_; options.num_levels = num_levels_;
options.write_buffer_size = 105 << 10; // 105KB options.write_buffer_size = 105 << 10; // 105KB
options.arena_block_size = 4 << 10; options.arena_block_size = 4 << 10;
options.target_file_size_base = 32 << 10; // 32KB
// trigger compaction if there are >= 4 files // trigger compaction if there are >= 4 files
options.level0_file_num_compaction_trigger = 4; options.level0_file_num_compaction_trigger = 4;
KeepFilterFactory* filter = new KeepFilterFactory(true); KeepFilterFactory* filter = new KeepFilterFactory(true);

View File

@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
ASSERT_OK(dbfull()->SyncWAL()); ASSERT_OK(dbfull()->SyncWAL());
} }
TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
Options options = CurrentOptions();
options.max_write_buffer_number = 5;
options.track_and_verify_wals_in_manifest = true;
options.max_bgerror_resume_count = 0; // manual resume
options.recycle_log_file_num = 3;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
// Disable truncating recycled WALs to new size in posix env
// (approximating a crash)
SyncPoint::GetInstance()->SetCallBack(
"PosixWritableFile::Close",
[](void* arg) { *(static_cast<size_t*>(arg)) = 0; });
SyncPoint::GetInstance()->EnableProcessing();
// Re-open with desired options
DestroyAndReopen(options);
Defer closer([this]() { Close(); });
// Ensure WAL recycling wasn't sanitized away
ASSERT_EQ(db_->GetOptions().recycle_log_file_num,
options.recycle_log_file_num);
// Prepare external files for later ingestion
std::string sst_files_dir = dbname_ + "/sst_files/";
ASSERT_OK(DestroyDir(env_, sst_files_dir));
ASSERT_OK(env_->CreateDir(sst_files_dir));
std::string external_file1 = sst_files_dir + "file1.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file1));
ASSERT_OK(sst_file_writer.Put("external1", "ex1"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
std::string external_file2 = sst_files_dir + "file2.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file2));
ASSERT_OK(sst_file_writer.Put("external2", "ex2"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
// Populate some WALs to be recycled such that there will be extra data
// from an old incarnation of the WAL on recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->ContinueBackgroundWork());
ASSERT_OK(Flush());
ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(Flush());
// Verify expected log files (still there for recycling)
std::vector<FileAttributes> files;
int log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// (Re-used recipe) Generate two inactive WALs and one active WAL, with a
// gap in sequence numbers to interfere with recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("key2", "val2"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
// Need a gap in sequence numbers, so e.g. ingest external file
// with an open snapshot
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
}
ASSERT_OK(Put("key3", "val3"));
ASSERT_OK(db_->SyncWAL());
// Need an SST file that is logically after that WAL, so that dropping WAL
// data is not a valid point in time.
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
}
// Approximate a crash, with respect to recycled WAL data extending past
// the end of the current WAL data (see SyncPoint callback above)
Close();
// Verify recycled log files haven't been truncated
files.clear();
log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// Verify no data loss after reopen.
Reopen(options);
EXPECT_EQ("val1", Get("key1"));
EXPECT_EQ("val2", Get("key2")); // Passes because of adjacent seqnos
EXPECT_EQ("ex1", Get("external1"));
EXPECT_EQ("val3", Get("key3")); // <- ONLY FAILURE! (Not a point in time)
EXPECT_EQ("ex2", Get("external2"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBWALTest, SyncWalPartialFailure) { TEST_F(DBWALTest, SyncWalPartialFailure) {
class MyTestFileSystem : public FileSystemWrapper { class MyTestFileSystem : public FileSystemWrapper {
public: public:
@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
// * one inactive WAL, not synced, and // * one inactive WAL, not synced, and
// * one active WAL, not synced // * one active WAL, not synced
// with a single thread, to exercise as much logic as we reasonably can. // with a single thread, to exercise as much logic as we reasonably can.
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork()); ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1")); ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable()); ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->SyncWAL()); ASSERT_OK(db_->SyncWAL());

View File

@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
Close(); Close();
} }
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.avoid_flush_during_shutdown = true;
options.comparator = &test_cmp;
DestroyAndReopen(options);
Options options1 = CurrentOptions();
options1.env = env_;
options1.comparator = &test_cmp;
ColumnFamilyHandle* handle = nullptr;
Status s = db_->CreateColumnFamily(options1, "data", &handle);
ASSERT_OK(s);
std::string ts = Timestamp(1, 0);
WriteBatch wb(0, 0, 0, kTimestampSize);
ASSERT_OK(wb.Put("a", "value"));
ASSERT_OK(wb.Put(handle, "a", "value"));
const auto ts_sz_func = [kTimestampSize](uint32_t /*cf_id*/) {
return kTimestampSize;
};
ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
ASSERT_OK(db_->Write(WriteOptions(), &wb));
int num_keys = 2;
std::vector<Slice> keys;
std::vector<std::string> expected_values;
for (int i = 0; i < num_keys; i++) {
keys.push_back("a");
expected_values.push_back("value");
}
std::vector<ColumnFamilyHandle*> handles;
handles.push_back(db_->DefaultColumnFamily());
handles.push_back(handle);
{
Slice read_ts_slice(ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::vector<PinnableSlice> values;
values.resize(num_keys);
std::vector<Status> statuses;
statuses.resize(num_keys);
std::vector<std::string> timestamps;
timestamps.resize(num_keys);
db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(),
values.data(), timestamps.data(), statuses.data());
for (int i = 0; i < num_keys; i++) {
ASSERT_OK(statuses[i]);
ASSERT_EQ(expected_values[i], values[i].ToString());
ASSERT_EQ(ts, timestamps[i]);
}
}
delete handle;
Close();
}
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.env = env_; options.env = env_;

View File

@ -330,17 +330,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
// output : <user_provided_key> // output : <user_provided_key>
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
size_t ts_sz) { size_t ts_sz) {
Slice ret = internal_key; assert(internal_key.size() >= kNumInternalBytes + ts_sz);
ret.remove_suffix(kNumInternalBytes + ts_sz); return Slice(internal_key.data(),
return ret; internal_key.size() - (kNumInternalBytes + ts_sz));
} }
// input [user key]: <user_provided_key | ts> // input [user key]: <user_provided_key | ts>
// output: <user_provided_key> // output: <user_provided_key>
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
Slice ret = user_key; assert(user_key.size() >= ts_sz);
ret.remove_suffix(ts_sz); return Slice(user_key.data(), user_key.size() - ts_sz);
return ret;
} }
// input [user key]: <user_provided_key | ts> // input [user key]: <user_provided_key | ts>
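The dbformat.h hunk above swaps remove_suffix-based stripping for direct Slice construction guarded by a size assert. The same layout logic is sketched below with std::string_view standing in for Slice, assuming the key layout given in the comments above: user-provided key, then a user-defined timestamp of ts_sz bytes, then the fixed-size internal footer counted by kNumInternalBytes.

#include <cassert>
#include <cstddef>
#include <string_view>

constexpr size_t kNumInternalBytes = 8;  // seqno/type footer, as in dbformat.h

// <user_provided_key | ts | footer>  ->  <user_provided_key>
inline std::string_view ExtractUserProvidedKey(std::string_view internal_key,
                                               size_t ts_sz) {
  assert(internal_key.size() >= kNumInternalBytes + ts_sz);
  return internal_key.substr(
      0, internal_key.size() - (kNumInternalBytes + ts_sz));
}

// <user_provided_key | ts>  ->  <user_provided_key>
inline std::string_view StripTimestamp(std::string_view user_key,
                                       size_t ts_sz) {
  assert(user_key.size() >= ts_sz);
  return user_key.substr(0, user_key.size() - ts_sz);
}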

View File

@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
<< "comparator" << table_properties.comparator_name << "comparator" << table_properties.comparator_name
<< "user_defined_timestamps_persisted" << "user_defined_timestamps_persisted"
<< table_properties.user_defined_timestamps_persisted << table_properties.user_defined_timestamps_persisted
<< "key_largest_seqno" << table_properties.key_largest_seqno
<< "merge_operator" << table_properties.merge_operator_name << "merge_operator" << table_properties.merge_operator_name
<< "prefix_extractor_name" << "prefix_extractor_name"
<< table_properties.prefix_extractor_name << "property_collectors" << table_properties.prefix_extractor_name << "property_collectors"

View File

@ -114,7 +114,6 @@ Status ExternalSstFileIngestionJob::Prepare(
const std::string path_inside_db = TableFileName( const std::string path_inside_db = TableFileName(
cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
if (ingestion_options_.move_files) { if (ingestion_options_.move_files) {
assert(!ingestion_options_.allow_db_generated_files);
status = status =
fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
if (status.ok()) { if (status.ok()) {
@ -627,7 +626,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
DeleteInternalFiles(); DeleteInternalFiles();
consumed_seqno_count_ = 0; consumed_seqno_count_ = 0;
files_overlap_ = false; files_overlap_ = false;
} else if (status.ok() && ingestion_options_.move_files) { } else if (status.ok() && ingestion_options_.move_files &&
!ingestion_options_.allow_db_generated_files) {
// The files were moved and added successfully, remove original file links // The files were moved and added successfully, remove original file links
for (IngestedFileInfo& f : files_to_ingest_) { for (IngestedFileInfo& f : files_to_ingest_) {
Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
@ -914,9 +914,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
} else if (!iter->status().ok()) { } else if (!iter->status().ok()) {
return iter->status(); return iter->status();
} }
if (ingestion_options_.allow_db_generated_files) { SequenceNumber largest_seqno =
// Verify that all keys have seqno zero. table_reader.get()->GetTableProperties()->key_largest_seqno;
// TODO: store largest seqno in table property and validate it instead. // UINT64_MAX means unknown and the file is generated before table property
// `key_largest_seqno` is introduced.
if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
return Status::Corruption(
"External file has non zero largest sequence number " +
std::to_string(largest_seqno));
}
if (ingestion_options_.allow_db_generated_files &&
largest_seqno == UINT64_MAX) {
// Need to verify that all keys have seqno zero.
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Status pik_status = Status pik_status =
ParseInternalKey(iter->key(), &key, allow_data_in_errors); ParseInternalKey(iter->key(), &key, allow_data_in_errors);
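The ingestion hunk above first consults the new key_largest_seqno table property and only falls back to scanning every key when the property is absent (stored as UINT64_MAX by older files). That decision is condensed below into a free function; the helper name and signature are hypothetical, and only the property semantics come from the hunk.

#include <cstdint>
#include <string>

// Returns an empty string if the file may be ingested, otherwise an error
// message. Sets *need_full_key_scan when the property is missing and every
// key's sequence number must be verified to be zero, as in the loop above.
std::string CheckLargestSeqnoForIngestion(uint64_t key_largest_seqno,
                                          bool allow_db_generated_files,
                                          bool* need_full_key_scan) {
  *need_full_key_scan = false;
  if (key_largest_seqno != UINT64_MAX && key_largest_seqno > 0) {
    return "External file has non zero largest sequence number " +
           std::to_string(key_largest_seqno);
  }
  if (allow_db_generated_files && key_largest_seqno == UINT64_MAX) {
    *need_full_key_scan = true;  // property unknown: scan keys for seqno 0
  }
  return "";
}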

View File

@ -674,10 +674,8 @@ class SstFileWriterCollector : public TablePropertiesCollector {
Status Finish(UserCollectedProperties* properties) override { Status Finish(UserCollectedProperties* properties) override {
std::string count = std::to_string(count_); std::string count = std::to_string(count_);
*properties = UserCollectedProperties{ properties->insert({prefix_ + "_SstFileWriterCollector", "YES"});
{prefix_ + "_SstFileWriterCollector", "YES"}, properties->insert({prefix_ + "_Count", count});
{prefix_ + "_Count", count},
};
return Status::OK(); return Status::OK();
} }
@ -3727,13 +3725,14 @@ INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
std::make_tuple(true, true), std::make_tuple(true, true),
std::make_tuple(false, false))); std::make_tuple(false, false)));
class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase, class IngestDBGeneratedFileTest
public ::testing::WithParamInterface<bool> { : public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public: public:
IngestDBGeneratedFileTest() { IngestDBGeneratedFileTest() {
ingest_opts.allow_db_generated_files = true; ingest_opts.allow_db_generated_files = true;
ingest_opts.move_files = false; ingest_opts.move_files = std::get<0>(GetParam());
ingest_opts.verify_checksums_before_ingest = GetParam(); ingest_opts.verify_checksums_before_ingest = std::get<1>(GetParam());
ingest_opts.snapshot_consistency = false; ingest_opts.snapshot_consistency = false;
} }
@ -3742,9 +3741,16 @@ class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase,
}; };
INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest, INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest,
testing::Bool()); testing::Combine(testing::Bool(), testing::Bool()));
TEST_P(IngestDBGeneratedFileTest, FailureCase) { TEST_P(IngestDBGeneratedFileTest, FailureCase) {
if (encrypted_env_ && ingest_opts.move_files) {
// FIXME: should fail ingestion or support this combination.
ROCKSDB_GTEST_SKIP(
"Encrypted env and move_files do not work together, as we reopen the "
"file after linking it which appends an extra encryption prefix.");
return;
}
// Ingesting overlapping data should always fail. // Ingesting overlapping data should always fail.
do { do {
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
@ -3778,6 +3784,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
live_meta[0].relative_filename); live_meta[0].relative_filename);
// Ingesting a file whose boundary key has non-zero seqno. // Ingesting a file whose boundary key has non-zero seqno.
Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts); Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
// This error msg is from checking seqno of boundary keys.
ASSERT_TRUE( ASSERT_TRUE(
s.ToString().find("External file has non zero sequence number") != s.ToString().find("External file has non zero sequence number") !=
std::string::npos); std::string::npos);
@ -3824,10 +3831,9 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
live_meta[0].directory + "/" + live_meta[0].relative_filename; live_meta[0].directory + "/" + live_meta[0].relative_filename;
s = db_->IngestExternalFile(to_ingest_files, ingest_opts); s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
ASSERT_NOK(s); ASSERT_NOK(s);
ASSERT_TRUE( // This error msg is from checking largest seqno in table property.
s.ToString().find( ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
"External file has a key with non zero sequence number") != std::string::npos);
std::string::npos);
db_->ReleaseSnapshot(snapshot); db_->ReleaseSnapshot(snapshot);
} }
@ -3897,14 +3903,6 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
ASSERT_TRUE(s.ToString().find(err) != std::string::npos); ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
ASSERT_NOK(s); ASSERT_NOK(s);
ingest_opts.move_files = true;
s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
ingest_opts.move_files = false;
ASSERT_TRUE(
s.ToString().find("Options move_files and allow_db_generated_files are "
"not compatible") != std::string::npos);
ASSERT_NOK(s);
ingest_opts.snapshot_consistency = false; ingest_opts.snapshot_consistency = false;
ASSERT_OK(db_->IngestExternalFile(to_ingest_files, ingest_opts)); ASSERT_OK(db_->IngestExternalFile(to_ingest_files, ingest_opts));
db_->ReleaseSnapshot(snapshot); db_->ReleaseSnapshot(snapshot);
@ -3924,14 +3922,16 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
class IngestDBGeneratedFileTest2 class IngestDBGeneratedFileTest2
: public ExternalSSTFileTestBase, : public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> { public ::testing::WithParamInterface<
std::tuple<bool, bool, bool, bool, bool>> {
public: public:
IngestDBGeneratedFileTest2() = default; IngestDBGeneratedFileTest2() = default;
}; };
INSTANTIATE_TEST_CASE_P(VaryingOptions, IngestDBGeneratedFileTest2, INSTANTIATE_TEST_CASE_P(VaryingOptions, IngestDBGeneratedFileTest2,
testing::Combine(testing::Bool(), testing::Bool(), testing::Combine(testing::Bool(), testing::Bool(),
testing::Bool(), testing::Bool())); testing::Bool(), testing::Bool(),
testing::Bool()));
TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) { TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
// Use a separate column family to sort some data, generate multiple SST // Use a separate column family to sort some data, generate multiple SST
@ -3939,11 +3939,11 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
// to be ingested does not overlap with existing data. // to be ingested does not overlap with existing data.
IngestExternalFileOptions ingest_opts; IngestExternalFileOptions ingest_opts;
ingest_opts.allow_db_generated_files = true; ingest_opts.allow_db_generated_files = true;
ingest_opts.move_files = false;
ingest_opts.snapshot_consistency = std::get<0>(GetParam()); ingest_opts.snapshot_consistency = std::get<0>(GetParam());
ingest_opts.allow_global_seqno = std::get<1>(GetParam()); ingest_opts.allow_global_seqno = std::get<1>(GetParam());
ingest_opts.allow_blocking_flush = std::get<2>(GetParam()); ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam()); ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
ingest_opts.move_files = std::get<4>(GetParam());
do { do {
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
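The test-class changes above grow the gtest parameter tuples (move_files joins the ingestion parameters) and read each element back with std::get. A stripped-down sketch of that idiom, outside the RocksDB fixtures and with a hypothetical test name, is:

#include <tuple>

#include <gtest/gtest.h>

class MyTupleTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {};

// Runs the test body once for every combination of the two bools.
INSTANTIATE_TEST_CASE_P(AllCombos, MyTupleTest,
                        ::testing::Combine(::testing::Bool(),
                                           ::testing::Bool()));

TEST_P(MyTupleTest, ReadsParams) {
  const bool move_files = std::get<0>(GetParam());
  const bool verify_checksums = std::get<1>(GetParam());
  // In the tests above these flags feed IngestExternalFileOptions.
  (void)move_files;
  (void)verify_checksums;
}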

View File

@ -1156,6 +1156,11 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
// Find the newest user-defined timestamps from all the flushed memtables. // Find the newest user-defined timestamps from all the flushed memtables.
for (MemTable* m : mems_) { for (MemTable* m : mems_) {
Slice table_newest_udt = m->GetNewestUDT(); Slice table_newest_udt = m->GetNewestUDT();
// Empty memtables can be legitimately created and flushed, for example
// by error recovery flush attempts.
if (table_newest_udt.empty()) {
continue;
}
if (cutoff_udt_.empty() || if (cutoff_udt_.empty() ||
ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) { ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) {
if (!cutoff_udt_.empty()) { if (!cutoff_udt_.empty()) {

View File

@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test {
} }
void NewDB() { void NewDB() {
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
VersionEdit new_db; VersionEdit new_db;
new_db.SetLogNumber(0); new_db.SetLogNumber(0);
@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test {
} }
ASSERT_OK(s); ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file. // Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown, nullptr);
ASSERT_OK(s); ASSERT_OK(s);
} }

View File

@ -354,13 +354,13 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
} }
TEST_F(EventListenerTest, MultiCF) { TEST_F(EventListenerTest, MultiCF) {
Options options;
options.env = CurrentOptions().env;
options.write_buffer_size = k110KB;
#ifdef ROCKSDB_USING_THREAD_STATUS
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
for (auto atomic_flush : {false, true}) { for (auto atomic_flush : {false, true}) {
Options options;
options.env = CurrentOptions().env;
options.write_buffer_size = k110KB;
#ifdef ROCKSDB_USING_THREAD_STATUS
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
options.atomic_flush = atomic_flush; options.atomic_flush = atomic_flush;
options.create_if_missing = true; options.create_if_missing = true;
DestroyAndReopen(options); DestroyAndReopen(options);

View File

@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
statistics(ioptions.stats), statistics(ioptions.stats),
merge_operator(ioptions.merge_operator.get()), merge_operator(ioptions.merge_operator.get()),
info_log(ioptions.logger), info_log(ioptions.logger),
allow_data_in_errors(ioptions.allow_data_in_errors),
protection_bytes_per_key( protection_bytes_per_key(
mutable_cf_options.memtable_protection_bytes_per_key) {} mutable_cf_options.memtable_protection_bytes_per_key),
allow_data_in_errors(ioptions.allow_data_in_errors),
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
MemTable::MemTable(const InternalKeyComparator& cmp, MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableOptions& ioptions, const ImmutableOptions& ioptions,
@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
: bloom_(nullptr), : bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_), prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_), comparator_(mem.comparator_),
valid_(false),
seqno_to_time_mapping_(seqno_to_time_mapping), seqno_to_time_mapping_(seqno_to_time_mapping),
arena_mode_(arena != nullptr),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
status_(Status::OK()), status_(Status::OK()),
logger_(mem.moptions_.info_log), logger_(mem.moptions_.info_log),
ts_sz_(mem.ts_sz_) { ts_sz_(mem.ts_sz_),
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
valid_(false),
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support),
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
if (use_range_del_table) { if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena); iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
} else { } else {
delete iter_; delete iter_;
} }
status_.PermitUncheckedError();
} }
#ifndef NDEBUG #ifndef NDEBUG
@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif #endif
bool Valid() const override { return valid_ && status_.ok(); } bool Valid() const override {
// If inner iter_ is not valid, then this iter should also not be valid.
assert(iter_->Valid() || !(valid_ && status_.ok()));
return valid_ && status_.ok();
}
void Seek(const Slice& k) override { void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time); PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1); PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) { if (bloom_) {
// iterator should only use prefix bloom filter // iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
} }
} }
} }
iter_->Seek(k, nullptr); if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
} }
void SeekForPrev(const Slice& k) override { void SeekForPrev(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time); PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1); PERF_COUNTER_ADD(seek_on_memtable_count, 1);
status_ = Status::OK();
if (bloom_) { if (bloom_) {
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) { if (prefix_extractor_->InDomain(user_k_without_ts)) {
@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
} }
} }
} }
iter_->Seek(k, nullptr); if (paranoid_memory_checks_) {
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
} else {
iter_->Seek(k, nullptr);
}
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
if (!Valid() && status().ok()) { if (!Valid() && status().ok()) {
@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
} }
} }
void SeekToFirst() override { void SeekToFirst() override {
status_ = Status::OK();
iter_->SeekToFirst(); iter_->SeekToFirst();
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
} }
void SeekToLast() override { void SeekToLast() override {
status_ = Status::OK();
iter_->SeekToLast(); iter_->SeekToLast();
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
void Next() override { void Next() override {
PERF_COUNTER_ADD(next_on_memtable_count, 1); PERF_COUNTER_ADD(next_on_memtable_count, 1);
assert(Valid()); assert(Valid());
iter_->Next(); if (paranoid_memory_checks_) {
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); status_ = iter_->NextAndValidate(allow_data_in_error);
} else {
iter_->Next();
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
}
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
} }
@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
void Prev() override { void Prev() override {
PERF_COUNTER_ADD(prev_on_memtable_count, 1); PERF_COUNTER_ADD(prev_on_memtable_count, 1);
assert(Valid()); assert(Valid());
iter_->Prev(); if (paranoid_memory_checks_) {
status_ = iter_->PrevAndValidate(allow_data_in_error);
} else {
iter_->Prev();
}
valid_ = iter_->Valid(); valid_ = iter_->Valid();
VerifyEntryChecksum(); VerifyEntryChecksum();
} }
@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
const SliceTransform* const prefix_extractor_; const SliceTransform* const prefix_extractor_;
const MemTable::KeyComparator comparator_; const MemTable::KeyComparator comparator_;
MemTableRep::Iterator* iter_; MemTableRep::Iterator* iter_;
bool valid_;
// The seqno to time mapping is owned by the SuperVersion. // The seqno to time mapping is owned by the SuperVersion.
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_; UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
bool arena_mode_;
bool value_pinned_;
uint32_t protection_bytes_per_key_;
Status status_; Status status_;
Logger* logger_; Logger* logger_;
size_t ts_sz_; size_t ts_sz_;
uint32_t protection_bytes_per_key_;
bool valid_;
bool value_pinned_;
bool arena_mode_;
const bool paranoid_memory_checks_;
const bool allow_data_in_error;
void VerifyEntryChecksum() { void VerifyEntryChecksum() {
if (protection_bytes_per_key_ > 0 && Valid()) { if (protection_bytes_per_key_ > 0 && Valid()) {
@ -933,6 +964,8 @@ static bool SaveValue(void* arg, const char* entry) {
Saver* s = static_cast<Saver*>(arg); Saver* s = static_cast<Saver*>(arg);
assert(s != nullptr); assert(s != nullptr);
assert(!s->value || !s->columns); assert(!s->value || !s->columns);
assert(!*(s->found_final_value));
assert(s->status->ok() || s->status->IsMergeInProgress());
MergeContext* merge_context = s->merge_context; MergeContext* merge_context = s->merge_context;
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq; SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
@ -966,6 +999,7 @@ static bool SaveValue(void* arg, const char* entry) {
*(s->status) = MemTable::VerifyEntryChecksum( *(s->status) = MemTable::VerifyEntryChecksum(
entry, s->protection_bytes_per_key, s->allow_data_in_errors); entry, s->protection_bytes_per_key, s->allow_data_in_errors);
if (!s->status->ok()) { if (!s->status->ok()) {
*(s->found_final_value) = true;
ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState()); ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
// Memtable entry corrupted // Memtable entry corrupted
return false; return false;
@ -1231,6 +1265,7 @@ static bool SaveValue(void* arg, const char* entry) {
". "); ". ");
msg.append("seq: " + std::to_string(seq) + "."); msg.append("seq: " + std::to_string(seq) + ".");
} }
*(s->found_final_value) = true;
*(s->status) = Status::Corruption(msg.c_str()); *(s->status) = Status::Corruption(msg.c_str());
return false; return false;
} }
@ -1310,8 +1345,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
// No change to value, since we have not yet found a Put/Delete // No change to value, since we have not yet found a Put/Delete
// Propagate corruption error // Propagate corruption error
if (!found_final_value && merge_in_progress && !s->IsCorruption()) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(); if (s->ok()) {
*s = Status::MergeInProgress();
} else {
assert(s->IsMergeInProgress());
}
} }
PERF_COUNTER_ADD(get_from_memtable_count, 1); PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value; return found_final_value;
@ -1347,7 +1386,19 @@ void MemTable::GetFromTable(const LookupKey& key,
saver.do_merge = do_merge; saver.do_merge = do_merge;
saver.allow_data_in_errors = moptions_.allow_data_in_errors; saver.allow_data_in_errors = moptions_.allow_data_in_errors;
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
table_->Get(key, &saver, SaveValue);
if (!moptions_.paranoid_memory_checks) {
table_->Get(key, &saver, SaveValue);
} else {
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
moptions_.allow_data_in_errors);
if (check_s.IsCorruption()) {
*(saver.status) = check_s;
// Should stop searching the LSM.
*(saver.found_final_value) = true;
}
}
assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
*seq = saver.seq; *seq = saver.seq;
} }
@ -1421,10 +1472,19 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
&found_final_value, &merge_in_progress); &found_final_value, &merge_in_progress);
if (!found_final_value && merge_in_progress) { if (!found_final_value && merge_in_progress) {
*(iter->s) = Status::MergeInProgress(); if (iter->s->ok()) {
*(iter->s) = Status::MergeInProgress();
} else {
assert(iter->s->IsMergeInProgress());
}
} }
if (found_final_value) { if (found_final_value ||
(!iter->s->ok() && !iter->s->IsMergeInProgress())) {
// `found_final_value` should be set if an error/corruption occurs.
// The check on iter->s is just there in case GetFromTable() did not
// set `found_final_value` properly.
assert(found_final_value);
if (iter->value) { if (iter->value) {
iter->value->PinSelf(); iter->value->PinSelf();
range->AddValueSize(iter->value->size()); range->AddValueSize(iter->value->size());
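The memtable changes above route Get, MultiGet, and iterator movement through validating variants (SeekAndValidate, NextAndValidate, PrevAndValidate, GetAndValidate) whenever paranoid_memory_checks is set on the column family. A brief sketch of enabling the option from application code follows; the option name is taken from the diff, and since the diff reads it from the mutable CF options it is also shown being toggled via SetOptions, though exact runtime mutability should be treated as an assumption here.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void OpenWithParanoidMemoryChecks(const std::string& path) {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  options.paranoid_memory_checks = true;  // extra key-order validation

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, path, &db);
  assert(s.ok());

  // Read from MutableCFOptions in the diff, so toggling at runtime should work.
  s = db->SetOptions({{"paranoid_memory_checks", "false"}});
  assert(s.ok());
  delete db;
}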

View File

@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
Statistics* statistics; Statistics* statistics;
MergeOperator* merge_operator; MergeOperator* merge_operator;
Logger* info_log; Logger* info_log;
bool allow_data_in_errors;
uint32_t protection_bytes_per_key; uint32_t protection_bytes_per_key;
bool allow_data_in_errors;
bool paranoid_memory_checks;
}; };
// Batched counters to updated when inserting keys in one write batch. // Batched counters to updated when inserting keys in one write batch.
@ -249,12 +250,14 @@ class MemTable {
// If do_merge = true the default behavior which is Get value for key is // If do_merge = true the default behavior which is Get value for key is
// executed. Expected behavior is described right below. // executed. Expected behavior is described right below.
// If memtable contains a value for key, store it in *value and return true. // If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error // If memtable contains a deletion for key, store NotFound() in *status and
// in *status and return true. // return true.
// If memtable contains Merge operation as the most recent entry for a key, // If memtable contains Merge operation as the most recent entry for a key,
// and the merge process does not stop (not reaching a value or delete), // and the merge process does not stop (not reaching a value or delete),
// prepend the current merge operand to *operands. // prepend the current merge operand to *operands.
// store MergeInProgress in s, and return false. // store MergeInProgress in s, and return false.
// If an unexpected error or corruption occurs, store Corruption() or other
// error in *status and return true.
// Else, return false. // Else, return false.
// If any operation was found, its most recent sequence number // If any operation was found, its most recent sequence number
// will be stored in *seq on success (regardless of whether true/false is // will be stored in *seq on success (regardless of whether true/false is
@ -264,6 +267,11 @@ class MemTable {
// If do_merge = false then any Merge Operands encountered for key are simply // If do_merge = false then any Merge Operands encountered for key are simply
// stored in merge_context.operands_list and never actually merged to get a // stored in merge_context.operands_list and never actually merged to get a
// final value. The raw Merge Operands are eventually returned to the user. // final value. The raw Merge Operands are eventually returned to the user.
// @param value If not null and memtable contains a value for key, `value`
// will be set to the result value.
// @param column If not null and memtable contains a value/WideColumn for key,
// `column` will be set to the result value/WideColumn.
// Note: only one of `value` and `column` can be non-nullptr.
// @param immutable_memtable Whether this memtable is immutable. Used // @param immutable_memtable Whether this memtable is immutable. Used
// internally by NewRangeTombstoneIterator(). See comment above // internally by NewRangeTombstoneIterator(). See comment above
// NewRangeTombstoneIterator() for more detail. // NewRangeTombstoneIterator() for more detail.

View File

@ -181,7 +181,8 @@ bool MemTableListVersion::GetFromList(
} }
if (done) { if (done) {
assert(*seq != kMaxSequenceNumber || s->IsNotFound()); assert(*seq != kMaxSequenceNumber ||
(!s->ok() && !s->IsMergeInProgress()));
return true; return true;
} }
if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {

View File

@ -287,6 +287,7 @@ TEST_F(MemTableListTest, GetTest) {
// Fetch the newly written keys // Fetch the newly written keys
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -295,6 +296,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value1"); ASSERT_EQ(value, "value1");
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -303,6 +305,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -311,6 +314,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value2.2"); ASSERT_EQ(value, "value2.2");
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -350,6 +354,7 @@ TEST_F(MemTableListTest, GetTest) {
// Fetch keys via MemTableList // Fetch keys via MemTableList
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -357,6 +362,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->Get(LookupKey("key1", saved_seq), &value, found = list.current()->Get(LookupKey("key1", saved_seq), &value,
/*columns=*/nullptr, /*timestamp=*/nullptr, &s, /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
&merge_context, &max_covering_tombstone_seq, &merge_context, &max_covering_tombstone_seq,
@ -365,6 +371,7 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ("value1", value); ASSERT_EQ("value1", value);
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -373,12 +380,14 @@ TEST_F(MemTableListTest, GetTest) {
ASSERT_EQ(value, "value2.3"); ASSERT_EQ(value, "value2.3");
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr, found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions()); &max_covering_tombstone_seq, ReadOptions());
ASSERT_FALSE(found); ASSERT_FALSE(found);
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -438,6 +447,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Fetch the newly written keys // Fetch the newly written keys
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -446,6 +456,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr, found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
/*timestamp*/ nullptr, &s, &merge_context, /*timestamp*/ nullptr, &s, &merge_context,
&max_covering_tombstone_seq, ReadOptions(), &max_covering_tombstone_seq, ReadOptions(),
@ -462,6 +473,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Fetch keys via MemTableList // Fetch keys via MemTableList
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -469,6 +481,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -508,6 +521,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify keys are present in history // Verify keys are present in history
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory( found = list.current()->GetFromHistory(
LookupKey("key1", seq), &value, /*columns=*/nullptr, LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -515,6 +529,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory( found = list.current()->GetFromHistory(
LookupKey("key2", seq), &value, /*columns=*/nullptr, LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -568,6 +583,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify keys are no longer in MemTableList // Verify keys are no longer in MemTableList
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -575,6 +591,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_FALSE(found); ASSERT_FALSE(found);
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -582,6 +599,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_FALSE(found); ASSERT_FALSE(found);
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,
@ -590,6 +608,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify that the second memtable's keys are in the history // Verify that the second memtable's keys are in the history
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory( found = list.current()->GetFromHistory(
LookupKey("key1", seq), &value, /*columns=*/nullptr, LookupKey("key1", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -597,6 +616,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
ASSERT_TRUE(found && s.IsNotFound()); ASSERT_TRUE(found && s.IsNotFound());
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory( found = list.current()->GetFromHistory(
LookupKey("key3", seq), &value, /*columns=*/nullptr, LookupKey("key3", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@ -606,6 +626,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
// Verify that key2 from the first memtable is no longer in the history // Verify that key2 from the first memtable is no longer in the history
merge_context.Clear(); merge_context.Clear();
s = Status::OK();
found = found =
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr, list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
/*timestamp=*/nullptr, &s, &merge_context, /*timestamp=*/nullptr, &s, &merge_context,

View File

@ -29,6 +29,7 @@
#include "db/internal_stats.h" #include "db/internal_stats.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "db/version_edit_handler.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "port/port.h" #include "port/port.h"
#include "table/table_reader.h" #include "table/table_reader.h"
@ -37,6 +38,25 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
class VersionBuilder::Rep { class VersionBuilder::Rep {
class NewestFirstBySeqNo {
public:
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
assert(lhs);
assert(rhs);
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
}
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
}
// Break ties by file number
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
}
};
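NewestFirstBySeqNo orders L0 files newest-first: by largest_seqno, then smallest_seqno, with the file number as the final tie-break. A self-contained sketch of the resulting order, using a simplified FakeFile struct instead of the real FileMetaData:

// Sketch only: simplified stand-in for FileMetaData to show the ordering
// NewestFirstBySeqNo produces (newest L0 files first).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct FakeFile {
  uint64_t number;
  uint64_t smallest_seqno;
  uint64_t largest_seqno;
};

struct NewestFirst {
  bool operator()(const FakeFile& lhs, const FakeFile& rhs) const {
    if (lhs.largest_seqno != rhs.largest_seqno) {
      return lhs.largest_seqno > rhs.largest_seqno;
    }
    if (lhs.smallest_seqno != rhs.smallest_seqno) {
      return lhs.smallest_seqno > rhs.smallest_seqno;
    }
    // Break ties by file number, as in the comparator above.
    return lhs.number > rhs.number;
  }
};

int main() {
  std::vector<FakeFile> files = {{/*number=*/10, 1, 50},
                                 {/*number=*/12, 60, 90},
                                 {/*number=*/11, 60, 90}};
  std::sort(files.begin(), files.end(), NewestFirst{});
  // Files with the higher largest_seqno come first; equal seqno ranges fall
  // back to the larger file number.
  assert(files[0].number == 12);
  assert(files[1].number == 11);
  assert(files[2].number == 10);
  return 0;
}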
class NewestFirstByEpochNumber { class NewestFirstByEpochNumber {
private: private:
inline static const NewestFirstBySeqNo seqno_cmp; inline static const NewestFirstBySeqNo seqno_cmp;
@ -249,9 +269,10 @@ class VersionBuilder::Rep {
std::unordered_map<uint64_t, int> table_file_levels_; std::unordered_map<uint64_t, int> table_file_levels_;
// Current compact cursors that should be changed after the last compaction // Current compact cursors that should be changed after the last compaction
std::unordered_map<int, InternalKey> updated_compact_cursors_; std::unordered_map<int, InternalKey> updated_compact_cursors_;
NewestFirstByEpochNumber level_zero_cmp_by_epochno_; const std::shared_ptr<const NewestFirstByEpochNumber>
NewestFirstBySeqNo level_zero_cmp_by_seqno_; level_zero_cmp_by_epochno_;
BySmallestKey level_nonzero_cmp_; const std::shared_ptr<const NewestFirstBySeqNo> level_zero_cmp_by_seqno_;
const std::shared_ptr<const BySmallestKey> level_nonzero_cmp_;
// Mutable metadata objects for all blob files affected by the series of // Mutable metadata objects for all blob files affected by the series of
// version edits. // version edits.
@ -259,11 +280,56 @@ class VersionBuilder::Rep {
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
ColumnFamilyData* cfd_;
VersionEditHandler* version_edit_handler_;
bool track_found_and_missing_files_;
// If false, only a complete Version, with all of its files found, is
// considered valid. If true, then besides a complete Version, an incomplete
// Version with only a suffix of L0 files missing is also considered valid,
// provided the Version is never edited in an atomic group.
bool allow_incomplete_valid_version_;
// These are only tracked if `track_found_and_missing_files_` is enabled.
// The SST files that are found (blob files not included yet).
std::unordered_set<uint64_t> found_files_;
// Missing SST files for L0
std::unordered_set<uint64_t> l0_missing_files_;
// Missing SST files for non L0 levels
std::unordered_set<uint64_t> non_l0_missing_files_;
// Intermediate SST files (blob files not included yet)
std::vector<std::string> intermediate_files_;
// The highest file number among all the missing blob files, useful to check
// whether a complete Version is available.
uint64_t missing_blob_files_high_ = kInvalidBlobFileNumber;
// Missing blob files, useful to check whether only the blob files
// associated with the missing L0 files are missing.
std::unordered_set<uint64_t> missing_blob_files_;
// True if all files constituting the Version can be found. Or, if
// `allow_incomplete_valid_version_` is true and the version history is
// never edited in an atomic group, this will be true if only a
// suffix of L0 SST files and their associated blob files is missing.
bool valid_version_available_;
// True if version is ever edited in an atomic group.
bool edited_in_atomic_group_;
// Flag to indicate whether the Version has been updated since the last
// validity check. If no `Apply` call is made between a `Rep`'s construction
// and a `ValidVersionAvailable` check, or between two `ValidVersionAvailable`
// calls, this flag will be false, indicating the cached validity value can be
// used directly without a recheck.
bool version_updated_since_last_check_;
// End of fields that are only tracked when `track_found_and_missing_files_`
// is enabled.
public: public:
Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
TableCache* table_cache, VersionStorageInfo* base_vstorage, TableCache* table_cache, VersionStorageInfo* base_vstorage,
VersionSet* version_set, VersionSet* version_set,
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: file_options_(file_options), : file_options_(file_options),
ioptions_(ioptions), ioptions_(ioptions),
table_cache_(table_cache), table_cache_(table_cache),
@ -271,11 +337,76 @@ class VersionBuilder::Rep {
version_set_(version_set), version_set_(version_set),
num_levels_(base_vstorage->num_levels()), num_levels_(base_vstorage->num_levels()),
has_invalid_levels_(false), has_invalid_levels_(false),
level_nonzero_cmp_(base_vstorage_->InternalComparator()), level_zero_cmp_by_epochno_(
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { std::make_shared<NewestFirstByEpochNumber>()),
level_zero_cmp_by_seqno_(std::make_shared<NewestFirstBySeqNo>()),
level_nonzero_cmp_(std::make_shared<BySmallestKey>(
base_vstorage_->InternalComparator())),
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr),
cfd_(cfd),
version_edit_handler_(version_edit_handler),
track_found_and_missing_files_(track_found_and_missing_files),
allow_incomplete_valid_version_(allow_incomplete_valid_version) {
assert(ioptions_); assert(ioptions_);
levels_ = new LevelState[num_levels_]; levels_ = new LevelState[num_levels_];
if (track_found_and_missing_files_) {
assert(cfd_);
assert(version_edit_handler_);
// The `track_found_and_missing_files_` mode used by VersionEditHandlerPIT
// assumes the initial base version is valid. For best-efforts recovery, the
// base will be empty. Manifest-tailing usage such as a secondary instance
// does not allow an incomplete version, so the base version in subsequent
// catch-up attempts should be valid too.
valid_version_available_ = true;
edited_in_atomic_group_ = false;
version_updated_since_last_check_ = false;
}
}
Rep(const Rep& other)
: file_options_(other.file_options_),
ioptions_(other.ioptions_),
table_cache_(other.table_cache_),
base_vstorage_(other.base_vstorage_),
version_set_(other.version_set_),
num_levels_(other.num_levels_),
invalid_level_sizes_(other.invalid_level_sizes_),
has_invalid_levels_(other.has_invalid_levels_),
table_file_levels_(other.table_file_levels_),
updated_compact_cursors_(other.updated_compact_cursors_),
level_zero_cmp_by_epochno_(other.level_zero_cmp_by_epochno_),
level_zero_cmp_by_seqno_(other.level_zero_cmp_by_seqno_),
level_nonzero_cmp_(other.level_nonzero_cmp_),
mutable_blob_file_metas_(other.mutable_blob_file_metas_),
file_metadata_cache_res_mgr_(other.file_metadata_cache_res_mgr_),
cfd_(other.cfd_),
version_edit_handler_(other.version_edit_handler_),
track_found_and_missing_files_(other.track_found_and_missing_files_),
allow_incomplete_valid_version_(other.allow_incomplete_valid_version_),
found_files_(other.found_files_),
l0_missing_files_(other.l0_missing_files_),
non_l0_missing_files_(other.non_l0_missing_files_),
intermediate_files_(other.intermediate_files_),
missing_blob_files_high_(other.missing_blob_files_high_),
missing_blob_files_(other.missing_blob_files_),
valid_version_available_(other.valid_version_available_),
edited_in_atomic_group_(other.edited_in_atomic_group_),
version_updated_since_last_check_(
other.version_updated_since_last_check_) {
assert(ioptions_);
levels_ = new LevelState[num_levels_];
for (int level = 0; level < num_levels_; level++) {
levels_[level] = other.levels_[level];
const auto& added = levels_[level].added_files;
for (auto& pair : added) {
RefFile(pair.second);
}
}
if (track_found_and_missing_files_) {
assert(cfd_);
assert(version_edit_handler_);
}
} }
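The copy constructor above shares the FileMetaData pointers between the original Rep and the copy, so it bumps each file's reference count via RefFile(); the UnrefFile() path then releases them so the last owner frees the object. A sketch of that manual ref-counting idiom, reduced to a hypothetical FakeFileMeta/FakeLevelState pair:

// Sketch only: the real FileMetaData and LevelState carry much more state.
#include <cassert>
#include <cstdint>
#include <unordered_map>

struct FakeFileMeta {
  int refs = 1;
};

struct FakeLevelState {
  std::unordered_map<uint64_t, FakeFileMeta*> added_files;
};

void Ref(FakeFileMeta* f) {
  assert(f != nullptr && f->refs > 0);
  f->refs++;
}

void Unref(FakeFileMeta* f) {
  if (--f->refs <= 0) {
    delete f;
  }
}

int main() {
  FakeLevelState original;
  original.added_files[7] = new FakeFileMeta();  // refs == 1

  // "Copying" the level state shares the pointers, so each copy must take
  // its own reference, exactly like RefFile() in the Rep copy constructor.
  FakeLevelState copy = original;
  for (auto& pair : copy.added_files) {
    Ref(pair.second);
  }
  assert(original.added_files[7]->refs == 2);

  // Each owner releases its reference independently; the last one frees.
  Unref(copy.added_files[7]);
  Unref(original.added_files[7]);
  return 0;
}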
~Rep() { ~Rep() {
@ -289,6 +420,12 @@ class VersionBuilder::Rep {
delete[] levels_; delete[] levels_;
} }
void RefFile(FileMetaData* f) {
assert(f);
assert(f->refs > 0);
f->refs++;
}
void UnrefFile(FileMetaData* f) { void UnrefFile(FileMetaData* f) {
f->refs--; f->refs--;
if (f->refs <= 0) { if (f->refs <= 0) {
@ -397,7 +534,7 @@ class VersionBuilder::Rep {
if (epoch_number_requirement == if (epoch_number_requirement ==
EpochNumberRequirement::kMightMissing) { EpochNumberRequirement::kMightMissing) {
if (!level_zero_cmp_by_seqno_(lhs, rhs)) { if (!level_zero_cmp_by_seqno_->operator()(lhs, rhs)) {
std::ostringstream oss; std::ostringstream oss;
oss << "L0 files are not sorted properly: files #" oss << "L0 files are not sorted properly: files #"
<< lhs->fd.GetNumber() << " with seqnos (largest, smallest) " << lhs->fd.GetNumber() << " with seqnos (largest, smallest) "
@ -429,7 +566,7 @@ class VersionBuilder::Rep {
} }
} }
if (!level_zero_cmp_by_epochno_(lhs, rhs)) { if (!level_zero_cmp_by_epochno_->operator()(lhs, rhs)) {
std::ostringstream oss; std::ostringstream oss;
oss << "L0 files are not sorted properly: files #" oss << "L0 files are not sorted properly: files #"
<< lhs->fd.GetNumber() << " with epoch number " << lhs->fd.GetNumber() << " with epoch number "
@ -458,7 +595,7 @@ class VersionBuilder::Rep {
assert(lhs); assert(lhs);
assert(rhs); assert(rhs);
if (!level_nonzero_cmp_(lhs, rhs)) { if (!level_nonzero_cmp_->operator()(lhs, rhs)) {
std::ostringstream oss; std::ostringstream oss;
oss << 'L' << level << " files are not sorted properly: files #" oss << 'L' << level << " files are not sorted properly: files #"
<< lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
@ -634,7 +771,22 @@ class VersionBuilder::Rep {
mutable_blob_file_metas_.emplace( mutable_blob_file_metas_.emplace(
blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
return Status::OK(); Status s;
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
s = version_edit_handler_->VerifyBlobFile(cfd_, blob_file_number,
blob_file_addition);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_blob_files_high_ =
std::max(missing_blob_files_high_, blob_file_number);
missing_blob_files_.insert(blob_file_number);
s = Status::OK();
} else if (!s.ok()) {
return s;
}
}
return s;
} }
Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
@ -752,6 +904,29 @@ class VersionBuilder::Rep {
table_file_levels_[file_number] = table_file_levels_[file_number] =
VersionStorageInfo::FileLocation::Invalid().GetLevel(); VersionStorageInfo::FileLocation::Invalid().GetLevel();
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
l0_missing_files_.erase(file_number);
} else if (non_l0_missing_files_.find(file_number) !=
non_l0_missing_files_.end()) {
non_l0_missing_files_.erase(file_number);
} else {
auto fiter = found_files_.find(file_number);
// Only mark new files added during this catchup attempt for deletion.
// These files were never installed in VersionStorageInfo.
// Already referenced files that are deleted by a VersionEdit will
// be added to the VersionStorageInfo's obsolete files when the old
// version is dereferenced.
if (fiter != found_files_.end()) {
assert(!ioptions_->cf_paths.empty());
intermediate_files_.emplace_back(
MakeTableFileName(ioptions_->cf_paths[0].path, file_number));
found_files_.erase(fiter);
}
}
}
return Status::OK(); return Status::OK();
} }
@ -824,7 +999,31 @@ class VersionBuilder::Rep {
table_file_levels_[file_number] = level; table_file_levels_[file_number] = level;
return Status::OK(); Status s;
if (track_found_and_missing_files_) {
assert(version_edit_handler_);
assert(!ioptions_->cf_paths.empty());
const std::string fpath =
MakeTableFileName(ioptions_->cf_paths[0].path, file_number);
s = version_edit_handler_->VerifyFile(cfd_, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
if (0 == level) {
l0_missing_files_.insert(file_number);
} else {
non_l0_missing_files_.insert(file_number);
}
if (s.IsCorruption()) {
found_files_.insert(file_number);
}
s = Status::OK();
} else if (!s.ok()) {
return s;
} else {
found_files_.insert(file_number);
}
}
return s;
} }
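Both ApplyBlobFileAddition and ApplyFileAddition above follow the same verify-and-classify pattern when `track_found_and_missing_files_` is on: PathNotFound, NotFound, and Corruption are downgraded to "file is missing" (recorded and skipped with an OK status), while any other error is propagated. A sketch of that classification under simplified, assumed status codes (the real code uses rocksdb::Status and the handler's VerifyFile/VerifyBlobFile):

// Sketch only: simplified status codes and a stand-in verify callback.
#include <cstdint>
#include <functional>
#include <unordered_set>

enum class Code { kOk, kNotFound, kPathNotFound, kCorruption, kIOError };

Code ApplyAddition(uint64_t file_number,
                   const std::function<Code(uint64_t)>& verify,
                   std::unordered_set<uint64_t>* missing,
                   std::unordered_set<uint64_t>* found) {
  Code c = verify(file_number);
  if (c == Code::kPathNotFound || c == Code::kNotFound ||
      c == Code::kCorruption) {
    // The file is unusable: remember it as missing, but do not fail the
    // whole edit application. A corrupt file still physically exists, so it
    // is also tracked as "found" for later cleanup, as in the code above.
    missing->insert(file_number);
    if (c == Code::kCorruption) {
      found->insert(file_number);
    }
    return Code::kOk;
  }
  if (c != Code::kOk) {
    return c;  // e.g. an I/O error is propagated to the caller
  }
  found->insert(file_number);
  return Code::kOk;
}

int main() {
  std::unordered_set<uint64_t> missing, found;
  auto verify = [](uint64_t n) {
    return n == 42 ? Code::kPathNotFound : Code::kOk;
  };
  Code c1 = ApplyAddition(42, verify, &missing, &found);  // missing, tolerated
  Code c2 = ApplyAddition(43, verify, &missing, &found);  // present
  return (c1 == Code::kOk && c2 == Code::kOk && missing.count(42) == 1 &&
          found.count(43) == 1)
             ? 0
             : 1;
}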
Status ApplyCompactCursors(int level, Status ApplyCompactCursors(int level,
@ -845,6 +1044,7 @@ class VersionBuilder::Rep {
// Apply all of the edits in *edit to the current state. // Apply all of the edits in *edit to the current state.
Status Apply(const VersionEdit* edit) { Status Apply(const VersionEdit* edit) {
bool version_updated = false;
{ {
const Status s = CheckConsistency(base_vstorage_); const Status s = CheckConsistency(base_vstorage_);
if (!s.ok()) { if (!s.ok()) {
@ -862,6 +1062,7 @@ class VersionBuilder::Rep {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
version_updated = true;
} }
// Increase the amount of garbage for blob files affected by GC // Increase the amount of garbage for blob files affected by GC
@ -870,6 +1071,7 @@ class VersionBuilder::Rep {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
version_updated = true;
} }
// Delete table files // Delete table files
@ -881,6 +1083,7 @@ class VersionBuilder::Rep {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
version_updated = true;
} }
// Add new table files // Add new table files
@ -892,6 +1095,7 @@ class VersionBuilder::Rep {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
version_updated = true;
} }
// Populate compact cursors for round-robin compaction, leave // Populate compact cursors for round-robin compaction, leave
@ -904,6 +1108,13 @@ class VersionBuilder::Rep {
return s; return s;
} }
} }
if (track_found_and_missing_files_ && version_updated) {
version_updated_since_last_check_ = true;
if (!edited_in_atomic_group_ && edit->IsInAtomicGroup()) {
edited_in_atomic_group_ = true;
}
}
return Status::OK(); return Status::OK();
} }
@ -1046,14 +1257,35 @@ class VersionBuilder::Rep {
mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes()); mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
} }
bool OnlyLinkedToMissingL0Files(
const std::unordered_set<uint64_t>& linked_ssts) const {
return std::all_of(
linked_ssts.begin(), linked_ssts.end(), [&](const uint64_t& element) {
return l0_missing_files_.find(element) != l0_missing_files_.end();
});
}
// Add the blob file specified by meta to *vstorage if it is determined to // Add the blob file specified by meta to *vstorage if it is determined to
// contain valid data (blobs). // contain valid data (blobs).
template <typename Meta> template <typename Meta>
static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) { void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta,
uint64_t blob_file_number) const {
assert(vstorage); assert(vstorage);
assert(meta); assert(meta);
if (meta->GetLinkedSsts().empty() && const auto& linked_ssts = meta->GetLinkedSsts();
if (track_found_and_missing_files_) {
if (missing_blob_files_.find(blob_file_number) !=
missing_blob_files_.end()) {
return;
}
// Leave the empty case to the blob garbage collection logic below.
if (!linked_ssts.empty() && OnlyLinkedToMissingL0Files(linked_ssts)) {
return;
}
}
if (linked_ssts.empty() &&
meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) { meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
return; return;
} }
@ -1065,6 +1297,7 @@ class VersionBuilder::Rep {
// applied, and save the result into *vstorage. // applied, and save the result into *vstorage.
void SaveBlobFilesTo(VersionStorageInfo* vstorage) const { void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
assert(vstorage); assert(vstorage);
assert(!track_found_and_missing_files_ || valid_version_available_);
assert(base_vstorage_); assert(base_vstorage_);
vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() + vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
@ -1080,22 +1313,24 @@ class VersionBuilder::Rep {
} }
auto process_base = auto process_base =
[vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) { [this, vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
assert(base_meta); assert(base_meta);
AddBlobFileIfNeeded(vstorage, base_meta); AddBlobFileIfNeeded(vstorage, base_meta,
base_meta->GetBlobFileNumber());
return true; return true;
}; };
auto process_mutable = auto process_mutable =
[vstorage](const MutableBlobFileMetaData& mutable_meta) { [this, vstorage](const MutableBlobFileMetaData& mutable_meta) {
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
mutable_meta.GetBlobFileNumber());
return true; return true;
}; };
auto process_both = [vstorage]( auto process_both = [this, vstorage](
const std::shared_ptr<BlobFileMetaData>& base_meta, const std::shared_ptr<BlobFileMetaData>& base_meta,
const MutableBlobFileMetaData& mutable_meta) { const MutableBlobFileMetaData& mutable_meta) {
assert(base_meta); assert(base_meta);
@ -1108,12 +1343,14 @@ class VersionBuilder::Rep {
mutable_meta.GetGarbageBlobBytes()); mutable_meta.GetGarbageBlobBytes());
assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts()); assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
AddBlobFileIfNeeded(vstorage, base_meta); AddBlobFileIfNeeded(vstorage, base_meta,
base_meta->GetBlobFileNumber());
return true; return true;
} }
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
mutable_meta.GetBlobFileNumber());
return true; return true;
}; };
@ -1125,6 +1362,10 @@ class VersionBuilder::Rep {
void MaybeAddFile(VersionStorageInfo* vstorage, int level, void MaybeAddFile(VersionStorageInfo* vstorage, int level,
FileMetaData* f) const { FileMetaData* f) const {
const uint64_t file_number = f->fd.GetNumber(); const uint64_t file_number = f->fd.GetNumber();
if (track_found_and_missing_files_ && level == 0 &&
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
return;
}
const auto& level_state = levels_[level]; const auto& level_state = levels_[level];
@ -1148,6 +1389,29 @@ class VersionBuilder::Rep {
} }
} }
bool ContainsCompleteVersion() const {
assert(track_found_and_missing_files_);
return l0_missing_files_.empty() && non_l0_missing_files_.empty() &&
(missing_blob_files_high_ == kInvalidBlobFileNumber ||
missing_blob_files_high_ < GetMinOldestBlobFileNumber());
}
bool HasMissingFiles() const {
assert(track_found_and_missing_files_);
return !l0_missing_files_.empty() || !non_l0_missing_files_.empty() ||
missing_blob_files_high_ != kInvalidBlobFileNumber;
}
std::vector<std::string>& GetAndClearIntermediateFiles() {
assert(track_found_and_missing_files_);
return intermediate_files_;
}
void ClearFoundFiles() {
assert(track_found_and_missing_files_);
found_files_.clear();
}
template <typename Cmp> template <typename Cmp>
void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const { void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
// Merge the set of added files with the set of pre-existing files. // Merge the set of added files with the set of pre-existing files.
@ -1156,6 +1420,16 @@ class VersionBuilder::Rep {
const auto& unordered_added_files = levels_[level].added_files; const auto& unordered_added_files = levels_[level].added_files;
vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, cmp,
[&](FileMetaData* file) { MaybeAddFile(vstorage, level, file); });
}
template <typename Cmp, typename AddFileFunc>
void MergeUnorderdAddedFilesWithBase(
const std::vector<FileMetaData*>& base_files,
const std::unordered_map<uint64_t, FileMetaData*>& unordered_added_files,
Cmp cmp, AddFileFunc add_file_func) const {
// Sort added files for the level. // Sort added files for the level.
std::vector<FileMetaData*> added_files; std::vector<FileMetaData*> added_files;
added_files.reserve(unordered_added_files.size()); added_files.reserve(unordered_added_files.size());
@ -1171,9 +1445,9 @@ class VersionBuilder::Rep {
while (added_iter != added_end || base_iter != base_end) { while (added_iter != added_end || base_iter != base_end) {
if (base_iter == base_end || if (base_iter == base_end ||
(added_iter != added_end && cmp(*added_iter, *base_iter))) { (added_iter != added_end && cmp(*added_iter, *base_iter))) {
MaybeAddFile(vstorage, level, *added_iter++); add_file_func(*added_iter++);
} else { } else {
MaybeAddFile(vstorage, level, *base_iter++); add_file_func(*base_iter++);
} }
} }
} }
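MergeUnorderdAddedFilesWithBase sorts the newly added files and then walks them together with the already-sorted base files in a classic two-pointer merge, handing every element to a callback in comparator order. A small standalone sketch of that walk over plain integers:

// Sketch only: the same sort-then-merge walk, with ints instead of
// FileMetaData pointers and a callback instead of MaybeAddFile().
#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

template <typename T, typename Cmp>
void MergeWithBase(const std::vector<T>& base, std::vector<T> added, Cmp cmp,
                   const std::function<void(const T&)>& emit) {
  std::sort(added.begin(), added.end(), cmp);  // base is assumed sorted
  auto a = added.begin();
  auto b = base.begin();
  while (a != added.end() || b != base.end()) {
    if (b == base.end() || (a != added.end() && cmp(*a, *b))) {
      emit(*a++);
    } else {
      emit(*b++);
    }
  }
}

int main() {
  std::vector<int> merged;
  MergeWithBase<int>({2, 5, 9}, {7, 1}, std::less<int>(),
                     [&](const int& v) { merged.push_back(v); });
  assert((merged == std::vector<int>{1, 2, 5, 7, 9}));
  return 0;
}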
@ -1215,13 +1489,13 @@ class VersionBuilder::Rep {
} }
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_); SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_seqno_);
} else { } else {
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_); SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_epochno_);
} }
for (int level = 1; level < num_levels_; ++level) { for (int level = 1; level < num_levels_; ++level) {
SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); SaveSSTFilesTo(vstorage, level, *level_nonzero_cmp_);
} }
} }
@ -1232,8 +1506,111 @@ class VersionBuilder::Rep {
} }
} }
bool ValidVersionAvailable() {
assert(track_found_and_missing_files_);
if (version_updated_since_last_check_) {
valid_version_available_ = ContainsCompleteVersion();
if (!valid_version_available_ && !edited_in_atomic_group_ &&
allow_incomplete_valid_version_) {
valid_version_available_ = OnlyMissingL0Suffix();
}
version_updated_since_last_check_ = false;
}
return valid_version_available_;
}
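ValidVersionAvailable() only recomputes the validity when `version_updated_since_last_check_` says the builder changed since the previous call; otherwise the cached result is returned. A sketch of that dirty-flag caching pattern, with an ExpensiveCheck() standing in for ContainsCompleteVersion()/OnlyMissingL0Suffix():

// Sketch only: dirty-flag caching of an expensive validity check.
#include <cassert>

class CachedValidity {
 public:
  void MarkUpdated() { updated_since_last_check_ = true; }

  bool Valid() {
    if (updated_since_last_check_) {
      cached_valid_ = ExpensiveCheck();
      updated_since_last_check_ = false;
    }
    return cached_valid_;
  }

  int check_count() const { return check_count_; }

 private:
  // Stand-in for ContainsCompleteVersion() / OnlyMissingL0Suffix().
  bool ExpensiveCheck() {
    ++check_count_;
    return true;
  }

  bool cached_valid_ = true;
  bool updated_since_last_check_ = false;
  int check_count_ = 0;
};

int main() {
  CachedValidity v;
  v.MarkUpdated();
  assert(v.Valid() && v.check_count() == 1);
  // No update in between: the cached result is reused, no recheck.
  assert(v.Valid() && v.check_count() == 1);
  return 0;
}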
bool OnlyMissingL0Suffix() const {
if (!non_l0_missing_files_.empty()) {
return false;
}
assert(!(l0_missing_files_.empty() && missing_blob_files_.empty()));
if (!l0_missing_files_.empty() && !MissingL0FilesAreL0Suffix()) {
return false;
}
if (!missing_blob_files_.empty() &&
!RemainingSstFilesNotMissingBlobFiles()) {
return false;
}
return true;
}
// Check that the missing L0 files form a suffix of the expected sorted L0
// files.
bool MissingL0FilesAreL0Suffix() const {
assert(non_l0_missing_files_.empty());
assert(!l0_missing_files_.empty());
std::vector<FileMetaData*> expected_sorted_l0_files;
const auto& base_files = base_vstorage_->LevelFiles(0);
const auto& unordered_added_files = levels_[0].added_files;
expected_sorted_l0_files.reserve(base_files.size() +
unordered_added_files.size());
EpochNumberRequirement epoch_number_requirement =
base_vstorage_->GetEpochNumberRequirement();
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, *level_zero_cmp_by_seqno_,
[&](FileMetaData* file) {
expected_sorted_l0_files.push_back(file);
});
} else {
MergeUnorderdAddedFilesWithBase(
base_files, unordered_added_files, *level_zero_cmp_by_epochno_,
[&](FileMetaData* file) {
expected_sorted_l0_files.push_back(file);
});
}
assert(expected_sorted_l0_files.size() >= l0_missing_files_.size());
std::unordered_set<uint64_t> unaddressed_missing_files = l0_missing_files_;
for (auto iter = expected_sorted_l0_files.begin();
iter != expected_sorted_l0_files.end(); iter++) {
uint64_t file_number = (*iter)->fd.GetNumber();
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
assert(unaddressed_missing_files.find(file_number) !=
unaddressed_missing_files.end());
unaddressed_missing_files.erase(file_number);
} else if (!unaddressed_missing_files.empty()) {
return false;
} else {
break;
}
}
return true;
}
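MissingL0FilesAreL0Suffix() walks the expected L0 files from newest to oldest and requires every missing file to appear before the first file that is present, i.e. the missing files must be the most recently written ones. A standalone sketch of that check over plain file numbers:

// Sketch only: the suffix check, with bare file numbers instead of
// FileMetaData and the builder's comparators.
#include <cassert>
#include <cstdint>
#include <unordered_set>
#include <vector>

bool MissingFilesAreNewestSuffix(
    const std::vector<uint64_t>& newest_first_l0,
    const std::unordered_set<uint64_t>& missing) {
  std::unordered_set<uint64_t> unaddressed = missing;
  for (uint64_t file_number : newest_first_l0) {
    if (missing.count(file_number) > 0) {
      unaddressed.erase(file_number);
    } else if (!unaddressed.empty()) {
      return false;  // a present file appears before some missing file
    } else {
      break;  // all missing files are accounted for at the newest end
    }
  }
  return true;
}

int main() {
  // Newest-first L0 files: 14 is the most recently written.
  std::vector<uint64_t> l0 = {14, 13, 12, 11};
  std::unordered_set<uint64_t> newest_two = {14, 13};
  std::unordered_set<uint64_t> with_hole = {13, 11};
  assert(MissingFilesAreNewestSuffix(l0, newest_two));  // a true L0 suffix
  assert(!MissingFilesAreNewestSuffix(l0, with_hole));  // hole in the middle
  return 0;
}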
// Check that each missing blob file is either older than the minimum oldest
// blob file required by this Version or only linked to the missing L0
// files.
bool RemainingSstFilesNotMissingBlobFiles() const {
assert(non_l0_missing_files_.empty());
assert(!missing_blob_files_.empty());
bool no_l0_files_missing = l0_missing_files_.empty();
uint64_t min_oldest_blob_file_num = GetMinOldestBlobFileNumber();
for (const auto& missing_blob_file : missing_blob_files_) {
if (missing_blob_file < min_oldest_blob_file_num) {
continue;
}
auto iter = mutable_blob_file_metas_.find(missing_blob_file);
assert(iter != mutable_blob_file_metas_.end());
const std::unordered_set<uint64_t>& linked_ssts =
iter->second.GetLinkedSsts();
// TODO(yuzhangyu): In theory, if no L0 SST files are missing, and only
// blob files exclusively linked to an L0 suffix are missing, we can
// recover to a valid point in time too. We don't recover that type of
// incomplete Version yet.
if (!linked_ssts.empty() && no_l0_files_missing) {
return false;
}
if (!OnlyLinkedToMissingL0Files(linked_ssts)) {
return false;
}
}
return true;
}
// Save the current state in *vstorage. // Save the current state in *vstorage.
Status SaveTo(VersionStorageInfo* vstorage) const { Status SaveTo(VersionStorageInfo* vstorage) const {
assert(!track_found_and_missing_files_ || valid_version_available_);
Status s; Status s;
#ifndef NDEBUG #ifndef NDEBUG
@ -1266,6 +1643,7 @@ class VersionBuilder::Rep {
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) { uint8_t block_protection_bytes_per_key) {
assert(table_cache_ != nullptr); assert(table_cache_ != nullptr);
assert(!track_found_and_missing_files_ || valid_version_available_);
size_t table_cache_capacity = size_t table_cache_capacity =
table_cache_->get_cache().get()->GetCapacity(); table_cache_->get_cache().get()->GetCapacity();
@ -1305,6 +1683,11 @@ class VersionBuilder::Rep {
for (int level = 0; level < num_levels_; level++) { for (int level = 0; level < num_levels_; level++) {
for (auto& file_meta_pair : levels_[level].added_files) { for (auto& file_meta_pair : levels_[level].added_files) {
auto* file_meta = file_meta_pair.second; auto* file_meta = file_meta_pair.second;
uint64_t file_number = file_meta->fd.GetNumber();
if (track_found_and_missing_files_ && level == 0 &&
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
continue;
}
// If the file has been opened before, just skip it. // If the file has been opened before, just skip it.
if (!file_meta->table_reader_handle) { if (!file_meta->table_reader_handle) {
files_meta.emplace_back(file_meta, level); files_meta.emplace_back(file_meta, level);
@ -1369,9 +1752,13 @@ VersionBuilder::VersionBuilder(
const FileOptions& file_options, const ImmutableCFOptions* ioptions, const FileOptions& file_options, const ImmutableCFOptions* ioptions,
TableCache* table_cache, VersionStorageInfo* base_vstorage, TableCache* table_cache, VersionStorageInfo* base_vstorage,
VersionSet* version_set, VersionSet* version_set,
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
version_set, file_metadata_cache_res_mgr)) {} version_set, file_metadata_cache_res_mgr, cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)) {}
VersionBuilder::~VersionBuilder() = default; VersionBuilder::~VersionBuilder() = default;
@ -1399,27 +1786,71 @@ Status VersionBuilder::LoadTableHandlers(
read_options, block_protection_bytes_per_key); read_options, block_protection_bytes_per_key);
} }
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { void VersionBuilder::CreateOrReplaceSavePoint() {
return rep_->GetMinOldestBlobFileNumber(); assert(rep_);
savepoint_ = std::move(rep_);
rep_ = std::make_unique<Rep>(*savepoint_);
} }
bool VersionBuilder::ValidVersionAvailable() {
return rep_->ValidVersionAvailable();
}
bool VersionBuilder::HasMissingFiles() const { return rep_->HasMissingFiles(); }
std::vector<std::string>& VersionBuilder::GetAndClearIntermediateFiles() {
return rep_->GetAndClearIntermediateFiles();
}
void VersionBuilder::ClearFoundFiles() { return rep_->ClearFoundFiles(); }
Status VersionBuilder::SaveSavePointTo(VersionStorageInfo* vstorage) const {
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
return Status::InvalidArgument();
}
return savepoint_->SaveTo(vstorage);
}
Status VersionBuilder::LoadSavePointTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key) {
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
return Status::InvalidArgument();
}
return savepoint_->LoadTableHandlers(
internal_stats, max_threads, prefetch_index_and_filter_in_cache,
is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
read_options, block_protection_bytes_per_key);
}
void VersionBuilder::ClearSavePoint() { savepoint_.reset(nullptr); }
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd) ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: version_builder_(new VersionBuilder( : version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->ioptions(), cfd->current()->version_set()->file_options(), cfd->ioptions(),
cfd->table_cache(), cfd->current()->storage_info(), cfd->table_cache(), cfd->current()->storage_info(),
cfd->current()->version_set(), cfd->current()->version_set(),
cfd->GetFileMetadataCacheReservationManager())), cfd->GetFileMetadataCacheReservationManager(), cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)),
version_(cfd->current()) { version_(cfd->current()) {
version_->Ref(); version_->Ref();
} }
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, Version* v) ColumnFamilyData* cfd, Version* v, VersionEditHandler* version_edit_handler,
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
: version_builder_(new VersionBuilder( : version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->ioptions(), cfd->current()->version_set()->file_options(), cfd->ioptions(),
cfd->table_cache(), v->storage_info(), v->version_set(), cfd->table_cache(), v->storage_info(), v->version_set(),
cfd->GetFileMetadataCacheReservationManager())), cfd->GetFileMetadataCacheReservationManager(), cfd,
version_edit_handler, track_found_and_missing_files,
allow_incomplete_valid_version)),
version_(v) { version_(v) {
assert(version_ != cfd->current()); assert(version_ != cfd->current());
} }


@ -26,6 +26,7 @@ struct FileMetaData;
class InternalStats; class InternalStats;
class Version; class Version;
class VersionSet; class VersionSet;
class VersionEditHandler;
class ColumnFamilyData; class ColumnFamilyData;
class CacheReservationManager; class CacheReservationManager;
@ -38,22 +39,80 @@ class VersionBuilder {
const ImmutableCFOptions* ioptions, TableCache* table_cache, const ImmutableCFOptions* ioptions, TableCache* table_cache,
VersionStorageInfo* base_vstorage, VersionSet* version_set, VersionStorageInfo* base_vstorage, VersionSet* version_set,
std::shared_ptr<CacheReservationManager> std::shared_ptr<CacheReservationManager>
file_metadata_cache_res_mgr = nullptr); file_metadata_cache_res_mgr = nullptr,
ColumnFamilyData* cfd = nullptr,
VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
~VersionBuilder(); ~VersionBuilder();
bool CheckConsistencyForNumLevels(); bool CheckConsistencyForNumLevels();
Status Apply(const VersionEdit* edit); Status Apply(const VersionEdit* edit);
// Save the current Version to the provided `vstorage`.
Status SaveTo(VersionStorageInfo* vstorage) const; Status SaveTo(VersionStorageInfo* vstorage) const;
// Load all the table handlers for the current Version in the builder.
Status LoadTableHandlers( Status LoadTableHandlers(
InternalStats* internal_stats, int max_threads, InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load, bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor, const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key); uint8_t block_protection_bytes_per_key);
uint64_t GetMinOldestBlobFileNumber() const;
//============APIs only used by VersionEditHandlerPointInTime ============//
// Creates a save point for the Version that has been built so far. Subsequent
// VersionEdits applied to the builder will not affect the Version in this
// save point. VersionBuilder currently only supports creating one save point,
// so when `CreateOrReplaceSavePoint` is called again, the previous save point
// is cleared. `ClearSavePoint` can be called explicitly to clear
// the save point too.
void CreateOrReplaceSavePoint();
// Returns true if the builder can find all the files needed to build a
// `Version`, or, when `allow_incomplete_valid_version_` is true and the
// version history is never edited in an atomic group, if only a suffix of
// L0 SST files and their associated blob files are missing.
// From the user's perspective, missing a suffix of L0 files means missing the
// most recently written data, so the remaining available files still present
// a valid point-in-time view, although of some earlier time.
// This validity check result will be cached and reused if the Version is not
// updated between two validity checks.
bool ValidVersionAvailable();
bool HasMissingFiles() const;
// When applying a sequence of VersionEdits, intermediate files are the ones
// that are added and then deleted. The caller should clear the returned
// intermediate file tracking after calling this API, so that the tracking for
// subsequent VersionEdits can start over with a clean state.
std::vector<std::string>& GetAndClearIntermediateFiles();
// Clears all the found files tracked for this Version.
void ClearFoundFiles();
// Save the Version in the save point to the provided `vstorage`.
// Non-OK status will be returned if there is not a valid save point.
Status SaveSavePointTo(VersionStorageInfo* vstorage) const;
// Load all the table handlers for the Version in the save point.
// Non-OK status will be returned if there is not a valid save point.
Status LoadSavePointTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
uint8_t block_protection_bytes_per_key);
void ClearSavePoint();
//======= End of APIs only used by VersionEditHandlerPointInTime ==========//
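The save-point APIs above snapshot the builder's state so that later edits can be applied tentatively while the last known-good Version can still be installed. A toy sketch of the same pattern, with the builder's state reduced to a single integer (the names mirror the declarations above, but this is not the real interface):

// Sketch only: save point as a copy of the state built so far.
#include <cassert>
#include <memory>

class TinyBuilder {
 public:
  void Apply(int delta) { state_ += delta; }

  // Snapshot the state built so far; later edits do not affect the snapshot.
  void CreateOrReplaceSavePoint() {
    savepoint_ = std::make_unique<int>(state_);
  }

  // "Install" from the save point if one exists; mirrors SaveSavePointTo()
  // returning non-OK when there is no valid save point.
  bool SaveSavePointTo(int* out) const {
    if (!savepoint_) {
      return false;
    }
    *out = *savepoint_;
    return true;
  }

  void ClearSavePoint() { savepoint_.reset(); }

 private:
  int state_ = 0;
  std::unique_ptr<int> savepoint_;
};

int main() {
  TinyBuilder b;
  b.Apply(5);
  b.CreateOrReplaceSavePoint();  // remember the last known-good state
  b.Apply(100);                  // a later edit that turns out to be unusable

  int installed = -1;
  assert(b.SaveSavePointTo(&installed));
  assert(installed == 5);  // the snapshot is unaffected by the later edit

  b.ClearSavePoint();
  assert(!b.SaveSavePointTo(&installed));
  return 0;
}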
private: private:
class Rep; class Rep;
std::unique_ptr<Rep> savepoint_;
std::unique_ptr<Rep> rep_; std::unique_ptr<Rep> rep_;
}; };
@ -62,8 +121,15 @@ class VersionBuilder {
// Both of the constructor and destructor need to be called inside DB Mutex. // Both of the constructor and destructor need to be called inside DB Mutex.
class BaseReferencedVersionBuilder { class BaseReferencedVersionBuilder {
public: public:
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd); explicit BaseReferencedVersionBuilder(
BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v); ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, Version* v,
VersionEditHandler* version_edit_handler = nullptr,
bool track_found_and_missing_files = false,
bool allow_incomplete_valid_version = false);
~BaseReferencedVersionBuilder(); ~BaseReferencedVersionBuilder();
VersionBuilder* version_builder() const { return version_builder_.get(); } VersionBuilder* version_builder() const { return version_builder_.get(); }
@ -71,23 +137,4 @@ class BaseReferencedVersionBuilder {
std::unique_ptr<VersionBuilder> version_builder_; std::unique_ptr<VersionBuilder> version_builder_;
Version* version_; Version* version_;
}; };
class NewestFirstBySeqNo {
public:
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
assert(lhs);
assert(rhs);
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
}
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
}
// Break ties by file number
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
}
};
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE


@ -155,6 +155,7 @@ VersionEditHandler::VersionEditHandler(
VersionSet* version_set, bool track_found_and_missing_files, VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer, bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files, const ReadOptions& read_options, bool skip_load_table_files,
bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement) EpochNumberRequirement epoch_number_requirement)
: VersionEditHandlerBase(read_options), : VersionEditHandlerBase(read_options),
read_only_(read_only), read_only_(read_only),
@ -165,6 +166,7 @@ VersionEditHandler::VersionEditHandler(
io_tracer_(io_tracer), io_tracer_(io_tracer),
skip_load_table_files_(skip_load_table_files), skip_load_table_files_(skip_load_table_files),
initialized_(false), initialized_(false),
allow_incomplete_valid_version_(allow_incomplete_valid_version),
epoch_number_requirement_(epoch_number_requirement) { epoch_number_requirement_(epoch_number_requirement) {
assert(version_set_ != nullptr); assert(version_set_ != nullptr);
} }
@ -218,15 +220,15 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit, Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
ColumnFamilyData** cfd) { ColumnFamilyData** cfd) {
bool cf_in_not_found = false; bool do_not_open_cf = false;
bool cf_in_builders = false; bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr); assert(cfd != nullptr);
*cfd = nullptr; *cfd = nullptr;
const std::string& cf_name = edit.GetColumnFamilyName(); const std::string& cf_name = edit.GetColumnFamilyName();
Status s; Status s;
if (cf_in_builders || cf_in_not_found) { if (cf_in_builders || do_not_open_cf) {
s = Status::Corruption("MANIFEST adding the same column family twice: " + s = Status::Corruption("MANIFEST adding the same column family twice: " +
cf_name); cf_name);
} }
@ -239,7 +241,7 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
cf_name.compare(kPersistentStatsColumnFamilyName) == 0; cf_name.compare(kPersistentStatsColumnFamilyName) == 0;
if (cf_options == name_to_options_.end() && if (cf_options == name_to_options_.end() &&
!is_persistent_stats_column_family) { !is_persistent_stats_column_family) {
column_families_not_found_.emplace(edit.GetColumnFamily(), cf_name); do_not_open_column_families_.emplace(edit.GetColumnFamily(), cf_name);
} else { } else {
if (is_persistent_stats_column_family) { if (is_persistent_stats_column_family) {
ColumnFamilyOptions cfo; ColumnFamilyOptions cfo;
@ -256,9 +258,9 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
ColumnFamilyData** cfd) { ColumnFamilyData** cfd) {
bool cf_in_not_found = false; bool do_not_open_cf = false;
bool cf_in_builders = false; bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr); assert(cfd != nullptr);
*cfd = nullptr; *cfd = nullptr;
@ -266,8 +268,8 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
Status s; Status s;
if (cf_in_builders) { if (cf_in_builders) {
tmp_cfd = DestroyCfAndCleanup(edit); tmp_cfd = DestroyCfAndCleanup(edit);
} else if (cf_in_not_found) { } else if (do_not_open_cf) {
column_families_not_found_.erase(edit.GetColumnFamily()); do_not_open_column_families_.erase(edit.GetColumnFamily());
} else { } else {
s = Status::Corruption("MANIFEST - dropping non-existing column family"); s = Status::Corruption("MANIFEST - dropping non-existing column family");
} }
@ -288,22 +290,20 @@ Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
ColumnFamilyData** cfd) { ColumnFamilyData** cfd) {
bool cf_in_not_found = false; bool do_not_open_cf = false;
bool cf_in_builders = false; bool cf_in_builders = false;
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
assert(cfd != nullptr); assert(cfd != nullptr);
*cfd = nullptr; *cfd = nullptr;
Status s; Status s;
if (!cf_in_not_found) { if (!do_not_open_cf) {
if (!cf_in_builders) { if (!cf_in_builders) {
s = Status::Corruption( s = Status::Corruption(
"MANIFEST record referencing unknown column family"); "MANIFEST record referencing unknown column family");
} }
ColumnFamilyData* tmp_cfd = nullptr; ColumnFamilyData* tmp_cfd = nullptr;
if (s.ok()) { if (s.ok()) {
auto builder_iter = builders_.find(edit.GetColumnFamily());
assert(builder_iter != builders_.end());
tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
edit.GetColumnFamily()); edit.GetColumnFamily());
assert(tmp_cfd != nullptr); assert(tmp_cfd != nullptr);
@ -318,56 +318,33 @@ Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false); s = MaybeCreateVersionBeforeApplyEdit(edit, tmp_cfd,
if (s.ok()) { /*force_create_version=*/false);
s = builder_iter->second->version_builder()->Apply(&edit);
}
} }
*cfd = tmp_cfd; *cfd = tmp_cfd;
} }
return s; return s;
} }
// TODO maybe cache the computation result
bool VersionEditHandler::HasMissingFiles() const {
bool ret = false;
for (const auto& elem : cf_to_missing_files_) {
const auto& missing_files = elem.second;
if (!missing_files.empty()) {
ret = true;
break;
}
}
if (!ret) {
for (const auto& elem : cf_to_missing_blob_files_high_) {
if (elem.second != kInvalidBlobFileNumber) {
ret = true;
break;
}
}
}
return ret;
}
void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
bool* cf_in_not_found, bool* do_not_open_cf,
bool* cf_in_builders) const { bool* cf_in_builders) const {
assert(cf_in_not_found != nullptr); assert(do_not_open_cf != nullptr);
assert(cf_in_builders != nullptr); assert(cf_in_builders != nullptr);
// Not found means that user didn't supply that column // Not found means that user didn't supply that column
// family option AND we encountered column family add // family option AND we encountered column family add
// record. Once we encounter column family drop record, // record. Once we encounter column family drop record,
// we will delete the column family from // we will delete the column family from
// column_families_not_found. // do_not_open_column_families_.
uint32_t cf_id = edit.GetColumnFamily(); uint32_t cf_id = edit.GetColumnFamily();
bool in_not_found = column_families_not_found_.find(cf_id) != bool in_do_not_open = do_not_open_column_families_.find(cf_id) !=
column_families_not_found_.end(); do_not_open_column_families_.end();
// in builders means that user supplied that column family // in builders means that user supplied that column family
// option AND that we encountered column family add record // option AND that we encountered column family add record
bool in_builders = builders_.find(cf_id) != builders_.end(); bool in_builders = builders_.find(cf_id) != builders_.end();
// They cannot both be true // They cannot both be true
assert(!(in_not_found && in_builders)); assert(!(in_do_not_open && in_builders));
*cf_in_not_found = in_not_found; *do_not_open_cf = in_do_not_open;
*cf_in_builders = in_builders; *cf_in_builders = in_builders;
} }
@ -396,9 +373,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
// There were some column families in the MANIFEST that weren't specified // There were some column families in the MANIFEST that weren't specified
// in the argument. This is OK in read_only mode // in the argument. This is OK in read_only mode
if (s->ok() && MustOpenAllColumnFamilies() && if (s->ok() && MustOpenAllColumnFamilies() &&
!column_families_not_found_.empty()) { !do_not_open_column_families_.empty()) {
std::string msg; std::string msg;
for (const auto& cf : column_families_not_found_) { for (const auto& cf : do_not_open_column_families_) {
msg.append(", "); msg.append(", ");
msg.append(cf.second); msg.append(cf.second);
} }
@ -453,7 +430,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
} }
assert(cfd->initialized()); assert(cfd->initialized());
VersionEdit edit; VersionEdit edit;
*s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true); *s = MaybeCreateVersionBeforeApplyEdit(edit, cfd,
/*force_create_version=*/true);
if (!s->ok()) { if (!s->ok()) {
break; break;
} }
@ -498,13 +476,9 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
assert(cfd != nullptr); assert(cfd != nullptr);
cfd->set_initialized(); cfd->set_initialized();
assert(builders_.find(cf_id) == builders_.end()); assert(builders_.find(cf_id) == builders_.end());
builders_.emplace(cf_id, builders_.emplace(cf_id, VersionBuilderUPtr(new BaseReferencedVersionBuilder(
VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd))); cfd, this, track_found_and_missing_files_,
if (track_found_and_missing_files_) { allow_incomplete_valid_version_)));
cf_to_found_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber);
}
return cfd; return cfd;
} }
@ -514,21 +488,6 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
auto builder_iter = builders_.find(cf_id); auto builder_iter = builders_.find(cf_id);
assert(builder_iter != builders_.end()); assert(builder_iter != builders_.end());
builders_.erase(builder_iter); builders_.erase(builder_iter);
if (track_found_and_missing_files_) {
auto found_files_iter = cf_to_found_files_.find(cf_id);
assert(found_files_iter != cf_to_found_files_.end());
cf_to_found_files_.erase(found_files_iter);
auto missing_files_iter = cf_to_missing_files_.find(cf_id);
assert(missing_files_iter != cf_to_missing_files_.end());
cf_to_missing_files_.erase(missing_files_iter);
auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(cf_id);
assert(missing_blob_files_high_iter !=
cf_to_missing_blob_files_high_.end());
cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
}
ColumnFamilyData* ret = ColumnFamilyData* ret =
version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id); version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id);
assert(ret != nullptr); assert(ret != nullptr);
@ -538,15 +497,14 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
return ret; return ret;
} }
Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, Status VersionEditHandler::MaybeCreateVersionBeforeApplyEdit(
ColumnFamilyData* cfd, const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
bool force_create_version) {
assert(cfd->initialized()); assert(cfd->initialized());
Status s; Status s;
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
auto* builder = builder_iter->second->version_builder();
if (force_create_version) { if (force_create_version) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
auto* builder = builder_iter->second->version_builder();
auto* v = new Version(cfd, version_set_, version_set_->file_options_, auto* v = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_, *cfd->GetLatestMutableCFOptions(), io_tracer_,
version_set_->current_version_number_++, version_set_->current_version_number_++,
@ -562,6 +520,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
delete v; delete v;
} }
} }
s = builder->Apply(&edit);
return s; return s;
} }
@ -731,12 +690,13 @@ Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles(
VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families, bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer, VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement) EpochNumberRequirement epoch_number_requirement)
: VersionEditHandler(read_only, column_families, version_set, : VersionEditHandler(read_only, column_families, version_set,
/*track_found_and_missing_files=*/true, /*track_found_and_missing_files=*/true,
/*no_error_if_files_missing=*/true, io_tracer, /*no_error_if_files_missing=*/true, io_tracer,
read_options, epoch_number_requirement) {} read_options, allow_incomplete_valid_version,
epoch_number_requirement) {}
VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
for (const auto& cfid_and_version : atomic_update_versions_) { for (const auto& cfid_and_version : atomic_update_versions_) {
@ -762,7 +722,8 @@ Status VersionEditHandlerPointInTime::OnAtomicGroupReplayBegin() {
assert(!cfd->IsDropped()); assert(!cfd->IsDropped());
assert(cfd->initialized()); assert(cfd->initialized());
VersionEdit edit; VersionEdit edit;
Status s = MaybeCreateVersion(edit, cfd, true /* force_create_version */); Status s = MaybeCreateVersionBeforeApplyEdit(
edit, cfd, true /* force_create_version */);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -824,17 +785,17 @@ void VersionEditHandlerPointInTime::CheckIterationResult(
} }
assert(cfd->initialized()); assert(cfd->initialized());
auto v_iter = versions_.find(cfd->GetID()); auto v_iter = versions_.find(cfd->GetID());
auto builder_iter = builders_.find(cfd->GetID());
if (v_iter != versions_.end()) { if (v_iter != versions_.end()) {
assert(v_iter->second != nullptr); assert(v_iter->second != nullptr);
assert(builder_iter != builders_.end());
version_set_->AppendVersion(cfd, v_iter->second); version_set_->AppendVersion(cfd, v_iter->second);
versions_.erase(v_iter); versions_.erase(v_iter);
// Let's clear found_files, since any files in that are part of the // Let's clear found_files, since any files in that are part of the
// installed Version. Any files that got obsoleted would have already // installed Version. Any files that got obsoleted would have already
// been moved to intermediate_files_ // been moved to intermediate_files_
auto found_files_iter = cf_to_found_files_.find(cfd->GetID()); builder_iter->second->version_builder()->ClearFoundFiles();
assert(found_files_iter != cf_to_found_files_.end());
found_files_iter->second.clear();
} }
} }
} else { } else {
@ -863,147 +824,50 @@ ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
return cfd; return cfd;
} }
Status VersionEditHandlerPointInTime::MaybeCreateVersion( Status VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit(
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1"); TEST_SYNC_POINT(
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"); "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1");
TEST_SYNC_POINT(
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2");
assert(cfd != nullptr); assert(cfd != nullptr);
if (!force_create_version) { if (!force_create_version) {
assert(edit.GetColumnFamily() == cfd->GetID()); assert(edit.GetColumnFamily() == cfd->GetID());
} }
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
assert(found_files_iter != cf_to_found_files_.end());
std::unordered_set<uint64_t>& found_files = found_files_iter->second;
auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
assert(missing_files_iter != cf_to_missing_files_.end());
std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(cfd->GetID());
assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
const uint64_t prev_missing_blob_file_high =
missing_blob_files_high_iter->second;
VersionBuilder* builder = nullptr;
if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder != nullptr);
}
// At this point, we have not yet applied the new version edits read from the
// MANIFEST. We check whether we have any missing table and blob files.
const bool prev_has_missing_files =
!missing_files.empty() ||
(prev_missing_blob_file_high != kInvalidBlobFileNumber &&
prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
for (const auto& file : edit.GetDeletedFiles()) {
uint64_t file_num = file.second;
auto fiter = missing_files.find(file_num);
if (fiter != missing_files.end()) {
missing_files.erase(fiter);
} else {
fiter = found_files.find(file_num);
// Only mark new files added during this catchup attempt for deletion.
// These files were never installed in VersionStorageInfo.
// Already referenced files that are deleted by a VersionEdit will
// be added to the VersionStorageInfo's obsolete files when the old
// version is dereferenced.
if (fiter != found_files.end()) {
intermediate_files_.emplace_back(
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num));
found_files.erase(fiter);
}
}
}
assert(!cfd->ioptions()->cf_paths.empty());
Status s;
for (const auto& elem : edit.GetNewFiles()) {
int level = elem.first;
const FileMetaData& meta = elem.second;
const FileDescriptor& fd = meta.fd;
uint64_t file_num = fd.GetNumber();
const std::string fpath =
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
s = VerifyFile(cfd, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_files.insert(file_num);
if (s.IsCorruption()) {
found_files.insert(file_num);
}
s = Status::OK();
} else if (!s.ok()) {
break;
} else {
found_files.insert(file_num);
}
}
uint64_t missing_blob_file_num = prev_missing_blob_file_high;
for (const auto& elem : edit.GetBlobFileAdditions()) {
uint64_t file_num = elem.GetBlobFileNumber();
s = VerifyBlobFile(cfd, file_num, elem);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_blob_file_num = std::max(missing_blob_file_num, file_num);
s = Status::OK();
} else if (!s.ok()) {
break;
}
}
bool has_missing_blob_files = false;
if (missing_blob_file_num != kInvalidBlobFileNumber &&
missing_blob_file_num >= prev_missing_blob_file_high) {
missing_blob_files_high_iter->second = missing_blob_file_num;
has_missing_blob_files = true;
} else if (missing_blob_file_num < prev_missing_blob_file_high) {
assert(false);
}
// We still have not applied the new version edit, but have tried to add new
// table and blob files after verifying their presence and consistency.
// Therefore, we know whether we will see new missing table and blob files
// later after actually applying the version edit. We perform the check here
// and record the result.
const bool has_missing_files =
!missing_files.empty() || has_missing_blob_files;
bool missing_info = !version_edit_params_.HasLogNumber() || bool missing_info = !version_edit_params_.HasLogNumber() ||
!version_edit_params_.HasNextFile() || !version_edit_params_.HasNextFile() ||
!version_edit_params_.HasLastSequence(); !version_edit_params_.HasLastSequence();
// Create version before apply edit. The version will represent the state Status s;
// before applying the version edit. auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
VersionBuilder* builder = builder_iter->second->version_builder();
const bool valid_pit_before_edit = builder->ValidVersionAvailable();
builder->CreateOrReplaceSavePoint();
s = builder->Apply(&edit);
const bool valid_pit_after_edit = builder->ValidVersionAvailable();
// A new version will be created if: // A new version will be created if:
// 1) no error has occurred so far, and // 1) no error has occurred so far, and
// 2) log_number_, next_file_number_ and last_sequence_ are known, and // 2) log_number_, next_file_number_ and last_sequence_ are known, and
// 3) not in an AtomicGroup // 3) not in an AtomicGroup
// 4) any of the following: // 4) any of the following:
// a) no missing file before, but will have missing file(s) after applying // a) a valid Version is available before applying the edit
// this version edit. // and a valid Version is not available after the edit.
// b) no missing file after applying the version edit, and the caller // b) a valid Version is available after the edit and the
// explicitly request that a new version be created. // caller explicitly requests that a new version be created.
if (s.ok() && !missing_info && !in_atomic_group_ && if (s.ok() && !missing_info && !in_atomic_group_ &&
((has_missing_files && !prev_has_missing_files) || ((!valid_pit_after_edit && valid_pit_before_edit) ||
(!has_missing_files && force_create_version))) { (valid_pit_after_edit && force_create_version))) {
if (!builder) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder);
}
const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions();
auto* version = new Version(cfd, version_set_, version_set_->file_options_, auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cf_opts_ptr, io_tracer_, *cf_opts_ptr, io_tracer_,
version_set_->current_version_number_++, version_set_->current_version_number_++,
epoch_number_requirement_); epoch_number_requirement_);
s = builder->LoadTableHandlers( s = builder->LoadSavePointTableHandlers(
cfd->internal_stats(), cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true, version_set_->db_options_->max_file_opening_threads, false, true,
cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr),
@ -1015,7 +879,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
} }
return s; return s;
} }
s = builder->SaveTo(version->storage_info()); s = builder->SaveSavePointTo(version->storage_info());
if (s.ok()) { if (s.ok()) {
if (AtomicUpdateVersionsContains(cfd->GetID())) { if (AtomicUpdateVersionsContains(cfd->GetID())) {
AtomicUpdateVersionsPut(version); AtomicUpdateVersionsPut(version);
@ -1038,6 +902,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
delete version; delete version;
} }
} }
builder->ClearSavePoint();
return s; return s;
} }
@ -1072,6 +938,15 @@ Status VersionEditHandlerPointInTime::LoadTables(
return Status::OK(); return Status::OK();
} }
bool VersionEditHandlerPointInTime::HasMissingFiles() const {
for (const auto& builder : builders_) {
if (builder.second->version_builder()->HasMissingFiles()) {
return true;
}
}
return false;
}
bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() { bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() {
return atomic_update_versions_missing_ == 0; return atomic_update_versions_missing_ == 0;
} }
@ -1145,8 +1020,9 @@ Status ManifestTailer::Initialize() {
Version* base_version = dummy_version->Next(); Version* base_version = dummy_version->Next();
assert(base_version); assert(base_version);
base_version->Ref(); base_version->Ref();
VersionBuilderUPtr new_builder( VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
new BaseReferencedVersionBuilder(default_cfd, base_version)); default_cfd, base_version, this, track_found_and_missing_files_,
allow_incomplete_valid_version_));
builder_iter->second = std::move(new_builder); builder_iter->second = std::move(new_builder);
initialized_ = true; initialized_ = true;
@ -1189,8 +1065,8 @@ Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
Version* base_version = dummy_version->Next(); Version* base_version = dummy_version->Next();
assert(base_version); assert(base_version);
base_version->Ref(); base_version->Ref();
VersionBuilderUPtr new_builder( VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
new BaseReferencedVersionBuilder(tmp_cfd, base_version)); tmp_cfd, base_version, this, track_found_and_missing_files_));
builder_iter->second = std::move(new_builder); builder_iter->second = std::move(new_builder);
#ifndef NDEBUG #ifndef NDEBUG
@ -1213,6 +1089,18 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
} }
} }
std::vector<std::string> ManifestTailer::GetAndClearIntermediateFiles() {
std::vector<std::string> res;
for (const auto& builder : builders_) {
auto files =
builder.second->version_builder()->GetAndClearIntermediateFiles();
res.insert(res.end(), std::make_move_iterator(files.begin()),
std::make_move_iterator(files.end()));
files.erase(files.begin(), files.end());
}
return res;
}
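For context, a minimal sketch (not part of the diff) of one catch-up pass over a ManifestTailer, using only members visible in this change; `tailer` and `reader` are assumed to be set up by the secondary/follower instance.
// Sketch only; parallels how ReactiveVersionSet::ReadAndApply() (further down
// in this diff) drains the tailer after replaying new MANIFEST records.
Status s;
tailer->Iterate(reader, &s);
// Column families whose state changed during this catch-up attempt.
auto cfds_changed = std::move(tailer->GetUpdatedColumnFamilies());
// Ownership of never-installed intermediate files moves to the caller, which
// may delete them; the tailer's internal list is cleared for the next attempt.
std::vector<std::string> files_to_delete =
    tailer->GetAndClearIntermediateFiles();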
Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd, Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath, int level, const std::string& fpath, int level,
const FileMetaData& fmeta) { const FileMetaData& fmeta) {

View File

@ -100,7 +100,9 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
// A class used for scanning MANIFEST file. // A class used for scanning MANIFEST file.
// VersionEditHandler reads a MANIFEST file, parses the version edits, and // VersionEditHandler reads a MANIFEST file, parses the version edits, and
// builds the version set's in-memory state, e.g. the version storage info for // builds the version set's in-memory state, e.g. the version storage info for
// the versions of column families. // the versions of column families. It replays all the version edits in one
// MANIFEST file to build the end version.
//
// To use this class and its subclasses, // To use this class and its subclasses,
// 1. Create an object of VersionEditHandler or its subclasses. // 1. Create an object of VersionEditHandler or its subclasses.
// VersionEditHandler handler(read_only, column_families, version_set, // VersionEditHandler handler(read_only, column_families, version_set,
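As a quick reference, a minimal usage sketch of the steps listed above, modeled on the VersionSet::Recover() call site later in this diff; `reader` and the constructor arguments are assumed to already be set up by the caller.
// Sketch only; mirrors the Iterate()/status() sequence used by
// VersionSet::Recover() in this diff.
VersionEditHandler handler(
    read_only, column_families, version_set,
    /*track_found_and_missing_files=*/false,
    /*no_error_if_files_missing=*/false, io_tracer, read_options,
    /*allow_incomplete_valid_version=*/false);
Status log_read_status;
handler.Iterate(reader, &log_read_status);  // replay all edits in the MANIFEST
Status s = handler.status();                // overall result of the replay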
@ -119,13 +121,14 @@ class VersionEditHandler : public VersionEditHandlerBase {
VersionSet* version_set, bool track_found_and_missing_files, VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing, bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent) EpochNumberRequirement::kMustPresent)
: VersionEditHandler(read_only, column_families, version_set, : VersionEditHandler(read_only, column_families, version_set,
track_found_and_missing_files, track_found_and_missing_files,
no_error_if_files_missing, io_tracer, read_options, no_error_if_files_missing, io_tracer, read_options,
/*skip_load_table_files=*/false, /*skip_load_table_files=*/false,
allow_incomplete_valid_version,
epoch_number_requirement) {} epoch_number_requirement) {}
~VersionEditHandler() override {} ~VersionEditHandler() override {}
@ -134,14 +137,24 @@ class VersionEditHandler : public VersionEditHandlerBase {
return version_edit_params_; return version_edit_params_;
} }
bool HasMissingFiles() const;
void GetDbId(std::string* db_id) const { void GetDbId(std::string* db_id) const {
if (db_id && version_edit_params_.HasDbId()) { if (db_id && version_edit_params_.HasDbId()) {
*db_id = version_edit_params_.GetDbId(); *db_id = version_edit_params_.GetDbId();
} }
} }
virtual Status VerifyFile(ColumnFamilyData* /*cfd*/,
const std::string& /*fpath*/, int /*level*/,
const FileMetaData& /*fmeta*/) {
return Status::OK();
}
virtual Status VerifyBlobFile(ColumnFamilyData* /*cfd*/,
uint64_t /*blob_file_num*/,
const BlobFileAddition& /*blob_addition*/) {
return Status::OK();
}
protected: protected:
explicit VersionEditHandler( explicit VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families, bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
@ -149,6 +162,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
bool no_error_if_files_missing, bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files, const ReadOptions& read_options, bool skip_load_table_files,
bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent); EpochNumberRequirement::kMustPresent);
@ -166,7 +180,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
Status Initialize() override; Status Initialize() override;
void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found, void CheckColumnFamilyId(const VersionEdit& edit, bool* do_not_open_cf,
bool* cf_in_builders) const; bool* cf_in_builders) const;
void CheckIterationResult(const log::Reader& reader, Status* s) override; void CheckIterationResult(const log::Reader& reader, Status* s) override;
@ -176,9 +190,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit); virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
virtual Status MaybeCreateVersion(const VersionEdit& edit, virtual Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
ColumnFamilyData* cfd, ColumnFamilyData* cfd,
bool force_create_version); bool force_create_version);
virtual Status LoadTables(ColumnFamilyData* cfd, virtual Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache, bool prefetch_index_and_filter_in_cache,
@ -191,21 +205,23 @@ class VersionEditHandler : public VersionEditHandlerBase {
VersionSet* version_set_; VersionSet* version_set_;
std::unordered_map<uint32_t, VersionBuilderUPtr> builders_; std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_; std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
// Keeps track of column families in manifest that were not found in
// column families parameters. if those column families are not dropped
// by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> column_families_not_found_;
VersionEditParams version_edit_params_;
const bool track_found_and_missing_files_; const bool track_found_and_missing_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>> cf_to_found_files_; // Keeps track of column families in manifest that were not found in
std::unordered_map<uint32_t, std::unordered_set<uint64_t>> // column families parameters. Namely, the user asks to not open these column
cf_to_missing_files_; // families. In non read only mode, if those column families are not dropped
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_; // by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> do_not_open_column_families_;
VersionEditParams version_edit_params_;
bool no_error_if_files_missing_; bool no_error_if_files_missing_;
std::shared_ptr<IOTracer> io_tracer_; std::shared_ptr<IOTracer> io_tracer_;
bool skip_load_table_files_; bool skip_load_table_files_;
bool initialized_; bool initialized_;
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_; std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
// If false, only a complete Version for which all of its files can be
// found is considered a valid Version. If true, besides a complete Version, an
// incomplete Version with only a suffix of L0 files missing is also
// considered valid if the Version is never edited in an atomic group.
const bool allow_incomplete_valid_version_;
EpochNumberRequirement epoch_number_requirement_; EpochNumberRequirement epoch_number_requirement_;
std::unordered_set<uint32_t> cfds_to_mark_no_udt_; std::unordered_set<uint32_t> cfds_to_mark_no_udt_;
@ -226,8 +242,18 @@ class VersionEditHandler : public VersionEditHandlerBase {
}; };
// A class similar to its base class, i.e. VersionEditHandler. // A class similar to its base class, i.e. VersionEditHandler.
// VersionEditHandlerPointInTime restores the versions to the most recent point // Unlike VersionEditHandler that only aims to build the end version, this class
// in time such that at this point, the version does not have missing files. // supports building the most recent point in time version. A point in time
// version is a version for which no files are missing, or if
// `allow_incomplete_valid_version` is true, only a suffix of L0 files (and
// their associated blob files) are missing.
//
// Building a point in time version when end version is not available can
// be useful for best efforts recovery (options.best_efforts_recovery), which
// uses this class and sets `allow_incomplete_valid_version` to true.
// It's also useful for secondary instances/follower instances for which end
// version could be transiently unavailable. These two cases use the subclass
// `ManifestTailer`, which sets `allow_incomplete_valid_version` to false.
// //
// Not thread-safe, external synchronization is necessary if an object of // Not thread-safe, external synchronization is necessary if an object of
// VersionEditHandlerPointInTime is shared by multiple threads. // VersionEditHandlerPointInTime is shared by multiple threads.
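A hedged sketch of how this handler is typically driven, modeled on the VersionSet::TryRecoverFromOneManifest() call sites later in this diff; `reader` and the constructor arguments are assumed to be in scope.
// Sketch only. With allow_incomplete_valid_version=true (best efforts
// recovery), a missing suffix of L0 files and their blob files is tolerated.
VersionEditHandlerPointInTime handler_pit(
    read_only, column_families, version_set, io_tracer, read_options,
    /*allow_incomplete_valid_version=*/true,
    EpochNumberRequirement::kMightMissing);
Status s;
handler_pit.Iterate(reader, &s);
if (handler_pit.HasMissingFiles()) {
  // Some referenced table/blob files were not found; the recovered state is
  // the most recent valid point in time rather than the end version.
}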
@ -236,28 +262,32 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
VersionEditHandlerPointInTime( VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families, bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer, VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, const ReadOptions& read_options, bool allow_incomplete_valid_version,
EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent); EpochNumberRequirement::kMustPresent);
~VersionEditHandlerPointInTime() override; ~VersionEditHandlerPointInTime() override;
bool HasMissingFiles() const;
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& fmeta) override;
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition) override;
protected: protected:
Status OnAtomicGroupReplayBegin() override; Status OnAtomicGroupReplayBegin() override;
Status OnAtomicGroupReplayEnd() override; Status OnAtomicGroupReplayEnd() override;
void CheckIterationResult(const log::Reader& reader, Status* s) override; void CheckIterationResult(const log::Reader& reader, Status* s) override;
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
// `MaybeCreateVersion(..., false)` creates a version upon a negative edge // `MaybeCreateVersionBeforeApplyEdit(..., false)` creates a version upon a
// trigger (transition from valid to invalid). // negative edge trigger (transition from valid to invalid).
// //
// `MaybeCreateVersion(..., true)` creates a version on a positive level // `MaybeCreateVersionBeforeApplyEdit(..., true)` creates a version on a
// trigger (state is valid). // positive level trigger (state is valid).
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
bool force_create_version) override; ColumnFamilyData* cfd,
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, bool force_create_version) override;
int level, const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition);
Status LoadTables(ColumnFamilyData* cfd, Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache, bool prefetch_index_and_filter_in_cache,
@ -275,8 +305,6 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
bool in_atomic_group_ = false; bool in_atomic_group_ = false;
std::vector<std::string> intermediate_files_;
private: private:
bool AtomicUpdateVersionsCompleted(); bool AtomicUpdateVersionsCompleted();
bool AtomicUpdateVersionsContains(uint32_t cfid); bool AtomicUpdateVersionsContains(uint32_t cfid);
@ -292,6 +320,12 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
void AtomicUpdateVersionsApply(); void AtomicUpdateVersionsApply();
}; };
// A class similar to `VersionEditHandlerPointInTime` that parses MANIFEST and
// builds a point in time version.
// `ManifestTailer` supports reading one MANIFEST file in multiple tailing
// attempts and supports switching to a different MANIFEST after
// `PrepareToReadNewManifest` is called. This class is used by secondary and
// follower instances.
class ManifestTailer : public VersionEditHandlerPointInTime { class ManifestTailer : public VersionEditHandlerPointInTime {
public: public:
explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families, explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
@ -302,9 +336,13 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
EpochNumberRequirement::kMustPresent) EpochNumberRequirement::kMustPresent)
: VersionEditHandlerPointInTime(/*read_only=*/false, column_families, : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
version_set, io_tracer, read_options, version_set, io_tracer, read_options,
/*allow_incomplete_valid_version=*/false,
epoch_number_requirement), epoch_number_requirement),
mode_(Mode::kRecovery) {} mode_(Mode::kRecovery) {}
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
void PrepareToReadNewManifest() { void PrepareToReadNewManifest() {
initialized_ = false; initialized_ = false;
ClearReadBuffer(); ClearReadBuffer();
@ -314,9 +352,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
return cfds_changed_; return cfds_changed_;
} }
std::vector<std::string>& GetIntermediateFiles() { std::vector<std::string> GetAndClearIntermediateFiles();
return intermediate_files_;
}
protected: protected:
Status Initialize() override; Status Initialize() override;
@ -329,9 +365,6 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
void CheckIterationResult(const log::Reader& reader, Status* s) override; void CheckIterationResult(const log::Reader& reader, Status* s) override;
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
enum Mode : uint8_t { enum Mode : uint8_t {
kRecovery = 0, kRecovery = 0,
kCatchUp = 1, kCatchUp = 1,
@ -352,7 +385,9 @@ class DumpManifestHandler : public VersionEditHandler {
/*read_only=*/true, column_families, version_set, /*read_only=*/true, column_families, version_set,
/*track_found_and_missing_files=*/false, /*track_found_and_missing_files=*/false,
/*no_error_if_files_missing=*/false, io_tracer, read_options, /*no_error_if_files_missing=*/false, io_tracer, read_options,
/*skip_load_table_files=*/true), /*skip_load_table_files=*/true,
/*allow_incomplete_valid_version=*/false,
/*epoch_number_requirement=*/EpochNumberRequirement::kMustPresent),
verbose_(verbose), verbose_(verbose),
hex_(hex), hex_(hex),
json_(json), json_(json),

View File

@ -5511,6 +5511,10 @@ Status VersionSet::ProcessManifestWrites(
std::unique_ptr<log::Writer> new_desc_log_ptr; std::unique_ptr<log::Writer> new_desc_log_ptr;
{ {
FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
// DB option (in file_options_) takes precedence when not kUnknown
if (file_options_.temperature != Temperature::kUnknown) {
opt_file_opts.temperature = file_options_.temperature;
}
mu->Unlock(); mu->Unlock();
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
@ -5637,9 +5641,9 @@ Status VersionSet::ProcessManifestWrites(
assert(manifest_io_status.ok()); assert(manifest_io_status.ok());
} }
if (s.ok() && new_descriptor_log) { if (s.ok() && new_descriptor_log) {
io_s = SetCurrentFile(write_options, fs_.get(), dbname_, io_s = SetCurrentFile(
pending_manifest_file_number_, write_options, fs_.get(), dbname_, pending_manifest_file_number_,
dir_contains_current_file); file_options_.temperature, dir_contains_current_file);
if (!io_s.ok()) { if (!io_s.ok()) {
s = io_s; s = io_s;
// Quarantine old manifest file in case new manifest file's CURRENT file // Quarantine old manifest file in case new manifest file's CURRENT file
@ -6080,7 +6084,8 @@ Status VersionSet::Recover(
VersionEditHandler handler( VersionEditHandler handler(
read_only, column_families, const_cast<VersionSet*>(this), read_only, column_families, const_cast<VersionSet*>(this),
/*track_found_and_missing_files=*/false, no_error_if_files_missing, /*track_found_and_missing_files=*/false, no_error_if_files_missing,
io_tracer_, read_options, EpochNumberRequirement::kMightMissing); io_tracer_, read_options, /*allow_incomplete_valid_version=*/false,
EpochNumberRequirement::kMightMissing);
handler.Iterate(reader, &log_read_status); handler.Iterate(reader, &log_read_status);
s = handler.status(); s = handler.status();
if (s.ok()) { if (s.ok()) {
@ -6256,7 +6261,8 @@ Status VersionSet::TryRecoverFromOneManifest(
/*checksum=*/true, /*log_num=*/0); /*checksum=*/true, /*log_num=*/0);
VersionEditHandlerPointInTime handler_pit( VersionEditHandlerPointInTime handler_pit(
read_only, column_families, const_cast<VersionSet*>(this), io_tracer_, read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
read_options, EpochNumberRequirement::kMightMissing); read_options, /*allow_incomplete_valid_version=*/true,
EpochNumberRequirement::kMightMissing);
handler_pit.Iterate(reader, &s); handler_pit.Iterate(reader, &s);
@ -7477,7 +7483,7 @@ Status ReactiveVersionSet::ReadAndApply(
*cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
} }
if (files_to_delete) { if (files_to_delete) {
*files_to_delete = std::move(manifest_tailer_->GetIntermediateFiles()); *files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles();
} }
return s; return s;

View File

@ -1277,6 +1277,15 @@ class VersionSet {
bool no_error_if_files_missing = false, bool is_retry = false, bool no_error_if_files_missing = false, bool is_retry = false,
Status* log_status = nullptr); Status* log_status = nullptr);
// Do a best-efforts recovery (Options.best_efforts_recovery=true) from all
// available MANIFEST files. Similar to `Recover` with these differences:
// 1) not only the latest MANIFEST can be used: if it's not available, or
// no successful recovery can be achieved with it, this function also tries
// to recover from previous MANIFEST files, in reverse chronological order,
// until a successful recovery can be achieved.
// 2) this function doesn't just aim to recover to the latest version; if that
// is not available, the most recent point in time version will be saved in
// memory. Check the doc for `VersionEditHandlerPointInTime` for more details.
Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families, Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only, bool read_only,
const std::vector<std::string>& files_in_dbname, const std::vector<std::string>& files_in_dbname,
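Illustrative only: the retry order described in the comment above, expressed with the per-manifest helper TryRecoverFromOneManifest() whose call shape appears in the tests below; `manifest_paths` (newest first) is an assumed input, not an actual parameter of TryRecover().
// Sketch only, not the actual TryRecover() implementation.
Status s;
std::string db_id;
bool has_missing_table_file = false;
for (const std::string& manifest_path : manifest_paths) {
  s = versions->TryRecoverFromOneManifest(manifest_path, column_families,
                                          /*read_only=*/false, &db_id,
                                          &has_missing_table_file);
  if (s.ok()) {
    break;  // recovered to the most recent point in time this MANIFEST allows
  }
}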

View File

@ -11,6 +11,7 @@
#include <algorithm> #include <algorithm>
#include "db/blob/blob_log_writer.h"
#include "db/db_impl/db_impl.h" #include "db/db_impl/db_impl.h"
#include "db/db_test_util.h" #include "db/db_test_util.h"
#include "db/log_writer.h" #include "db/log_writer.h"
@ -1345,18 +1346,27 @@ class VersionSetTestBase {
std::string key; // the only key std::string key; // the only key
int level = 0; int level = 0;
uint64_t epoch_number; uint64_t epoch_number;
bool file_missing = false;
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
SstInfo(uint64_t file_num, const std::string& cf_name, SstInfo(uint64_t file_num, const std::string& cf_name,
const std::string& _key, const std::string& _key,
uint64_t _epoch_number = kUnknownEpochNumber) uint64_t _epoch_number = kUnknownEpochNumber,
: SstInfo(file_num, cf_name, _key, 0, _epoch_number) {} bool _file_missing = false,
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
: SstInfo(file_num, cf_name, _key, 0, _epoch_number, _file_missing,
_oldest_blob_file_number) {}
SstInfo(uint64_t file_num, const std::string& cf_name, SstInfo(uint64_t file_num, const std::string& cf_name,
const std::string& _key, int lvl, const std::string& _key, int lvl,
uint64_t _epoch_number = kUnknownEpochNumber) uint64_t _epoch_number = kUnknownEpochNumber,
bool _file_missing = false,
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
: file_number(file_num), : file_number(file_num),
column_family(cf_name), column_family(cf_name),
key(_key), key(_key),
level(lvl), level(lvl),
epoch_number(_epoch_number) {} epoch_number(_epoch_number),
file_missing(_file_missing),
oldest_blob_file_number(_oldest_blob_file_number) {}
}; };
// Create dummy sst, return their metadata. Note that only file name and size // Create dummy sst, return their metadata. Note that only file name and size
@ -1395,22 +1405,32 @@ class VersionSetTestBase {
ASSERT_NE(0, file_size); ASSERT_NE(0, file_size);
file_metas->emplace_back( file_metas->emplace_back(
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
Temperature::kUnknown, 0, 0, 0, info.epoch_number, Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
0, 0, /* user_defined_timestamps_persisted */ true); kNullUniqueId64x2, 0, 0,
/* user_defined_timestamps_persisted */ true);
if (info.file_missing) {
ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
}
} }
} }
void CreateCurrentFile() {
// Make "CURRENT" file point to the new manifest file.
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown,
/* dir_contains_current_file */ nullptr));
}
// Create DB with 3 column families. // Create DB with 3 column families.
void NewDB() { void NewDB() {
SequenceNumber last_seqno; SequenceNumber last_seqno;
std::unique_ptr<log::Writer> log_writer; std::unique_ptr<log::Writer> log_writer;
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
PrepareManifest(&column_families_, &last_seqno, &log_writer); PrepareManifest(&column_families_, &last_seqno, &log_writer);
log_writer.reset(); log_writer.reset();
// Make "CURRENT" file point to the new manifest file. CreateCurrentFile();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
ASSERT_OK(s);
EXPECT_OK(versions_->Recover(column_families_, false)); EXPECT_OK(versions_->Recover(column_families_, false));
EXPECT_EQ(column_families_.size(), EXPECT_EQ(column_families_.size(),
@ -2586,7 +2606,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
edits_[i].MarkAtomicGroup(--remaining); edits_[i].MarkAtomicGroup(--remaining);
edits_[i].SetLastSequence(last_seqno_++); edits_[i].SetLastSequence(last_seqno_++);
} }
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); CreateCurrentFile();
} }
void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
@ -2598,7 +2618,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
edits_[i].MarkAtomicGroup(--remaining); edits_[i].MarkAtomicGroup(--remaining);
edits_[i].SetLastSequence(last_seqno_++); edits_[i].SetLastSequence(last_seqno_++);
} }
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); CreateCurrentFile();
} }
void SetupCorruptedAtomicGroup(int atomic_group_size) { void SetupCorruptedAtomicGroup(int atomic_group_size) {
@ -2612,7 +2632,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
} }
edits_[i].SetLastSequence(last_seqno_++); edits_[i].SetLastSequence(last_seqno_++);
} }
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); CreateCurrentFile();
} }
void SetupIncorrectAtomicGroup(int atomic_group_size) { void SetupIncorrectAtomicGroup(int atomic_group_size) {
@ -2628,7 +2648,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
} }
edits_[i].SetLastSequence(last_seqno_++); edits_[i].SetLastSequence(last_seqno_++);
} }
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); CreateCurrentFile();
} }
void SetupTestSyncPoints() { void SetupTestSyncPoints() {
@ -3394,8 +3414,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
SequenceNumber last_seqno; SequenceNumber last_seqno;
std::unique_ptr<log::Writer> log_writer; std::unique_ptr<log::Writer> log_writer;
PrepareManifest(&column_families, &last_seqno, &log_writer); PrepareManifest(&column_families, &last_seqno, &log_writer);
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); CreateCurrentFile();
ASSERT_OK(s);
EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
EXPECT_EQ(column_families.size(), EXPECT_EQ(column_families.size(),
@ -3417,7 +3436,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
cfd_to_drop->Ref(); cfd_to_drop->Ref();
drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
mutex_.Lock(); mutex_.Lock();
s = versions_->LogAndApply( Status s = versions_->LogAndApply(
cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options, cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options,
write_options, &drop_cf_edit, &mutex_, nullptr); write_options, &drop_cf_edit, &mutex_, nullptr);
mutex_.Unlock(); mutex_.Unlock();
@ -3527,9 +3546,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase,
TEST_F(EmptyDefaultCfNewManifest, Recover) { TEST_F(EmptyDefaultCfNewManifest, Recover) {
PrepareManifest(nullptr, nullptr, &log_writer_); PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset(); log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
std::vector<ColumnFamilyDescriptor> column_families; std::vector<ColumnFamilyDescriptor> column_families;
@ -3538,7 +3555,7 @@ TEST_F(EmptyDefaultCfNewManifest, Recover) {
cf_options_); cf_options_);
std::string db_id; std::string db_id;
bool has_missing_table_file = false; bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest( Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families, false, &db_id, &has_missing_table_file); manifest_path, column_families, false, &db_id, &has_missing_table_file);
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_FALSE(has_missing_table_file); ASSERT_FALSE(has_missing_table_file);
@ -3559,7 +3576,8 @@ class VersionSetTestEmptyDb
assert(nullptr != log_writer); assert(nullptr != log_writer);
VersionEdit new_db; VersionEdit new_db;
if (db_options_.write_dbid_to_manifest) { if (db_options_.write_dbid_to_manifest) {
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
Temperature::kUnknown));
DBOptions tmp_db_options; DBOptions tmp_db_options;
tmp_db_options.env = env_; tmp_db_options.env = env_;
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_)); std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@ -3592,9 +3610,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_); PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset(); log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
@ -3609,9 +3625,9 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
std::string db_id; std::string db_id;
bool has_missing_table_file = false; bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, Status s = versions_->TryRecoverFromOneManifest(
read_only, &db_id, manifest_path, column_families, read_only, &db_id,
&has_missing_table_file); &has_missing_table_file);
auto iter = auto iter =
std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
if (iter == cf_names.end()) { if (iter == cf_names.end()) {
@ -3637,9 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
ASSERT_OK(s); ASSERT_OK(s);
} }
log_writer_.reset(); log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
@ -3685,9 +3699,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
ASSERT_OK(s); ASSERT_OK(s);
} }
log_writer_.reset(); log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
@ -3744,9 +3756,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
ASSERT_OK(s); ASSERT_OK(s);
} }
log_writer_.reset(); log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
@ -3802,9 +3812,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
ASSERT_OK(s); ASSERT_OK(s);
} }
log_writer_.reset(); log_writer_.reset();
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, CreateCurrentFile();
/* dir_contains_current_file */ nullptr);
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
@ -3869,8 +3877,9 @@ INSTANTIATE_TEST_CASE_P(
class VersionSetTestMissingFiles : public VersionSetTestBase, class VersionSetTestMissingFiles : public VersionSetTestBase,
public testing::Test { public testing::Test {
public: public:
VersionSetTestMissingFiles() explicit VersionSetTestMissingFiles(
: VersionSetTestBase("version_set_test_missing_files"), const std::string& test_name = "version_set_test_missing_files")
: VersionSetTestBase(test_name),
internal_comparator_( internal_comparator_(
std::make_shared<InternalKeyComparator>(options_.comparator)) {} std::make_shared<InternalKeyComparator>(options_.comparator)) {}
@ -3947,7 +3956,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
// This method updates last_sequence_. // This method updates last_sequence_.
void WriteFileAdditionAndDeletionToManifest( void WriteFileAdditionAndDeletionToManifest(
uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files, uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
const std::vector<std::pair<int, uint64_t>>& deleted_files) { const std::vector<std::pair<int, uint64_t>>& deleted_files,
const std::vector<BlobFileAddition>& blob_files = {}) {
VersionEdit edit; VersionEdit edit;
edit.SetColumnFamily(cf); edit.SetColumnFamily(cf);
for (const auto& elem : added_files) { for (const auto& elem : added_files) {
@ -3958,6 +3968,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
int level = elem.first; int level = elem.first;
edit.DeleteFile(level, elem.second); edit.DeleteFile(level, elem.second);
} }
for (const auto& elem : blob_files) {
edit.AddBlobFile(elem);
}
edit.SetLastSequence(last_seqno_); edit.SetLastSequence(last_seqno_);
++last_seqno_; ++last_seqno_;
assert(log_writer_.get() != nullptr); assert(log_writer_.get() != nullptr);
@ -4006,15 +4019,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
WriteFileAdditionAndDeletionToManifest( WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files); /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
log_writer_.reset(); log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); CreateCurrentFile();
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
std::string db_id; std::string db_id;
bool has_missing_table_file = false; bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, Status s = versions_->TryRecoverFromOneManifest(
/*read_only=*/false, &db_id, manifest_path, column_families_,
&has_missing_table_file); /*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_TRUE(has_missing_table_file); ASSERT_TRUE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4064,15 +4076,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
WriteFileAdditionAndDeletionToManifest( WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>()); /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
log_writer_.reset(); log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); CreateCurrentFile();
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
std::string db_id; std::string db_id;
bool has_missing_table_file = false; bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, Status s = versions_->TryRecoverFromOneManifest(
/*read_only=*/false, &db_id, manifest_path, column_families_,
&has_missing_table_file); /*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_TRUE(has_missing_table_file); ASSERT_TRUE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4118,15 +4129,14 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
WriteFileAdditionAndDeletionToManifest( WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files); /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
log_writer_.reset(); log_writer_.reset();
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); CreateCurrentFile();
ASSERT_OK(s);
std::string manifest_path; std::string manifest_path;
VerifyManifest(&manifest_path); VerifyManifest(&manifest_path);
std::string db_id; std::string db_id;
bool has_missing_table_file = false; bool has_missing_table_file = false;
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, Status s = versions_->TryRecoverFromOneManifest(
/*read_only=*/false, &db_id, manifest_path, column_families_,
&has_missing_table_file); /*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_FALSE(has_missing_table_file); ASSERT_FALSE(has_missing_table_file);
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
@ -4171,6 +4181,250 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
} }
} }
class BestEffortsRecoverIncompleteVersionTest
: public VersionSetTestMissingFiles {
public:
BestEffortsRecoverIncompleteVersionTest()
: VersionSetTestMissingFiles("best_efforts_recover_incomplete_version") {}
struct BlobInfo {
uint64_t file_number;
bool file_missing;
std::string key;
std::string blob;
BlobInfo(uint64_t _file_number, bool _file_missing, std::string _key,
std::string _blob)
: file_number(_file_number),
file_missing(_file_missing),
key(_key),
blob(_blob) {}
};
void CreateDummyBlobFiles(const std::vector<BlobInfo>& infos,
std::vector<BlobFileAddition>* blob_metas) {
for (const auto& info : infos) {
if (!info.file_missing) {
WriteDummyBlobFile(info.file_number, info.key, info.blob);
}
blob_metas->emplace_back(
info.file_number, 1 /*total_blob_count*/,
info.key.size() + info.blob.size() /*total_blob_bytes*/,
"" /*checksum_method*/, "" /*check_sum_value*/);
}
}
// Creates a test blob file that is valid so it can pass the
// `VersionEditHandlerPointInTime::VerifyBlobFile` check.
void WriteDummyBlobFile(uint64_t blob_file_number, const Slice& key,
const Slice& blob) {
ImmutableOptions options;
std::string blob_file_path = BlobFileName(dbname_, blob_file_number);
std::unique_ptr<FSWritableFile> file;
ASSERT_OK(
fs_->NewWritableFile(blob_file_path, FileOptions(), &file, nullptr));
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
std::move(file), blob_file_path, FileOptions(), options.clock));
BlobLogWriter blob_log_writer(std::move(file_writer), options.clock,
/*statistics*/ nullptr, blob_file_number,
/*use_fsync*/ true,
/*do_flush*/ false);
constexpr ExpirationRange expiration_range;
BlobLogHeader header(/*column_family_id*/ 0, kNoCompression,
/*has_ttl*/ false, expiration_range);
ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
std::string compressed_blob;
uint64_t key_offset = 0;
uint64_t blob_offset = 0;
ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset,
&blob_offset));
BlobLogFooter footer;
footer.blob_count = 1;
footer.expiration_range = expiration_range;
std::string checksum_method;
std::string checksum_value;
ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer,
&checksum_method, &checksum_value));
}
void RecoverFromManifestWithMissingFiles(
const std::vector<std::pair<int, FileMetaData>>& added_files,
const std::vector<BlobFileAddition>& blob_files) {
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
WriteFileAdditionAndDeletionToManifest(
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>(),
blob_files);
log_writer_.reset();
CreateCurrentFile();
std::string manifest_path;
VerifyManifest(&manifest_path);
std::string db_id;
bool has_missing_table_file = false;
Status s = versions_->TryRecoverFromOneManifest(
manifest_path, column_families_,
/*read_only=*/false, &db_id, &has_missing_table_file);
ASSERT_OK(s);
ASSERT_TRUE(has_missing_table_file);
}
};
TEST_F(BestEffortsRecoverIncompleteVersionTest, NonL0MissingFiles) {
std::vector<SstInfo> sst_files = {
SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
100 /* epoch_number */, true /* file_missing */),
SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
101 /* epoch_number */, false /* file_missing */),
SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
102 /* epoch_number */, false /* file_missing */),
};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(sst_files, &file_metas);
std::vector<std::pair<int, FileMetaData>> added_files;
for (size_t i = 0; i < sst_files.size(); i++) {
const auto& info = sst_files[i];
const auto& meta = file_metas[i];
added_files.emplace_back(info.level, meta);
}
RecoverFromManifestWithMissingFiles(added_files,
std::vector<BlobFileAddition>());
std::vector<uint64_t> all_table_files;
std::vector<uint64_t> all_blob_files;
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
ASSERT_TRUE(all_table_files.empty());
}
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingNonSuffixL0Files) {
std::vector<SstInfo> sst_files = {
SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
100 /* epoch_number */, false /* file_missing */),
SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
101 /* epoch_number */, true /* file_missing */),
SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
102 /* epoch_number */, false /* file_missing */),
};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(sst_files, &file_metas);
std::vector<std::pair<int, FileMetaData>> added_files;
for (size_t i = 0; i < sst_files.size(); i++) {
const auto& info = sst_files[i];
const auto& meta = file_metas[i];
added_files.emplace_back(info.level, meta);
}
RecoverFromManifestWithMissingFiles(added_files,
std::vector<BlobFileAddition>());
std::vector<uint64_t> all_table_files;
std::vector<uint64_t> all_blob_files;
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
ASSERT_TRUE(all_table_files.empty());
}
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingBlobFiles) {
std::vector<SstInfo> sst_files = {
SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
100 /* epoch_number */, false /* file_missing */,
102 /*oldest_blob_file_number*/),
SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
101 /* epoch_number */, false /* file_missing */,
103 /*oldest_blob_file_number*/),
};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(sst_files, &file_metas);
std::vector<BlobInfo> blob_files = {
BlobInfo(102, true /*file_missing*/, "a", "blob1"),
BlobInfo(103, true /*file_missing*/, "a", "blob2"),
};
std::vector<BlobFileAddition> blob_meta;
CreateDummyBlobFiles(blob_files, &blob_meta);
std::vector<std::pair<int, FileMetaData>> added_files;
for (size_t i = 0; i < sst_files.size(); i++) {
const auto& info = sst_files[i];
const auto& meta = file_metas[i];
added_files.emplace_back(info.level, meta);
}
RecoverFromManifestWithMissingFiles(added_files, blob_meta);
std::vector<uint64_t> all_table_files;
std::vector<uint64_t> all_blob_files;
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
ASSERT_TRUE(all_table_files.empty());
}
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingL0SuffixOnly) {
std::vector<SstInfo> sst_files = {
SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
100 /* epoch_number */, false /* file_missing */),
SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
101 /* epoch_number */, false /* file_missing */),
SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
102 /* epoch_number */, true /* file_missing */),
};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(sst_files, &file_metas);
std::vector<std::pair<int, FileMetaData>> added_files;
for (size_t i = 0; i < sst_files.size(); i++) {
const auto& info = sst_files[i];
const auto& meta = file_metas[i];
added_files.emplace_back(info.level, meta);
}
RecoverFromManifestWithMissingFiles(added_files,
std::vector<BlobFileAddition>());
std::vector<uint64_t> all_table_files;
std::vector<uint64_t> all_blob_files;
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
ASSERT_EQ(2, all_table_files.size());
ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
VersionStorageInfo* vstorage = cfd->current()->storage_info();
ASSERT_EQ(1, vstorage->LevelFiles(0).size());
ASSERT_EQ(1, vstorage->LevelFiles(1).size());
}
TEST_F(BestEffortsRecoverIncompleteVersionTest,
MissingL0SuffixAndTheirBlobFiles) {
std::vector<SstInfo> sst_files = {
SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
100 /* epoch_number */, false /* file_missing */),
SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
101 /* epoch_number */, false /* file_missing */,
103 /*oldest_blob_file_number*/),
SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
102 /* epoch_number */, true /* file_missing */,
104 /*oldest_blob_file_number*/),
};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(sst_files, &file_metas);
std::vector<BlobInfo> blob_files = {
BlobInfo(103, false /*file_missing*/, "a", "blob1"),
BlobInfo(104, true /*file_missing*/, "a", "blob2"),
};
std::vector<BlobFileAddition> blob_meta;
CreateDummyBlobFiles(blob_files, &blob_meta);
std::vector<std::pair<int, FileMetaData>> added_files;
for (size_t i = 0; i < sst_files.size(); i++) {
const auto& info = sst_files[i];
const auto& meta = file_metas[i];
added_files.emplace_back(info.level, meta);
}
RecoverFromManifestWithMissingFiles(added_files, blob_meta);
std::vector<uint64_t> all_table_files;
std::vector<uint64_t> all_blob_files;
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
ASSERT_EQ(2, all_table_files.size());
ASSERT_EQ(1, all_blob_files.size());
ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
VersionStorageInfo* vstorage = cfd->current()->storage_info();
ASSERT_EQ(1, vstorage->LevelFiles(0).size());
ASSERT_EQ(1, vstorage->LevelFiles(1).size());
ASSERT_EQ(1, vstorage->GetBlobFiles().size());
}
class ChargeFileMetadataTest : public DBTestBase { class ChargeFileMetadataTest : public DBTestBase {
public: public:
ChargeFileMetadataTest() ChargeFileMetadataTest()

View File

@ -929,15 +929,19 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::Put(this, cf_id, key, value); s = WriteBatchInternal::Put(this, cf_id, key, value);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
} }
if (s.ok()) {
needs_in_place_update_ts_ = true; MaybeTrackTimestampSize(cf_id, ts_sz);
has_key_with_ts_ = true; }
std::string dummy_ts(ts_sz, '\0'); return s;
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1));
} }
Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
@ -962,7 +966,7 @@ Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts, const Slice& value) { const Slice& ts, const Slice& value) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts); Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -970,8 +974,12 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
assert(column_family); assert(column_family);
uint32_t cf_id = column_family->GetID(); uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}}; std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
SliceParts(&value, 1)); SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
} }
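For reference, a small caller-side sketch of the overload above; `cfh` is assumed to be a column family handle opened with a comparator that declares an 8-byte user-defined timestamp, which is not shown in this diff.
// Sketch only. On success, the batch records the write and (per this change)
// tracks the column family's timestamp size via MaybeTrackTimestampSize().
WriteBatch wb;
std::string ts_buf(sizeof(uint64_t), '\0');  // placeholder encoded timestamp
Status s = wb.Put(cfh, "key", Slice(ts_buf), "value");
assert(s.ok());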
Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
@ -1039,7 +1047,11 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
} }
if (ts_sz == 0) { if (ts_sz == 0) {
return WriteBatchInternal::Put(this, cf_id, key, value); s = WriteBatchInternal::Put(this, cf_id, key, value);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
} }
return Status::InvalidArgument( return Status::InvalidArgument(
@ -1246,20 +1258,24 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::Delete(this, cf_id, key); s = WriteBatchInternal::Delete(this, cf_id, key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
} }
if (s.ok()) {
needs_in_place_update_ts_ = true; MaybeTrackTimestampSize(cf_id, ts_sz);
has_key_with_ts_ = true; }
std::string dummy_ts(ts_sz, '\0'); return s;
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
} }
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts) { const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts); Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -1267,8 +1283,12 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
has_key_with_ts_ = true; has_key_with_ts_ = true;
uint32_t cf_id = column_family->GetID(); uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}}; std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Delete(this, cf_id, s = WriteBatchInternal::Delete(this, cf_id,
SliceParts(key_with_ts.data(), 2)); SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
} }
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
@ -1313,7 +1333,11 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::Delete(this, cf_id, key); s = WriteBatchInternal::Delete(this, cf_id, key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
} }
return Status::InvalidArgument( return Status::InvalidArgument(
@ -1361,20 +1385,24 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::SingleDelete(this, cf_id, key); s = WriteBatchInternal::SingleDelete(this, cf_id, key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
} }
if (s.ok()) {
needs_in_place_update_ts_ = true; MaybeTrackTimestampSize(cf_id, ts_sz);
has_key_with_ts_ = true; }
std::string dummy_ts(ts_sz, '\0'); return s;
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2));
} }
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
const Slice& key, const Slice& ts) { const Slice& key, const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts); Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -1382,8 +1410,12 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
assert(column_family); assert(column_family);
uint32_t cf_id = column_family->GetID(); uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}}; std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::SingleDelete(this, cf_id, s = WriteBatchInternal::SingleDelete(this, cf_id,
SliceParts(key_with_ts.data(), 2)); SliceParts(key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
} }
Status WriteBatchInternal::SingleDelete(WriteBatch* b, Status WriteBatchInternal::SingleDelete(WriteBatch* b,
@ -1430,7 +1462,11 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::SingleDelete(this, cf_id, key); s = WriteBatchInternal::SingleDelete(this, cf_id, key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
} }
return Status::InvalidArgument( return Status::InvalidArgument(
@ -1480,23 +1516,27 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key); s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
s = WriteBatchInternal::DeleteRange(this, cf_id,
SliceParts(begin_key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
} }
if (s.ok()) {
needs_in_place_update_ts_ = true; MaybeTrackTimestampSize(cf_id, ts_sz);
has_key_with_ts_ = true; }
std::string dummy_ts(ts_sz, '\0'); return s;
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
return WriteBatchInternal::DeleteRange(
this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2));
} }
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
const Slice& begin_key, const Slice& end_key, const Slice& begin_key, const Slice& end_key,
const Slice& ts) { const Slice& ts) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts); Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -1505,9 +1545,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
uint32_t cf_id = column_family->GetID(); uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{begin_key, ts}}; std::array<Slice, 2> key_with_ts{{begin_key, ts}};
std::array<Slice, 2> end_key_with_ts{{end_key, ts}}; std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
return WriteBatchInternal::DeleteRange(this, cf_id, s = WriteBatchInternal::DeleteRange(this, cf_id,
SliceParts(key_with_ts.data(), 2), SliceParts(key_with_ts.data(), 2),
SliceParts(end_key_with_ts.data(), 2)); SliceParts(end_key_with_ts.data(), 2));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
} }
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
@ -1554,7 +1598,11 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key); s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
} }
return Status::InvalidArgument( return Status::InvalidArgument(
@ -1608,21 +1656,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::Merge(this, cf_id, key, value); s = WriteBatchInternal::Merge(this, cf_id, key, value);
} else {
needs_in_place_update_ts_ = true;
has_key_with_ts_ = true;
std::string dummy_ts(ts_sz, '\0');
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
s = WriteBatchInternal::Merge(
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
} }
if (s.ok()) {
needs_in_place_update_ts_ = true; MaybeTrackTimestampSize(cf_id, ts_sz);
has_key_with_ts_ = true; }
std::string dummy_ts(ts_sz, '\0'); return s;
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
return WriteBatchInternal::Merge(
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
} }
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& ts, const Slice& value) { const Slice& ts, const Slice& value) {
const Status s = CheckColumnFamilyTimestampSize(column_family, ts); Status s = CheckColumnFamilyTimestampSize(column_family, ts);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -1630,8 +1682,12 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
assert(column_family); assert(column_family);
uint32_t cf_id = column_family->GetID(); uint32_t cf_id = column_family->GetID();
std::array<Slice, 2> key_with_ts{{key, ts}}; std::array<Slice, 2> key_with_ts{{key, ts}};
return WriteBatchInternal::Merge( s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1)); SliceParts(&value, 1));
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts.size());
}
return s;
} }
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
@ -1680,7 +1736,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
} }
if (0 == ts_sz) { if (0 == ts_sz) {
return WriteBatchInternal::Merge(this, cf_id, key, value); s = WriteBatchInternal::Merge(this, cf_id, key, value);
if (s.ok()) {
MaybeTrackTimestampSize(cf_id, ts_sz);
}
return s;
} }
return Status::InvalidArgument( return Status::InvalidArgument(
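
The hunks above change every timestamp-aware WriteBatch overload (Put, Delete, SingleDelete, DeleteRange, Merge) to call MaybeTrackTimestampSize() only after the underlying WriteBatchInternal call succeeds, rather than returning its status directly. A minimal caller-side sketch, assuming a column family opened with a comparator that declares an 8-byte user-defined timestamp; the function name and key/value literals are illustrative only:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Write through a WriteBatch against a column family whose comparator has
// timestamp_size() == 8 (e.g. BytewiseComparatorWithU64Ts()).
void PutWithTimestamp(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cfh) {
  rocksdb::WriteBatch batch;
  std::string ts(8, '\0');  // caller-encoded 64-bit timestamp
  // On success the batch records the 8-byte timestamp size for this column
  // family (MaybeTrackTimestampSize above); on failure no tracking happens,
  // matching the new `if (s.ok())` guards.
  rocksdb::Status s = batch.Put(cfh, "key", ts, "value");
  assert(s.ok());
  s = db->Write(rocksdb::WriteOptions(), &batch);
  assert(s.ok());
}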

View File

@ -125,7 +125,6 @@ DECLARE_int32(level0_stop_writes_trigger);
DECLARE_int32(block_size); DECLARE_int32(block_size);
DECLARE_int32(format_version); DECLARE_int32(format_version);
DECLARE_int32(index_block_restart_interval); DECLARE_int32(index_block_restart_interval);
DECLARE_bool(disable_auto_compactions);
DECLARE_int32(max_background_compactions); DECLARE_int32(max_background_compactions);
DECLARE_int32(num_bottom_pri_threads); DECLARE_int32(num_bottom_pri_threads);
DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(compaction_thread_pool_adjust_interval);
@ -151,6 +150,7 @@ DECLARE_bool(charge_filter_construction);
DECLARE_bool(charge_table_reader); DECLARE_bool(charge_table_reader);
DECLARE_bool(charge_file_metadata); DECLARE_bool(charge_file_metadata);
DECLARE_bool(charge_blob_cache); DECLARE_bool(charge_blob_cache);
DECLARE_bool(decouple_partitioned_filters);
DECLARE_int32(top_level_index_pinning); DECLARE_int32(top_level_index_pinning);
DECLARE_int32(partition_pinning); DECLARE_int32(partition_pinning);
DECLARE_int32(unpartitioned_pinning); DECLARE_int32(unpartitioned_pinning);
@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
DECLARE_string(last_level_temperature); DECLARE_string(last_level_temperature);
DECLARE_string(default_write_temperature); DECLARE_string(default_write_temperature);
DECLARE_string(default_temperature); DECLARE_string(default_temperature);
DECLARE_bool(paranoid_memory_checks);
// Options for transaction dbs. // Options for transaction dbs.
// Use TransactionDB (a.k.a. Pessimistic Transaction DB) // Use TransactionDB (a.k.a. Pessimistic Transaction DB)
@ -318,7 +319,6 @@ DECLARE_int32(prepopulate_blob_cache);
DECLARE_int32(approximate_size_one_in); DECLARE_int32(approximate_size_one_in);
DECLARE_bool(best_efforts_recovery); DECLARE_bool(best_efforts_recovery);
DECLARE_bool(skip_verifydb); DECLARE_bool(skip_verifydb);
DECLARE_bool(enable_compaction_filter);
DECLARE_bool(paranoid_file_checks); DECLARE_bool(paranoid_file_checks);
DECLARE_bool(fail_if_options_file_error); DECLARE_bool(fail_if_options_file_error);
DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint64(batch_protection_bytes_per_key);

View File

@ -49,7 +49,7 @@ class DbStressCompactionFilter : public CompactionFilter {
return Decision::kKeep; return Decision::kKeep;
} }
// Reaching here means we acquired the lock. // Reaching here means we acquired the lock.
key_mutex->AssertHeld();
bool key_exists = state_->Exists(cf_id_, key_num); bool key_exists = state_->Exists(cf_id_, key_num);
const bool allow_overwrite = state_->AllowsOverwrite(key_num); const bool allow_overwrite = state_->AllowsOverwrite(key_num);

View File

@ -167,7 +167,10 @@ bool RunStressTestImpl(SharedState* shared) {
{FileType::kWalFile}); {FileType::kWalFile});
} }
} }
now = clock->NowMicros(); if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
Status s = stress->EnableAutoCompaction();
assert(s.ok());
}
fprintf(stdout, "%s Starting database operations\n", fprintf(stdout, "%s Starting database operations\n",
clock->TimeToString(now / 1000000).c_str()); clock->TimeToString(now / 1000000).c_str());

View File

@ -380,6 +380,11 @@ DEFINE_bool(charge_blob_cache, false,
"CacheEntryRoleOptions::charged of " "CacheEntryRoleOptions::charged of "
"kBlobCache"); "kBlobCache");
DEFINE_bool(
decouple_partitioned_filters,
ROCKSDB_NAMESPACE::BlockBasedTableOptions().decouple_partitioned_filters,
"Decouple filter partitioning from index partitioning.");
DEFINE_int32( DEFINE_int32(
top_level_index_pinning, top_level_index_pinning,
static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback), static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
@ -1443,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
"obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = " "obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
"normal max"); "normal max");
DEFINE_bool(paranoid_memory_checks,
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
"Sets CF option paranoid_memory_checks.");
#endif // GFLAGS #endif // GFLAGS
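
The two new flags map directly onto existing public options, so their effect can be reproduced outside the stress test. A hedged sketch of the equivalent direct option settings (the wrapper function is illustrative; the option fields are the ones the flag defaults reference above):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options OptionsMatchingNewStressFlags() {
  rocksdb::Options options;
  rocksdb::BlockBasedTableOptions bbto;
  // --decouple_partitioned_filters: filter partitions no longer need to cut
  // at the same keys as index partitions.
  bbto.decouple_partitioned_filters = true;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  // --paranoid_memory_checks: extra in-memory validation for the column
  // family to catch corruption earlier, at some CPU cost.
  options.paranoid_memory_checks = true;
  return options;
}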

View File

@ -45,6 +45,8 @@ DECLARE_int32(open_write_fault_one_in);
DECLARE_int32(open_read_fault_one_in); DECLARE_int32(open_read_fault_one_in);
DECLARE_int32(inject_error_severity); DECLARE_int32(inject_error_severity);
DECLARE_bool(disable_auto_compactions);
DECLARE_bool(enable_compaction_filter);
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
class StressTest; class StressTest;
@ -262,14 +264,10 @@ class SharedState {
// This is useful for crash-recovery testing when the process may crash // This is useful for crash-recovery testing when the process may crash
// before updating the corresponding expected value // before updating the corresponding expected value
// //
// It can fail and `*prepared` will be set to false if the previous write or
// delete is still in pending state (e.g, still in recovery for retryable IO
// errors). If succeeds,`*prepared` will be set to true
//
// Requires external locking covering `key` in `cf` to prevent // Requires external locking covering `key` in `cf` to prevent
// concurrent write or delete to the same `key`. // concurrent write or delete to the same `key`.
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) { PendingExpectedValue PreparePut(int cf, int64_t key) {
return expected_state_manager_->PreparePut(cf, key, prepared); return expected_state_manager_->PreparePut(cf, key);
} }
// Does not require external locking. // Does not require external locking.
@ -281,31 +279,24 @@ class SharedState {
// This is useful for crash-recovery testing when the process may crash // This is useful for crash-recovery testing when the process may crash
// before updating the corresponding expected value // before updating the corresponding expected value
// //
// It can fail and `*prepared` will be set to false if the previous write or
// delete is still in pending state (e.g, still in recovery for retryable IO
// errors). If succeeds,`*prepared` will be set to true
//
// Requires external locking covering `key` in `cf` to prevent concurrent // Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`. // write or delete to the same `key`.
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) { PendingExpectedValue PrepareDelete(int cf, int64_t key) {
return expected_state_manager_->PrepareDelete(cf, key, prepared); return expected_state_manager_->PrepareDelete(cf, key);
} }
// Requires external locking covering `key` in `cf` to prevent concurrent // Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`. // write or delete to the same `key`.
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
bool* prepared) { return expected_state_manager_->PrepareSingleDelete(cf, key);
return expected_state_manager_->PrepareSingleDelete(cf, key, prepared);
} }
// Requires external locking covering keys in `[begin_key, end_key)` in `cf` // Requires external locking covering keys in `[begin_key, end_key)` in `cf`
// to prevent concurrent write or delete to the same `key`. // to prevent concurrent write or delete to the same `key`.
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key, int64_t begin_key,
int64_t end_key, int64_t end_key) {
bool* prepared) { return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key);
return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key,
prepared);
} }
bool AllowsOverwrite(int64_t key) const { bool AllowsOverwrite(int64_t key) const {
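
With the `bool* prepared` out-parameter gone, each Prepare*() call is expected to always hand back a usable PendingExpectedValue that the caller must later close. A condensed sketch of the new calling convention, assuming the stress-test harness headers, its Key() encoder, and the usual external key lock are in place (the wrapper function and literals are illustrative):

#include "db_stress_tool/db_stress_common.h"
#include "db_stress_tool/db_stress_shared_state.h"

namespace ROCKSDB_NAMESPACE {

// Caller already holds the per-key mutex for (cf, key).
void PutOneKey(SharedState* shared, DB* db, ColumnFamilyHandle* cfh, int cf,
               int64_t key, const Slice& value) {
  // No `prepared` flag any more: PreparePut() precommits unconditionally.
  PendingExpectedValue pending = shared->PreparePut(cf, key);
  Status s = db->Put(WriteOptions(), cfh, Key(key), value);
  if (s.ok()) {
    pending.Commit();    // expected state now reflects the write
  } else {
    pending.Rollback();  // restore the original expected value
  }
}

}  // namespace ROCKSDB_NAMESPACE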

View File

@ -632,10 +632,8 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
for (auto cfh : column_families_) { for (auto cfh : column_families_) {
for (int64_t k = 0; k != number_of_keys; ++k) { for (int64_t k = 0; k != number_of_keys; ++k) {
const std::string key = Key(k); const std::string key = Key(k);
bool prepare = false;
PendingExpectedValue pending_expected_value = PendingExpectedValue pending_expected_value =
shared->PreparePut(cf_idx, k, &prepare); shared->PreparePut(cf_idx, k);
assert(prepare);
const uint32_t value_base = pending_expected_value.GetFinalValueBase(); const uint32_t value_base = pending_expected_value.GetFinalValueBase();
const size_t sz = GenerateValue(value_base, value, sizeof(value)); const size_t sz = GenerateValue(value_base, value, sizeof(value));
@ -3676,7 +3674,7 @@ void StressTest::Reopen(ThreadState* thread) {
// crash-recovery verification does. Therefore it always expects no data loss // crash-recovery verification does. Therefore it always expects no data loss
// and we should ensure no data loss in testing. // and we should ensure no data loss in testing.
// TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below // TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below
if (!FLAGS_disable_wal && !FLAGS_avoid_flush_during_shutdown) { if (!FLAGS_disable_wal && FLAGS_avoid_flush_during_shutdown) {
Status s; Status s;
if (FLAGS_manual_wal_flush_one_in > 0) { if (FLAGS_manual_wal_flush_one_in > 0) {
s = db_->FlushWAL(/*sync=*/true); s = db_->FlushWAL(/*sync=*/true);
@ -3834,6 +3832,10 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) {
FLAGS_persist_user_defined_timestamps; FLAGS_persist_user_defined_timestamps;
} }
bool ShouldDisableAutoCompactionsBeforeVerifyDb() {
return !FLAGS_disable_auto_compactions && FLAGS_enable_compaction_filter;
}
bool InitializeOptionsFromFile(Options& options) { bool InitializeOptionsFromFile(Options& options) {
DBOptions db_options; DBOptions db_options;
ConfigOptions config_options; ConfigOptions config_options;
@ -3861,6 +3863,8 @@ void InitializeOptionsFromFlags(
const std::shared_ptr<const FilterPolicy>& filter_policy, const std::shared_ptr<const FilterPolicy>& filter_policy,
Options& options) { Options& options) {
BlockBasedTableOptions block_based_options; BlockBasedTableOptions block_based_options;
block_based_options.decouple_partitioned_filters =
FLAGS_decouple_partitioned_filters;
block_based_options.block_cache = cache; block_based_options.block_cache = cache;
block_based_options.cache_index_and_filter_blocks = block_based_options.cache_index_and_filter_blocks =
FLAGS_cache_index_and_filter_blocks; FLAGS_cache_index_and_filter_blocks;
@ -3947,7 +3951,11 @@ void InitializeOptionsFromFlags(
new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache)); new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache));
} }
options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering; options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
options.disable_auto_compactions = FLAGS_disable_auto_compactions; if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
options.disable_auto_compactions = true;
} else {
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
}
options.max_background_compactions = FLAGS_max_background_compactions; options.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_flushes = FLAGS_max_background_flushes; options.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style = options.compaction_style =
@ -4047,6 +4055,7 @@ void InitializeOptionsFromFlags(
options.memtable_protection_bytes_per_key = options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key; FLAGS_memtable_protection_bytes_per_key;
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
// Integrated BlobDB // Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files; options.enable_blob_files = FLAGS_enable_blob_files;
@ -4262,6 +4271,7 @@ void InitializeOptionsGeneral(
options.disable_auto_compactions = true; options.disable_auto_compactions = true;
} }
options.table_properties_collector_factories.clear();
options.table_properties_collector_factories.emplace_back( options.table_properties_collector_factories.emplace_back(
std::make_shared<DbStressTablePropertiesCollectorFactory>()); std::make_shared<DbStressTablePropertiesCollectorFactory>());

View File

@ -48,7 +48,11 @@ class StressTest {
return FLAGS_sync_fault_injection || FLAGS_disable_wal || return FLAGS_sync_fault_injection || FLAGS_disable_wal ||
FLAGS_manual_wal_flush_one_in > 0; FLAGS_manual_wal_flush_one_in > 0;
} }
Status EnableAutoCompaction() {
assert(options_.disable_auto_compactions);
Status s = db_->EnableAutoCompaction(column_families_);
return s;
}
void CleanUp(); void CleanUp();
protected: protected:
@ -64,6 +68,42 @@ class StressTest {
} }
} }
void UpdateIfInitialWriteFails(Env* db_stress_env, const Status& write_s,
Status* initial_write_s,
bool* initial_wal_write_may_succeed,
uint64_t* wait_for_recover_start_time) {
assert(db_stress_env && initial_write_s && initial_wal_write_may_succeed &&
wait_for_recover_start_time);
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
// first write fails
if (!write_s.ok() && (*initial_write_s).ok()) {
*initial_write_s = write_s;
*initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(*initial_write_s);
*wait_for_recover_start_time = db_stress_env->NowMicros();
}
}
void PrintWriteRecoveryWaitTimeIfNeeded(Env* db_stress_env,
const Status& initial_write_s,
bool initial_wal_write_may_succeed,
uint64_t wait_for_recover_start_time,
const std::string& thread_name) {
assert(db_stress_env);
bool waited_for_recovery = !initial_write_s.ok() &&
IsErrorInjectedAndRetryable(initial_write_s) &&
initial_wal_write_may_succeed;
if (waited_for_recovery) {
uint64_t elapsed_sec =
(db_stress_env->NowMicros() - wait_for_recover_start_time) / 1000000;
if (elapsed_sec > 10) {
fprintf(stdout,
"%s thread slept to wait for write recovery for "
"%" PRIu64 " seconds\n",
thread_name.c_str(), elapsed_sec);
}
}
}
void GetDeleteRangeKeyLocks( void GetDeleteRangeKeyLocks(
ThreadState* thread, int rand_column_family, int64_t rand_key, ThreadState* thread, int rand_column_family, int64_t rand_key,
std::vector<std::unique_ptr<MutexLock>>* range_locks) { std::vector<std::unique_ptr<MutexLock>>* range_locks) {
@ -411,5 +451,6 @@ void InitializeOptionsGeneral(
// user-defined timestamp. // user-defined timestamp.
void CheckAndSetOptionsForUserTimestamp(Options& options); void CheckAndSetOptionsForUserTimestamp(Options& options);
bool ShouldDisableAutoCompactionsBeforeVerifyDb();
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS #endif // GFLAGS
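
UpdateIfInitialWriteFails() and PrintWriteRecoveryWaitTimeIfNeeded() factor out bookkeeping that the TestPut/TestDelete/TestDeleteRange loops below repeat. A stripped-down sketch of the loop shape they are meant to sit in, assuming it lives in a StressTest subclass where these helpers and IsErrorInjectedAndRetryable() are in scope (the write callback is a placeholder):

#include <chrono>
#include <string>
#include <thread>

// Retry a write while the injected error is retryable and the initial WAL
// write may still have succeeded, then report how long recovery took.
template <typename WriteFn>
Status RetryWriteUntilRecovered(Env* env, WriteFn&& do_write,
                                const std::string& thread_name) {
  Status s;
  Status initial_write_s;
  bool initial_wal_write_may_succeed = true;
  uint64_t wait_for_recover_start_time = 0;
  do {
    if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
        initial_wal_write_may_succeed) {
      // Give error recovery a chance to finish before retrying.
      std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    s = do_write();
    UpdateIfInitialWriteFails(env, s, &initial_write_s,
                              &initial_wal_write_may_succeed,
                              &wait_for_recover_start_time);
  } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
           initial_wal_write_may_succeed);
  if (s.ok()) {
    PrintWriteRecoveryWaitTimeIfNeeded(env, initial_write_s,
                                       initial_wal_write_may_succeed,
                                       wait_for_recover_start_time,
                                       thread_name);
  }
  return s;
}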

View File

@ -32,41 +32,29 @@ void ExpectedState::Precommit(int cf, int64_t key, const ExpectedValue& value) {
std::atomic_thread_fence(std::memory_order_release); std::atomic_thread_fence(std::memory_order_release);
} }
PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key, PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key) {
bool* prepared) {
assert(prepared);
ExpectedValue expected_value = Load(cf, key); ExpectedValue expected_value = Load(cf, key);
// Calculate the original expected value // Calculate the original expected value
const ExpectedValue orig_expected_value = expected_value; const ExpectedValue orig_expected_value = expected_value;
// Calculate the pending expected value // Calculate the pending expected value
bool res = expected_value.Put(true /* pending */); expected_value.Put(true /* pending */);
if (!res) {
PendingExpectedValue ret = PendingExpectedValue(
&Value(cf, key), orig_expected_value, orig_expected_value);
*prepared = false;
return ret;
}
const ExpectedValue pending_expected_value = expected_value; const ExpectedValue pending_expected_value = expected_value;
// Calculate the final expected value // Calculate the final expected value
res = expected_value.Put(false /* pending */); expected_value.Put(false /* pending */);
assert(res);
const ExpectedValue final_expected_value = expected_value; const ExpectedValue final_expected_value = expected_value;
// Precommit // Precommit
Precommit(cf, key, pending_expected_value); Precommit(cf, key, pending_expected_value);
*prepared = true;
return PendingExpectedValue(&Value(cf, key), orig_expected_value, return PendingExpectedValue(&Value(cf, key), orig_expected_value,
final_expected_value); final_expected_value);
} }
ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); } ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); }
PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key, PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key) {
bool* prepared) {
assert(prepared);
ExpectedValue expected_value = Load(cf, key); ExpectedValue expected_value = Load(cf, key);
// Calculate the original expected value // Calculate the original expected value
@ -77,47 +65,32 @@ PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
if (!res) { if (!res) {
PendingExpectedValue ret = PendingExpectedValue( PendingExpectedValue ret = PendingExpectedValue(
&Value(cf, key), orig_expected_value, orig_expected_value); &Value(cf, key), orig_expected_value, orig_expected_value);
*prepared = false;
return ret; return ret;
} }
const ExpectedValue pending_expected_value = expected_value; const ExpectedValue pending_expected_value = expected_value;
// Calculate the final expected value // Calculate the final expected value
res = expected_value.Delete(false /* pending */); expected_value.Delete(false /* pending */);
assert(res);
const ExpectedValue final_expected_value = expected_value; const ExpectedValue final_expected_value = expected_value;
// Precommit // Precommit
Precommit(cf, key, pending_expected_value); Precommit(cf, key, pending_expected_value);
*prepared = true;
return PendingExpectedValue(&Value(cf, key), orig_expected_value, return PendingExpectedValue(&Value(cf, key), orig_expected_value,
final_expected_value); final_expected_value);
} }
PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key, PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key) {
bool* prepared) { return PrepareDelete(cf, key);
return PrepareDelete(cf, key, prepared);
} }
std::vector<PendingExpectedValue> ExpectedState::PrepareDeleteRange( std::vector<PendingExpectedValue> ExpectedState::PrepareDeleteRange(
int cf, int64_t begin_key, int64_t end_key, bool* prepared) { int cf, int64_t begin_key, int64_t end_key) {
std::vector<PendingExpectedValue> pending_expected_values; std::vector<PendingExpectedValue> pending_expected_values;
bool has_prepared_failed = false;
for (int64_t key = begin_key; key < end_key; ++key) { for (int64_t key = begin_key; key < end_key; ++key) {
bool each_prepared = false; pending_expected_values.push_back(PrepareDelete(cf, key));
PendingExpectedValue pending_expected_value =
PrepareDelete(cf, key, &each_prepared);
if (each_prepared) {
pending_expected_values.push_back(pending_expected_value);
} else {
has_prepared_failed = true;
pending_expected_value.PermitUnclosedPendingState();
break;
}
} }
*prepared = !has_prepared_failed;
return pending_expected_values; return pending_expected_values;
} }
@ -759,8 +732,31 @@ Status FileExpectedStateManager::Restore(DB* db) {
s = Env::Default()->DeleteFile(state_file_path); s = Env::Default()->DeleteFile(state_file_path);
} }
if (s.ok()) { if (s.ok()) {
saved_seqno_ = kMaxSequenceNumber; std::vector<std::string> expected_state_dir_children;
s = Env::Default()->DeleteFile(trace_file_path); s = Env::Default()->GetChildren(expected_state_dir_path_,
&expected_state_dir_children);
if (s.ok()) {
for (size_t i = 0; i < expected_state_dir_children.size(); ++i) {
const auto& filename = expected_state_dir_children[i];
if (filename.size() >= kTraceFilenameSuffix.size() &&
filename.rfind(kTraceFilenameSuffix) ==
filename.size() - kTraceFilenameSuffix.size()) {
SequenceNumber found_seqno = ParseUint64(filename.substr(
0, filename.size() - kTraceFilenameSuffix.size()));
// Delete older trace files, but keep the one we just replayed for
// debugging purposes
if (found_seqno < saved_seqno_) {
s = Env::Default()->DeleteFile(GetPathForFilename(filename));
}
}
if (!s.ok()) {
break;
}
}
}
if (s.ok()) {
saved_seqno_ = kMaxSequenceNumber;
}
} }
return s; return s;
} }
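
The Restore() change above keeps the trace file that was just replayed but prunes any older `<seqno><suffix>` trace files left in the expected-state directory. A self-contained sketch of the filename check it relies on (the helper name is hypothetical; ParseUint64 is RocksDB's existing string utility and the real suffix constant is the class's kTraceFilenameSuffix):

#include <cstdint>
#include <string>

#include "util/string_util.h"  // ROCKSDB_NAMESPACE::ParseUint64

// Returns true and fills `seqno` if `filename` ends with `suffix` and the
// prefix parses as a sequence number, mirroring the pruning loop above.
bool ParseTraceFileSeqno(const std::string& filename,
                         const std::string& suffix, uint64_t* seqno) {
  if (filename.size() < suffix.size() ||
      filename.rfind(suffix) != filename.size() - suffix.size()) {
    return false;
  }
  *seqno = ROCKSDB_NAMESPACE::ParseUint64(
      filename.substr(0, filename.size() - suffix.size()));
  return true;
}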

View File

@ -44,7 +44,7 @@ class ExpectedState {
// //
// Requires external locking covering `key` in `cf` to prevent concurrent // Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`. // write or delete to the same `key`.
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared); PendingExpectedValue PreparePut(int cf, int64_t key);
// Does not require external locking. // Does not require external locking.
ExpectedValue Get(int cf, int64_t key); ExpectedValue Get(int cf, int64_t key);
@ -55,18 +55,17 @@ class ExpectedState {
// //
// Requires external locking covering `key` in `cf` to prevent concurrent // Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`. // write or delete to the same `key`.
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared); PendingExpectedValue PrepareDelete(int cf, int64_t key);
// Requires external locking covering `key` in `cf` to prevent concurrent // Requires external locking covering `key` in `cf` to prevent concurrent
// write or delete to the same `key`. // write or delete to the same `key`.
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, bool* prepared); PendingExpectedValue PrepareSingleDelete(int cf, int64_t key);
// Requires external locking covering keys in `[begin_key, end_key)` in `cf` // Requires external locking covering keys in `[begin_key, end_key)` in `cf`
// to prevent concurrent write or delete to the same `key`. // to prevent concurrent write or delete to the same `key`.
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key, int64_t begin_key,
int64_t end_key, int64_t end_key);
bool* prepared);
// Update the expected value for start of an incomplete write or delete // Update the expected value for start of an incomplete write or delete
// operation on the key associated with this expected value // operation on the key associated with this expected value
@ -197,30 +196,28 @@ class ExpectedStateManager {
void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
// See ExpectedState::PreparePut() // See ExpectedState::PreparePut()
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) { PendingExpectedValue PreparePut(int cf, int64_t key) {
return latest_->PreparePut(cf, key, prepared); return latest_->PreparePut(cf, key);
} }
// See ExpectedState::Get() // See ExpectedState::Get()
ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); } ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); }
// See ExpectedState::PrepareDelete() // See ExpectedState::PrepareDelete()
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) { PendingExpectedValue PrepareDelete(int cf, int64_t key) {
return latest_->PrepareDelete(cf, key, prepared); return latest_->PrepareDelete(cf, key);
} }
// See ExpectedState::PrepareSingleDelete() // See ExpectedState::PrepareSingleDelete()
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
bool* prepared) { return latest_->PrepareSingleDelete(cf, key);
return latest_->PrepareSingleDelete(cf, key, prepared);
} }
// See ExpectedState::PrepareDeleteRange() // See ExpectedState::PrepareDeleteRange()
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
int64_t begin_key, int64_t begin_key,
int64_t end_key, int64_t end_key) {
bool* prepared) { return latest_->PrepareDeleteRange(cf, begin_key, end_key);
return latest_->PrepareDeleteRange(cf, begin_key, end_key, prepared);
} }
// See ExpectedState::Exists() // See ExpectedState::Exists()

View File

@ -10,11 +10,7 @@
#include <atomic> #include <atomic>
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
bool ExpectedValue::Put(bool pending) { void ExpectedValue::Put(bool pending) {
if (pending && (PendingWrite() || PendingDelete())) {
return false;
}
if (pending) { if (pending) {
SetPendingWrite(); SetPendingWrite();
} else { } else {
@ -22,15 +18,10 @@ bool ExpectedValue::Put(bool pending) {
ClearDeleted(); ClearDeleted();
ClearPendingWrite(); ClearPendingWrite();
} }
return true;
} }
bool ExpectedValue::Delete(bool pending) { bool ExpectedValue::Delete(bool pending) {
if (pending && (PendingWrite() || PendingDelete())) { if (pending && !Exists()) {
return false;
}
if (!Exists()) {
return false; return false;
} }
if (pending) { if (pending) {

View File

@ -37,11 +37,14 @@ class ExpectedValue {
explicit ExpectedValue(uint32_t expected_value) explicit ExpectedValue(uint32_t expected_value)
: expected_value_(expected_value) {} : expected_value_(expected_value) {}
bool Exists() const { return PendingWrite() || !IsDeleted(); } bool Exists() const {
assert(!PendingWrite() && !PendingDelete());
return !IsDeleted();
}
uint32_t Read() const { return expected_value_; } uint32_t Read() const { return expected_value_; }
bool Put(bool pending); void Put(bool pending);
bool Delete(bool pending); bool Delete(bool pending);
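
With the assert added to Exists() and Put() no longer returning a failure flag, callers must settle a pending Put()/Delete() before querying existence, and must not start a second pending operation on the same value. A tiny lifecycle sketch (ExpectedValue is the stress-test harness type shown above):

#include <cassert>

#include "db_stress_tool/expected_value.h"

void ExpectedValueLifecycle() {
  ROCKSDB_NAMESPACE::ExpectedValue v(0);
  v.Put(/*pending=*/true);   // mark a write as in flight
  // Calling v.Exists() here would trip the new assert: still pending.
  v.Put(/*pending=*/false);  // commit: sets the value, clears deleted/pending
  assert(v.Exists());
}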

View File

@ -1619,28 +1619,21 @@ class NonBatchedOpsStressTest : public StressTest {
// write // write
bool initial_wal_write_may_succeed = true; bool initial_wal_write_may_succeed = true;
bool prepared = false;
PendingExpectedValue pending_expected_value = PendingExpectedValue pending_expected_value =
shared->PreparePut(rand_column_family, rand_key, &prepared); shared->PreparePut(rand_column_family, rand_key);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
const uint32_t value_base = pending_expected_value.GetFinalValueBase(); const uint32_t value_base = pending_expected_value.GetFinalValueBase();
const size_t sz = GenerateValue(value_base, value, sizeof(value)); const size_t sz = GenerateValue(value_base, value, sizeof(value));
const Slice v(value, sz); const Slice v(value, sz);
uint64_t wait_for_recover_start_time = 0;
do { do {
// In order to commit the expected state for the initial write failed with // In order to commit the expected state for the initial write failed with
// injected retryable error and successful WAL write, retry the write // injected retryable error and successful WAL write, retry the write
// until it succeeds after the recovery finishes // until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) && if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) { initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000)); std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
} }
if (FLAGS_use_put_entity_one_in > 0 && if (FLAGS_use_put_entity_one_in > 0 &&
(value_base % FLAGS_use_put_entity_one_in) == 0) { (value_base % FLAGS_use_put_entity_one_in) == 0) {
@ -1691,13 +1684,10 @@ class NonBatchedOpsStressTest : public StressTest {
}); });
} }
} }
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
// first write fails &initial_wal_write_may_succeed,
if (!s.ok() && initial_write_s.ok()) { &wait_for_recover_start_time);
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
} while (!s.ok() && IsErrorInjectedAndRetryable(s) && } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed); initial_wal_write_may_succeed);
@ -1719,6 +1709,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate(); thread->shared->SafeTerminate();
} }
} else { } else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestPut");
pending_expected_value.Commit(); pending_expected_value.Commit();
thread->stats.AddBytesForWrites(1, sz); thread->stats.AddBytesForWrites(1, sz);
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value, PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
@ -1756,25 +1749,18 @@ class NonBatchedOpsStressTest : public StressTest {
// Use delete if the key may be overwritten and a single deletion // Use delete if the key may be overwritten and a single deletion
// otherwise. // otherwise.
if (shared->AllowsOverwrite(rand_key)) { if (shared->AllowsOverwrite(rand_key)) {
bool prepared = false;
PendingExpectedValue pending_expected_value = PendingExpectedValue pending_expected_value =
shared->PrepareDelete(rand_column_family, rand_key, &prepared); shared->PrepareDelete(rand_column_family, rand_key);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
uint64_t wait_for_recover_start_time = 0;
do { do {
// In order to commit the expected state for the initial write failed // In order to commit the expected state for the initial write failed
// with injected retryable error and successful WAL write, retry the // with injected retryable error and successful WAL write, retry the
// write until it succeeds after the recovery finishes // write until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) && if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) { initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for( std::this_thread::sleep_for(
std::chrono::microseconds(1 * 1000 * 1000)); std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
} }
if (!FLAGS_use_txn) { if (!FLAGS_use_txn) {
if (FLAGS_user_timestamp_size == 0) { if (FLAGS_user_timestamp_size == 0) {
@ -1787,13 +1773,9 @@ class NonBatchedOpsStressTest : public StressTest {
return txn.Delete(cfh, key); return txn.Delete(cfh, key);
}); });
} }
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
// the first write fails &initial_wal_write_may_succeed,
if (!s.ok() && initial_write_s.ok()) { &wait_for_recover_start_time);
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
} while (!s.ok() && IsErrorInjectedAndRetryable(s) && } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed); initial_wal_write_may_succeed);
@ -1816,29 +1798,25 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate(); thread->shared->SafeTerminate();
} }
} else { } else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDelete");
pending_expected_value.Commit(); pending_expected_value.Commit();
thread->stats.AddDeletes(1); thread->stats.AddDeletes(1);
} }
} else { } else {
bool prepared = false;
PendingExpectedValue pending_expected_value = PendingExpectedValue pending_expected_value =
shared->PrepareSingleDelete(rand_column_family, rand_key, &prepared); shared->PrepareSingleDelete(rand_column_family, rand_key);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
return s;
}
uint64_t wait_for_recover_start_time = 0;
do { do {
// In order to commit the expected state for the initial write failed // In order to commit the expected state for the initial write failed
// with injected retryable error and successful WAL write, retry the // with injected retryable error and successful WAL write, retry the
// write until it succeeds after the recovery finishes // write until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) && if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) { initial_wal_write_may_succeed) {
lock.reset();
std::this_thread::sleep_for( std::this_thread::sleep_for(
std::chrono::microseconds(1 * 1000 * 1000)); std::chrono::microseconds(1 * 1000 * 1000));
lock.reset(new MutexLock(
shared->GetMutexForKey(rand_column_family, rand_key)));
} }
if (!FLAGS_use_txn) { if (!FLAGS_use_txn) {
if (FLAGS_user_timestamp_size == 0) { if (FLAGS_user_timestamp_size == 0) {
@ -1851,13 +1829,9 @@ class NonBatchedOpsStressTest : public StressTest {
return txn.SingleDelete(cfh, key); return txn.SingleDelete(cfh, key);
}); });
} }
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
// the first write fails &initial_wal_write_may_succeed,
if (!s.ok() && initial_write_s.ok()) { &wait_for_recover_start_time);
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
} while (!s.ok() && IsErrorInjectedAndRetryable(s) && } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed); initial_wal_write_may_succeed);
@ -1880,6 +1854,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate(); thread->shared->SafeTerminate();
} }
} else { } else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDelete");
pending_expected_value.Commit(); pending_expected_value.Commit();
thread->stats.AddSingleDeletes(1); thread->stats.AddSingleDeletes(1);
} }
@ -1914,18 +1891,9 @@ class NonBatchedOpsStressTest : public StressTest {
// write // write
bool initial_wal_write_may_succeed = true; bool initial_wal_write_may_succeed = true;
bool prepared = false;
std::vector<PendingExpectedValue> pending_expected_values = std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key, shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width, rand_key + FLAGS_range_deletion_width);
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
const int covered = static_cast<int>(pending_expected_values.size()); const int covered = static_cast<int>(pending_expected_values.size());
std::string keystr = Key(rand_key); std::string keystr = Key(rand_key);
@ -1935,6 +1903,7 @@ class NonBatchedOpsStressTest : public StressTest {
Slice end_key = end_keystr; Slice end_key = end_keystr;
std::string write_ts_str; std::string write_ts_str;
Slice write_ts; Slice write_ts;
uint64_t wait_for_recover_start_time = 0;
do { do {
// In order to commit the expected state for the initial write failed with // In order to commit the expected state for the initial write failed with
@ -1942,10 +1911,7 @@ class NonBatchedOpsStressTest : public StressTest {
// until it succeeds after the recovery finishes // until it succeeds after the recovery finishes
if (!s.ok() && IsErrorInjectedAndRetryable(s) && if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed) { initial_wal_write_may_succeed) {
range_locks.clear();
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000)); std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key,
&range_locks);
} }
if (FLAGS_user_timestamp_size) { if (FLAGS_user_timestamp_size) {
write_ts_str = GetNowNanos(); write_ts_str = GetNowNanos();
@ -1954,13 +1920,9 @@ class NonBatchedOpsStressTest : public StressTest {
} else { } else {
s = db_->DeleteRange(write_opts, cfh, key, end_key); s = db_->DeleteRange(write_opts, cfh, key, end_key);
} }
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
// first write fails &initial_wal_write_may_succeed,
if (!s.ok() && initial_write_s.ok()) { &wait_for_recover_start_time);
initial_write_s = s;
initial_wal_write_may_succeed =
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
}
} while (!s.ok() && IsErrorInjectedAndRetryable(s) && } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
initial_wal_write_may_succeed); initial_wal_write_may_succeed);
@ -1985,6 +1947,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->shared->SafeTerminate(); thread->shared->SafeTerminate();
} }
} else { } else {
PrintWriteRecoveryWaitTimeIfNeeded(
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
wait_for_recover_start_time, "TestDeleteRange");
for (PendingExpectedValue& pending_expected_value : for (PendingExpectedValue& pending_expected_value :
pending_expected_values) { pending_expected_values) {
pending_expected_value.Commit(); pending_expected_value.Commit();
@ -2057,16 +2022,8 @@ class NonBatchedOpsStressTest : public StressTest {
} }
keys.push_back(key); keys.push_back(key);
bool prepared = false;
PendingExpectedValue pending_expected_value = PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared); shared->PreparePut(column_family, key);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
const uint32_t value_base = pending_expected_value.GetFinalValueBase(); const uint32_t value_base = pending_expected_value.GetFinalValueBase();
values.push_back(value_base); values.push_back(value_base);
@ -2630,6 +2587,8 @@ class NonBatchedOpsStressTest : public StressTest {
// Value doesn't exist in db, update state to reflect that // Value doesn't exist in db, update state to reflect that
shared->SyncDelete(cf, key); shared->SyncDelete(cf, key);
return true; return true;
} else {
assert(false);
} }
} }
char expected_value_data[kValueMaxLen]; char expected_value_data[kValueMaxLen];
@ -2728,7 +2687,11 @@ class NonBatchedOpsStressTest : public StressTest {
SharedState* const shared = thread->shared; SharedState* const shared = thread->shared;
assert(shared); assert(shared);
if (!shared->AllowsOverwrite(key) && shared->Exists(column_family, key)) { const ExpectedValue expected_value =
thread->shared->Get(column_family, key);
bool may_exist = !ExpectedValueHelper::MustHaveNotExisted(expected_value,
expected_value);
if (!shared->AllowsOverwrite(key) && may_exist) {
// Just do read your write checks for keys that allow overwrites. // Just do read your write checks for keys that allow overwrites.
return; return;
} }

env/file_system.cc vendored
View File

@ -181,10 +181,10 @@ FileOptions FileSystem::OptimizeForBlobFileRead(
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
const std::string& fname, bool should_sync, const std::string& fname, bool should_sync,
const IOOptions& io_options) { const IOOptions& io_options,
const FileOptions& file_options) {
std::unique_ptr<FSWritableFile> file; std::unique_ptr<FSWritableFile> file;
EnvOptions soptions; IOStatus s = fs->NewWritableFile(fname, file_options, &file, nullptr);
IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
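
WriteStringToFile() now takes the FileOptions used to open the destination instead of building a default EnvOptions internally. A hedged usage sketch, assuming the declaration exposed next to the other FileSystem convenience helpers; the file content and the direct-write tweak are illustrative:

#include <string>

#include "rocksdb/file_system.h"

rocksdb::IOStatus WriteMarkerFile(rocksdb::FileSystem* fs,
                                  const std::string& path) {
  rocksdb::FileOptions file_options;
  file_options.use_direct_writes = false;  // illustrative only
  // New signature: the caller-supplied FileOptions replaces the internal
  // default EnvOptions.
  return rocksdb::WriteStringToFile(fs, rocksdb::Slice("marker"), path,
                                    /*should_sync=*/true, rocksdb::IOOptions(),
                                    file_options);
}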

View File

@ -31,6 +31,7 @@ DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
total_trash_size_(0), total_trash_size_(0),
rate_bytes_per_sec_(rate_bytes_per_sec), rate_bytes_per_sec_(rate_bytes_per_sec),
pending_files_(0), pending_files_(0),
next_trash_bucket_(0),
bytes_max_delete_chunk_(bytes_max_delete_chunk), bytes_max_delete_chunk_(bytes_max_delete_chunk),
closing_(false), closing_(false),
cv_(&mu_), cv_(&mu_),
@ -66,10 +67,8 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) { total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) {
// Rate limiting is disabled or trash size makes up more than // Rate limiting is disabled or trash size makes up more than
// max_trash_db_ratio_ (default 25%) of the total DB size // max_trash_db_ratio_ (default 25%) of the total DB size
TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); Status s = DeleteFileImmediately(file_path, /*accounted=*/true);
Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
if (s.ok()) { if (s.ok()) {
s = sst_file_manager_->OnDeleteFile(file_path);
ROCKS_LOG_INFO(info_log_, ROCKS_LOG_INFO(info_log_,
"Deleted file %s immediately, rate_bytes_per_sec %" PRIi64 "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
", total_trash_size %" PRIu64 ", total_size %" PRIi64 ", total_trash_size %" PRIu64 ", total_size %" PRIi64
@ -77,15 +76,57 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
file_path.c_str(), rate_bytes_per_sec_.load(), file_path.c_str(), rate_bytes_per_sec_.load(),
total_trash_size_.load(), total_size, total_trash_size_.load(), total_size,
max_trash_db_ratio_.load()); max_trash_db_ratio_.load());
InstrumentedMutexLock l(&mu_);
RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
} }
return s; return s;
} }
return AddFileToDeletionQueue(file_path, dir_to_sync, /*bucket=*/std::nullopt,
/*accounted=*/true);
}
Status DeleteScheduler::DeleteUnaccountedFile(const std::string& file_path,
const std::string& dir_to_sync,
const bool force_bg,
std::optional<int32_t> bucket) {
uint64_t num_hard_links = 1;
fs_->NumFileLinks(file_path, IOOptions(), &num_hard_links, nullptr)
.PermitUncheckedError();
// We can tolerate rare races where we might immediately delete both links
// to a file.
if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && num_hard_links > 1)) {
Status s = DeleteFileImmediately(file_path, /*accounted=*/false);
if (s.ok()) {
ROCKS_LOG_INFO(info_log_,
"Deleted file %s immediately, rate_bytes_per_sec %" PRIi64,
file_path.c_str(), rate_bytes_per_sec_.load());
}
return s;
}
return AddFileToDeletionQueue(file_path, dir_to_sync, bucket,
/*accounted=*/false);
}
Status DeleteScheduler::DeleteFileImmediately(const std::string& file_path,
bool accounted) {
TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteFile::cb",
const_cast<std::string*>(&file_path));
Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
if (s.ok()) {
s = OnDeleteFile(file_path, accounted);
InstrumentedMutexLock l(&mu_);
RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
}
return s;
}
Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
const std::string& dir_to_sync,
std::optional<int32_t> bucket,
bool accounted) {
// Move file to trash // Move file to trash
std::string trash_file; std::string trash_file;
Status s = MarkAsTrash(file_path, &trash_file); Status s = MarkAsTrash(file_path, accounted, &trash_file);
ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
s.ToString().c_str()); s.ToString().c_str());
@ -94,7 +135,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
file_path.c_str(), s.ToString().c_str()); file_path.c_str(), s.ToString().c_str());
s = fs_->DeleteFile(file_path, IOOptions(), nullptr); s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
if (s.ok()) { if (s.ok()) {
s = sst_file_manager_->OnDeleteFile(file_path); s = OnDeleteFile(file_path, accounted);
ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately", ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
trash_file.c_str()); trash_file.c_str());
InstrumentedMutexLock l(&mu_); InstrumentedMutexLock l(&mu_);
@ -104,11 +145,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
} }
// Update the total trash size // Update the total trash size
uint64_t trash_file_size = 0; if (accounted) {
IOStatus io_s = uint64_t trash_file_size = 0;
fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); IOStatus io_s =
if (io_s.ok()) { fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
total_trash_size_.fetch_add(trash_file_size); if (io_s.ok()) {
total_trash_size_.fetch_add(trash_file_size);
}
} }
//**TODO: What should we do if we failed to //**TODO: What should we do if we failed to
// get the file size? // get the file size?
@ -117,8 +160,15 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
{ {
InstrumentedMutexLock l(&mu_); InstrumentedMutexLock l(&mu_);
RecordTick(stats_.get(), FILES_MARKED_TRASH); RecordTick(stats_.get(), FILES_MARKED_TRASH);
queue_.emplace(trash_file, dir_to_sync); queue_.emplace(trash_file, dir_to_sync, accounted, bucket);
pending_files_++; pending_files_++;
if (bucket.has_value()) {
auto iter = pending_files_in_buckets_.find(bucket.value());
assert(iter != pending_files_in_buckets_.end());
if (iter != pending_files_in_buckets_.end()) {
iter->second++;
}
}
if (pending_files_ == 1) { if (pending_files_ == 1) {
cv_.SignalAll(); cv_.SignalAll();
} }
@ -177,7 +227,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
} }
Status DeleteScheduler::MarkAsTrash(const std::string& file_path, Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
std::string* trash_file) { bool accounted, std::string* trash_file) {
// Sanity check of the path // Sanity check of the path
size_t idx = file_path.rfind('/'); size_t idx = file_path.rfind('/');
if (idx == std::string::npos || idx == file_path.size() - 1) { if (idx == std::string::npos || idx == file_path.size() - 1) {
@ -211,7 +261,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
} }
cnt++; cnt++;
} }
if (s.ok()) { if (s.ok() && accounted) {
s = sst_file_manager_->OnMoveFile(file_path, *trash_file); s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
} }
return s; return s;
@ -235,6 +285,8 @@ void DeleteScheduler::BackgroundEmptyTrash() {
uint64_t total_deleted_bytes = 0; uint64_t total_deleted_bytes = 0;
int64_t current_delete_rate = rate_bytes_per_sec_.load(); int64_t current_delete_rate = rate_bytes_per_sec_.load();
while (!queue_.empty() && !closing_) { while (!queue_.empty() && !closing_) {
// Satisfy static analysis.
std::optional<int32_t> bucket = std::nullopt;
if (current_delete_rate != rate_bytes_per_sec_.load()) { if (current_delete_rate != rate_bytes_per_sec_.load()) {
// User changed the delete rate // User changed the delete rate
current_delete_rate = rate_bytes_per_sec_.load(); current_delete_rate = rate_bytes_per_sec_.load();
@ -247,14 +299,17 @@ void DeleteScheduler::BackgroundEmptyTrash() {
// Get new file to delete // Get new file to delete
const FileAndDir& fad = queue_.front(); const FileAndDir& fad = queue_.front();
std::string path_in_trash = fad.fname; std::string path_in_trash = fad.fname;
std::string dir_to_sync = fad.dir;
bool accounted = fad.accounted;
bucket = fad.bucket;
// We don't need to hold the lock while deleting the file // We don't need to hold the lock while deleting the file
mu_.Unlock(); mu_.Unlock();
uint64_t deleted_bytes = 0; uint64_t deleted_bytes = 0;
bool is_complete = true; bool is_complete = true;
// Delete file from trash and update total_penalty value // Delete file from trash and update total_penalty value
Status s = Status s = DeleteTrashFile(path_in_trash, dir_to_sync, accounted,
DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete); &deleted_bytes, &is_complete);
total_deleted_bytes += deleted_bytes; total_deleted_bytes += deleted_bytes;
mu_.Lock(); mu_.Lock();
if (is_complete) { if (is_complete) {
@ -288,12 +343,20 @@ void DeleteScheduler::BackgroundEmptyTrash() {
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait", TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
&total_penalty); &total_penalty);
int32_t pending_files_in_bucket = std::numeric_limits<int32_t>::max();
if (is_complete) { if (is_complete) {
pending_files_--; pending_files_--;
if (bucket.has_value()) {
auto iter = pending_files_in_buckets_.find(bucket.value());
assert(iter != pending_files_in_buckets_.end());
if (iter != pending_files_in_buckets_.end()) {
pending_files_in_bucket = iter->second--;
}
}
} }
if (pending_files_ == 0) { if (pending_files_ == 0 || pending_files_in_bucket == 0) {
// Unblock WaitForEmptyTrash since there are no more files waiting // Unblock WaitForEmptyTrash or WaitForEmptyTrashBucket since there are
// to be deleted // no more files waiting to be deleted
cv_.SignalAll(); cv_.SignalAll();
} }
} }
@ -302,12 +365,14 @@ void DeleteScheduler::BackgroundEmptyTrash() {
Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
const std::string& dir_to_sync, const std::string& dir_to_sync,
uint64_t* deleted_bytes, bool accounted, uint64_t* deleted_bytes,
bool* is_complete) { bool* is_complete) {
uint64_t file_size; uint64_t file_size;
Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr); Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
*is_complete = true; *is_complete = true;
TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile"); TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteTrashFile::cb",
const_cast<std::string*>(&path_in_trash));
if (s.ok()) { if (s.ok()) {
bool need_full_delete = true; bool need_full_delete = true;
if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) { if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
@ -374,7 +439,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
} }
if (s.ok()) { if (s.ok()) {
*deleted_bytes = file_size; *deleted_bytes = file_size;
s = sst_file_manager_->OnDeleteFile(path_in_trash); s = OnDeleteFile(path_in_trash, accounted);
} }
} }
} }
@ -384,12 +449,24 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
path_in_trash.c_str(), s.ToString().c_str()); path_in_trash.c_str(), s.ToString().c_str());
*deleted_bytes = 0; *deleted_bytes = 0;
} else { } else {
total_trash_size_.fetch_sub(*deleted_bytes); if (accounted) {
total_trash_size_.fetch_sub(*deleted_bytes);
}
} }
return s; return s;
} }
Status DeleteScheduler::OnDeleteFile(const std::string& file_path,
bool accounted) {
if (accounted) {
return sst_file_manager_->OnDeleteFile(file_path);
}
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::OnDeleteFile",
const_cast<std::string*>(&file_path));
return Status::OK();
}
void DeleteScheduler::WaitForEmptyTrash() { void DeleteScheduler::WaitForEmptyTrash() {
InstrumentedMutexLock l(&mu_); InstrumentedMutexLock l(&mu_);
while (pending_files_ > 0 && !closing_) { while (pending_files_ > 0 && !closing_) {
@ -397,6 +474,30 @@ void DeleteScheduler::WaitForEmptyTrash() {
} }
} }
std::optional<int32_t> DeleteScheduler::NewTrashBucket() {
if (rate_bytes_per_sec_.load() <= 0) {
return std::nullopt;
}
InstrumentedMutexLock l(&mu_);
int32_t bucket_number = next_trash_bucket_++;
pending_files_in_buckets_.emplace(bucket_number, 0);
return bucket_number;
}
void DeleteScheduler::WaitForEmptyTrashBucket(int32_t bucket) {
InstrumentedMutexLock l(&mu_);
if (bucket >= next_trash_bucket_) {
return;
}
auto iter = pending_files_in_buckets_.find(bucket);
while (iter != pending_files_in_buckets_.end() && iter->second > 0 &&
!closing_) {
cv_.Wait();
iter = pending_files_in_buckets_.find(bucket);
}
pending_files_in_buckets_.erase(bucket);
}
void DeleteScheduler::MaybeCreateBackgroundThread() { void DeleteScheduler::MaybeCreateBackgroundThread() {
if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) { if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
bg_thread_.reset( bg_thread_.reset(

View File

@ -7,6 +7,7 @@
#include <map> #include <map>
#include <optional>
#include <queue> #include <queue>
#include <string> #include <string>
#include <thread> #include <thread>
@ -48,16 +49,45 @@ class DeleteScheduler {
MaybeCreateBackgroundThread(); MaybeCreateBackgroundThread();
} }
// Mark file as trash directory and schedule its deletion. If force_bg is // Delete an accounted file that is tracked by `SstFileManager` and should be
// set, it forces the file to always be deleted in the background thread, // tracked by this `DeleteScheduler` when it's deleted.
// except when rate limiting is disabled // The file is deleted immediately if slow deletion is disabled. If force_bg
    // is not set and the trash to DB size ratio exceeds the configured threshold,
// it is immediately deleted too. In all other cases, the file will be moved
// to a trash directory and scheduled for deletion by a background thread.
Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
const bool force_bg = false); const bool force_bg = false);
// Wait for all files being deleteing in the background to finish or for // Delete an unaccounted file that is not tracked by `SstFileManager` and
// should not be tracked by this `DeleteScheduler` when it's deleted.
// The file is deleted immediately if slow deletion is disabled. If force_bg
    // is not set and the file has more than one hard link, it is immediately
// deleted too. In all other cases, the file will be moved to a trash
// directory and scheduled for deletion by a background thread.
    // This API also supports assigning a file to a specified bucket created by
    // `NewTrashBucket` when deleting files in the background, so the caller can
    // wait for a specific bucket to become empty via the
    // `WaitForEmptyTrashBucket` API.
Status DeleteUnaccountedFile(const std::string& file_path,
const std::string& dir_to_sync,
const bool force_bg = false,
std::optional<int32_t> bucket = std::nullopt);
// Wait for all files being deleted in the background to finish or for
// destructor to be called. // destructor to be called.
void WaitForEmptyTrash(); void WaitForEmptyTrash();
// Creates a new trash bucket. A bucket is only created and returned when slow
// deletion is enabled.
// For each bucket that is created, the user should also call
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure the
// trash files are all cleared.
std::optional<int32_t> NewTrashBucket();
// Wait for all the files in the specified bucket to be deleted in the
// background or for the destructor to be called.
void WaitForEmptyTrashBucket(int32_t bucket);
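    A minimal usage sketch of the bucket APIs above, assuming a `DeleteScheduler* scheduler` constructed with a positive `rate_bytes_per_sec` (so slow deletion is enabled); the file path is illustrative:

      // Returns std::nullopt when slow deletion is disabled.
      std::optional<int32_t> bucket = scheduler->NewTrashBucket();
      Status s = scheduler->DeleteUnaccountedFile(
          "/db/tmp/obsolete_000123.sst" /* hypothetical path */,
          "" /* dir_to_sync */, /*force_bg=*/false, bucket);
      if (s.ok() && bucket.has_value()) {
        // Blocks until every file scheduled into this bucket has been deleted.
        scheduler->WaitForEmptyTrashBucket(bucket.value());
      }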
// Return a map containing errors that happened in BackgroundEmptyTrash // Return a map containing errors that happened in BackgroundEmptyTrash
// file_path => error status // file_path => error status
std::map<std::string, Status> GetBackgroundErrors(); std::map<std::string, Status> GetBackgroundErrors();
@ -87,12 +117,21 @@ class DeleteScheduler {
} }
private: private:
Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash); Status DeleteFileImmediately(const std::string& file_path, bool accounted);
Status AddFileToDeletionQueue(const std::string& file_path,
const std::string& dir_to_sync,
std::optional<int32_t> bucket, bool accounted);
Status MarkAsTrash(const std::string& file_path, bool accounted,
std::string* path_in_trash);
Status DeleteTrashFile(const std::string& path_in_trash, Status DeleteTrashFile(const std::string& path_in_trash,
const std::string& dir_to_sync, const std::string& dir_to_sync, bool accounted,
uint64_t* deleted_bytes, bool* is_complete); uint64_t* deleted_bytes, bool* is_complete);
Status OnDeleteFile(const std::string& file_path, bool accounted);
void BackgroundEmptyTrash(); void BackgroundEmptyTrash();
void MaybeCreateBackgroundThread(); void MaybeCreateBackgroundThread();
@ -104,19 +143,28 @@ class DeleteScheduler {
std::atomic<uint64_t> total_trash_size_; std::atomic<uint64_t> total_trash_size_;
// Maximum number of bytes that should be deleted per second // Maximum number of bytes that should be deleted per second
std::atomic<int64_t> rate_bytes_per_sec_; std::atomic<int64_t> rate_bytes_per_sec_;
// Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_ // Mutex to protect queue_, pending_files_, next_trash_bucket_,
// pending_files_in_buckets_, bg_errors_, closing_, stats_
InstrumentedMutex mu_; InstrumentedMutex mu_;
struct FileAndDir { struct FileAndDir {
FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {} FileAndDir(const std::string& _fname, const std::string& _dir,
bool _accounted, std::optional<int32_t> _bucket)
: fname(_fname), dir(_dir), accounted(_accounted), bucket(_bucket) {}
std::string fname; std::string fname;
std::string dir; // empty will be skipped. std::string dir; // empty will be skipped.
bool accounted;
std::optional<int32_t> bucket;
}; };
// Queue of trash files that need to be deleted // Queue of trash files that need to be deleted
std::queue<FileAndDir> queue_; std::queue<FileAndDir> queue_;
// Number of trash files that are waiting to be deleted // Number of trash files that are waiting to be deleted
int32_t pending_files_; int32_t pending_files_;
// Next trash bucket that can be created
int32_t next_trash_bucket_;
// A mapping from trash bucket to number of pending files in the bucket
std::map<int32_t, int32_t> pending_files_in_buckets_;
uint64_t bytes_max_delete_chunk_; uint64_t bytes_max_delete_chunk_;
// Errors that happened in BackgroundEmptyTrash (file_path => error) // Errors that happened in BackgroundEmptyTrash (file_path => error)
std::map<std::string, Status> bg_errors_; std::map<std::string, Status> bg_errors_;
@ -127,6 +175,7 @@ class DeleteScheduler {
// Condition variable signaled in these conditions // Condition variable signaled in these conditions
// - pending_files_ value change from 0 => 1 // - pending_files_ value change from 0 => 1
// - pending_files_ value change from 1 => 0 // - pending_files_ value change from 1 => 0
    // - a value in pending_files_in_buckets_ changes from 1 => 0
// - closing_ value is set to true // - closing_ value is set to true
InstrumentedCondVar cv_; InstrumentedCondVar cv_;
// Background thread running BackgroundEmptyTrash // Background thread running BackgroundEmptyTrash
@ -138,6 +187,10 @@ class DeleteScheduler {
// If the trash size constitutes for more than this fraction of the total DB // If the trash size constitutes for more than this fraction of the total DB
// size we will start deleting new files passed to DeleteScheduler // size we will start deleting new files passed to DeleteScheduler
// immediately // immediately
    // Unaccounted files passed for deletion do not change total_trash_size_ or
    // affect the DeleteScheduler::total_trash_size_ over
    // SstFileManager::total_size_ ratio. Their slow deletion is not subject to
    // this configured threshold either.
std::atomic<double> max_trash_db_ratio_; std::atomic<double> max_trash_db_ratio_;
static const uint64_t kMicrosInSecond = 1000 * 1000LL; static const uint64_t kMicrosInSecond = 1000 * 1000LL;
std::shared_ptr<Statistics> stats_; std::shared_ptr<Statistics> stats_;

View File

@ -78,7 +78,7 @@ class DeleteSchedulerTest : public testing::Test {
} }
std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024, std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
size_t dummy_files_dirs_idx = 0) { size_t dummy_files_dirs_idx = 0, bool track = true) {
std::string file_path = std::string file_path =
dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name; dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
std::unique_ptr<WritableFile> f; std::unique_ptr<WritableFile> f;
@ -86,7 +86,9 @@ class DeleteSchedulerTest : public testing::Test {
std::string data(size, 'A'); std::string data(size, 'A');
EXPECT_OK(f->Append(data)); EXPECT_OK(f->Append(data));
EXPECT_OK(f->Close()); EXPECT_OK(f->Close());
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path)); if (track) {
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
}
return file_path; return file_path;
} }
@ -353,6 +355,8 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
ASSERT_EQ(num_files, ASSERT_EQ(num_files,
stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
ASSERT_FALSE(delete_scheduler_->NewTrashBucket().has_value());
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
} }
@ -718,6 +722,141 @@ TEST_F(DeleteSchedulerTest, IsTrashCheck) {
ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx")); ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
} }
TEST_F(DeleteSchedulerTest, DeleteAccountedAndUnaccountedFiles) {
rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / s
NewDeleteScheduler();
// Create 100 files, every file is 1 KB
int num_files = 100; // 100 files
uint64_t file_size = 1024; // 1 KB as a file size
std::vector<std::string> generated_files;
for (int i = 0; i < num_files; i++) {
std::string file_name = "file" + std::to_string(i) + ".data";
generated_files.push_back(NewDummyFile(file_name, file_size,
/*dummy_files_dirs_idx*/ 0,
/*track=*/false));
}
for (int i = 0; i < num_files; i++) {
if (i % 2) {
ASSERT_OK(sst_file_mgr_->OnAddFile(generated_files[i], file_size));
ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
} else {
ASSERT_OK(
delete_scheduler_->DeleteUnaccountedFile(generated_files[i], ""));
}
}
delete_scheduler_->WaitForEmptyTrash();
ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
ASSERT_EQ(0, sst_file_mgr_->GetTotalSize());
}
TEST_F(DeleteSchedulerTest, ConcurrentlyDeleteUnaccountedFilesInBuckets) {
int bg_delete_file = 0;
int fg_delete_file = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* /*arg*/) { bg_delete_file++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / s
NewDeleteScheduler();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
// Create 1000 files, every file is 1 KB
int num_files = 1000;
uint64_t file_size = 1024; // 1 KB as a file size
std::vector<std::string> generated_files;
for (int i = 0; i < num_files; i++) {
std::string file_name = "file" + std::to_string(i) + ".data";
generated_files.push_back(NewDummyFile(file_name, file_size,
/*dummy_files_dirs_idx*/ 0,
/*track=*/false));
}
// Concurrently delete files in different buckets and check all the buckets
// are empty.
int thread_cnt = 10;
int files_per_thread = 100;
std::atomic<int> thread_num(0);
std::vector<port::Thread> threads;
std::function<void()> delete_thread = [&]() {
std::optional<int32_t> bucket = delete_scheduler_->NewTrashBucket();
ASSERT_TRUE(bucket.has_value());
int idx = thread_num.fetch_add(1);
int range_start = idx * files_per_thread;
int range_end = range_start + files_per_thread;
for (int j = range_start; j < range_end; j++) {
ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(
    generated_files[j], "", /*force_bg=*/false, bucket));
}
delete_scheduler_->WaitForEmptyTrashBucket(bucket.value());
};
for (int i = 0; i < thread_cnt; i++) {
threads.emplace_back(delete_thread);
}
for (size_t i = 0; i < threads.size(); i++) {
threads[i].join();
}
ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
ASSERT_EQ(1000, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
ASSERT_EQ(0, fg_delete_file);
ASSERT_EQ(1000, bg_delete_file);
    // OK to re-check an already empty bucket
delete_scheduler_->WaitForEmptyTrashBucket(9);
    // Invalid buckets return immediately too.
delete_scheduler_->WaitForEmptyTrashBucket(100);
std::optional<int32_t> next_bucket = delete_scheduler_->NewTrashBucket();
ASSERT_TRUE(next_bucket.has_value());
ASSERT_EQ(10, next_bucket.value());
delete_scheduler_->WaitForEmptyTrashBucket(10);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
TEST_F(DeleteSchedulerTest,
ImmediatelyDeleteUnaccountedFilesWithRemainingLinks) {
int bg_delete_file = 0;
int fg_delete_file = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::DeleteTrashFile:DeleteFile",
[&](void* /*arg*/) { bg_delete_file++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
NewDeleteScheduler();
std::string file1 = NewDummyFile("data_1", 500 * 1024,
/*dummy_files_dirs_idx*/ 0, /*track=*/false);
std::string file2 = NewDummyFile("data_2", 100 * 1024,
/*dummy_files_dirs_idx*/ 0, /*track=*/false);
ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
    // Should delete in 4 batches if there were no hard links
ASSERT_OK(
delete_scheduler_->DeleteUnaccountedFile(file1, "", /*force_bg=*/false));
ASSERT_OK(
delete_scheduler_->DeleteUnaccountedFile(file2, "", /*force_bg=*/false));
delete_scheduler_->WaitForEmptyTrash();
ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
ASSERT_EQ(0, bg_delete_file);
ASSERT_EQ(2, fg_delete_file);
ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
ASSERT_EQ(2, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -125,8 +125,8 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination,
Status DeleteDBFile(const ImmutableDBOptions* db_options, Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& dir_to_sync, const std::string& fname, const std::string& dir_to_sync,
const bool force_bg, const bool force_fg) { const bool force_bg, const bool force_fg) {
SstFileManagerImpl* sfm = SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get()); db_options->sst_file_manager.get());
if (sfm && !force_fg) { if (sfm && !force_fg) {
return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
} else { } else {
@ -134,6 +134,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
} }
} }
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
const std::string& fname,
const std::string& dir_to_sync,
const bool force_bg, const bool force_fg,
std::optional<int32_t> bucket) {
SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
db_options->sst_file_manager.get());
if (sfm && !force_fg) {
return sfm->ScheduleUnaccountedFileDeletion(fname, dir_to_sync, force_bg,
bucket);
} else {
return db_options->env->DeleteFile(fname);
}
}
// requested_checksum_func_name brings the function name of the checksum // requested_checksum_func_name brings the function name of the checksum
// generator in checksum_factory. Empty string is permitted, in which case the // generator in checksum_factory. Empty string is permitted, in which case the
// name of the generator created by the factory is unchecked. When // name of the generator created by the factory is unchecked. When

View File

@ -55,6 +55,16 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& path_to_sync, const std::string& fname, const std::string& path_to_sync,
const bool force_bg, const bool force_fg); const bool force_bg, const bool force_fg);
// Delete an unaccounted DB file that is not tracked by SstFileManager and will
// not be tracked by its DeleteScheduler when it's deleted.
// If a legitimate bucket is provided and this file is scheduled for slow
// deletion, it will be assigned to the specified trash bucket.
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
const std::string& fname,
const std::string& dir_to_sync,
const bool force_bg, const bool force_fg,
std::optional<int32_t> bucket);
// TODO(hx235): pass the whole DBOptions instead of its individual fields // TODO(hx235): pass the whole DBOptions instead of its individual fields
IOStatus GenerateOneFileChecksum( IOStatus GenerateOneFileChecksum(
FileSystem* fs, const std::string& file_path, FileSystem* fs, const std::string& file_path,

View File

@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
const std::string& dbname, uint64_t descriptor_number, const std::string& dbname, uint64_t descriptor_number,
Temperature temp,
FSDirectory* dir_contains_current_file) { FSDirectory* dir_contains_current_file) {
// Remove leading "dbname/" and add newline to manifest file name // Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number); std::string manifest = DescriptorFileName(dbname, descriptor_number);
@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
std::string tmp = TempFileName(dbname, descriptor_number); std::string tmp = TempFileName(dbname, descriptor_number);
IOOptions opts; IOOptions opts;
IOStatus s = PrepareIOFromWriteOptions(write_options, opts); IOStatus s = PrepareIOFromWriteOptions(write_options, opts);
FileOptions file_opts;
file_opts.temperature = temp;
if (s.ok()) { if (s.ok()) {
s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts); s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts,
file_opts);
} }
TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
if (s.ok()) { if (s.ok()) {
@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
} }
Status SetIdentityFile(const WriteOptions& write_options, Env* env, Status SetIdentityFile(const WriteOptions& write_options, Env* env,
const std::string& dbname, const std::string& db_id) { const std::string& dbname, Temperature temp,
const std::string& db_id) {
std::string id; std::string id;
if (db_id.empty()) { if (db_id.empty()) {
id = env->GenerateUniqueId(); id = env->GenerateUniqueId();
@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env,
Status s; Status s;
IOOptions opts; IOOptions opts;
s = PrepareIOFromWriteOptions(write_options, opts); s = PrepareIOFromWriteOptions(write_options, opts);
FileOptions file_opts;
file_opts.temperature = temp;
if (s.ok()) { if (s.ok()) {
s = WriteStringToFile(env, id, tmp, true, &opts); s = WriteStringToFile(env->GetFileSystem().get(), id, tmp,
/*should_sync=*/true, opts, file_opts);
} }
if (s.ok()) { if (s.ok()) {
s = env->RenameFile(tmp, identify_file_name); s = env->RenameFile(tmp, identify_file_name);

View File

@ -161,11 +161,12 @@ bool ParseFileName(const std::string& filename, uint64_t* number,
// when // when
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
const std::string& dbname, uint64_t descriptor_number, const std::string& dbname, uint64_t descriptor_number,
Temperature temp,
FSDirectory* dir_contains_current_file); FSDirectory* dir_contains_current_file);
// Make the IDENTITY file for the db // Make the IDENTITY file for the db
Status SetIdentityFile(const WriteOptions& write_options, Env* env, Status SetIdentityFile(const WriteOptions& write_options, Env* env,
const std::string& dbname, const std::string& dbname, Temperature temp,
const std::string& db_id = {}); const std::string& db_id = {});
// Sync manifest file `file`. // Sync manifest file `file`.

View File

@ -421,10 +421,28 @@ Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg); return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
} }
Status SstFileManagerImpl::ScheduleUnaccountedFileDeletion(
const std::string& file_path, const std::string& dir_to_sync,
const bool force_bg, std::optional<int32_t> bucket) {
TEST_SYNC_POINT_CALLBACK(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion",
const_cast<std::string*>(&file_path));
return delete_scheduler_.DeleteUnaccountedFile(file_path, dir_to_sync,
force_bg, bucket);
}
void SstFileManagerImpl::WaitForEmptyTrash() { void SstFileManagerImpl::WaitForEmptyTrash() {
delete_scheduler_.WaitForEmptyTrash(); delete_scheduler_.WaitForEmptyTrash();
} }
std::optional<int32_t> SstFileManagerImpl::NewTrashBucket() {
return delete_scheduler_.NewTrashBucket();
}
void SstFileManagerImpl::WaitForEmptyTrashBucket(int32_t bucket) {
delete_scheduler_.WaitForEmptyTrashBucket(bucket);
}
void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
uint64_t file_size) { uint64_t file_size) {
auto tracked_file = tracked_files_.find(file_path); auto tracked_file = tracked_files_.find(file_path);

View File

@ -5,7 +5,7 @@
#pragma once #pragma once
#include <optional>
#include <string> #include <string>
#include "db/compaction/compaction.h" #include "db/compaction/compaction.h"
@ -118,17 +118,40 @@ class SstFileManagerImpl : public SstFileManager {
// not guaranteed // not guaranteed
bool CancelErrorRecovery(ErrorHandler* db); bool CancelErrorRecovery(ErrorHandler* db);
// Mark file as trash and schedule it's deletion. If force_bg is set, it // Mark a file as trash and schedule its deletion. If force_bg is set, it
  // forces the file to be deleted in the background regardless of DB size, // forces the file to be deleted in the background regardless of DB size,
// except when rate limited delete is disabled // except when rate limited delete is disabled.
virtual Status ScheduleFileDeletion(const std::string& file_path, virtual Status ScheduleFileDeletion(const std::string& file_path,
const std::string& dir_to_sync, const std::string& dir_to_sync,
const bool force_bg = false); const bool force_bg = false);
// Wait for all files being deleteing in the background to finish or for // Delete an unaccounted file. The file is deleted immediately if slow
  // deletion is disabled. A file with more than one hard link will be deleted
// immediately unless force_bg is set. In other cases, files will be scheduled
// for slow deletion, and assigned to the specified bucket if a legitimate one
// is provided. A legitimate bucket is one that is created with the
// `NewTrashBucket` API, and for which `WaitForEmptyTrashBucket` hasn't been
// called yet.
virtual Status ScheduleUnaccountedFileDeletion(
const std::string& file_path, const std::string& dir_to_sync,
const bool force_bg = false,
std::optional<int32_t> bucket = std::nullopt);
// Wait for all files being deleted in the background to finish or for
// destructor to be called. // destructor to be called.
virtual void WaitForEmptyTrash(); virtual void WaitForEmptyTrash();
// Creates a new trash bucket. A legitimate bucket is only created and
// returned when slow deletion is enabled.
// For each bucket that is created and used, the user should also call
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure all
// the trash files are cleared.
std::optional<int32_t> NewTrashBucket();
// Wait for all the files in the specified bucket to be deleted in the
// background or for destructor to be called.
virtual void WaitForEmptyTrashBucket(int32_t bucket);
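  A rough sketch of routing an unaccounted deletion through the manager, assuming `sfm` is an `SstFileManagerImpl*` with slow deletion enabled; the path is illustrative:

    std::optional<int32_t> bucket = sfm->NewTrashBucket();
    // Not tracked by the OnAddFile/OnDeleteFile accounting; scheduled into
    // `bucket` when slow deletion applies, deleted immediately otherwise.
    Status s = sfm->ScheduleUnaccountedFileDeletion(
        "/db/tmp/orphan_blob_000042.blob", "" /* dir_to_sync */,
        /*force_bg=*/false, bucket);
    if (s.ok() && bucket.has_value()) {
      sfm->WaitForEmptyTrashBucket(bucket.value());
    }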
DeleteScheduler* delete_scheduler() { return &delete_scheduler_; } DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
// Stop the error recovery background thread. This should be called only // Stop the error recovery background thread. This should be called only

View File

@ -61,18 +61,6 @@ enum CompactionPri : char {
kRoundRobin = 0x4, kRoundRobin = 0x4,
}; };
// Temperature of a file. Used to pass to FileSystem for a different
// placement and/or coding.
// Reserve some numbers in the middle, in case we need to insert new tier
// there.
enum class Temperature : uint8_t {
kUnknown = 0,
kHot = 0x04,
kWarm = 0x08,
kCold = 0x0C,
kLastTemperature,
};
struct FileTemperatureAge { struct FileTemperatureAge {
Temperature temperature = Temperature::kUnknown; Temperature temperature = Temperature::kUnknown;
uint64_t age = 0; uint64_t age = 0;
@ -813,7 +801,7 @@ struct AdvancedColumnFamilyOptions {
// If this option is set, when creating the last level files, pass this // If this option is set, when creating the last level files, pass this
// temperature to FileSystem used. Should be no-op for default FileSystem // temperature to FileSystem used. Should be no-op for default FileSystem
// and users need to plug in their own FileSystem to take advantage of it. // and users need to plug in their own FileSystem to take advantage of it.
// When using FIFO compaction, this option is ignored. // Currently only compatible with universal compaction.
// //
// Dynamically changeable through the SetOptions() API // Dynamically changeable through the SetOptions() API
Temperature last_level_temperature = Temperature::kUnknown; Temperature last_level_temperature = Temperature::kUnknown;
@ -1090,6 +1078,13 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through the SetOptions() API. // Dynamically changeable through the SetOptions() API.
uint32_t bottommost_file_compaction_delay = 0; uint32_t bottommost_file_compaction_delay = 0;
// Enables additional integrity checks during reads/scans.
// Specifically, for skiplist-based memtables, we verify that keys visited
// are in order. This is helpful to detect corrupted memtable keys during
// reads. Enabling this feature incurs a performance overhead due to an
// additional key comparison during memtable lookup.
bool paranoid_memory_checks = false;
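  A hedged configuration sketch for this option; everything other than the field added here is standard RocksDB usage, and the DB path is illustrative:

    rocksdb::Options options;
    options.create_if_missing = true;
    // Verify skiplist key ordering while reading the memtable; costs one
    // extra key comparison per visited entry.
    options.paranoid_memory_checks = true;
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/paranoid_db", &db);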
// Create ColumnFamilyOptions with default values for all fields // Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions(); AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options // Create ColumnFamilyOptions from Options

View File

@ -30,6 +30,7 @@
#include "rocksdb/port_defs.h" #include "rocksdb/port_defs.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/thread_status.h" #include "rocksdb/thread_status.h"
#include "rocksdb/types.h"
#ifdef _WIN32 #ifdef _WIN32
// Windows API macro interference // Windows API macro interference
@ -159,6 +160,9 @@ class Env : public Customizable {
// Size of file in bytes // Size of file in bytes
uint64_t size_bytes; uint64_t size_bytes;
// EXPERIMENTAL - only provided by some implementations
Temperature temperature = Temperature::kUnknown;
}; };
Env(); Env();

View File

@ -195,7 +195,9 @@ struct FileOptions : EnvOptions {
FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
FileOptions(const DBOptions& opts) FileOptions(const DBOptions& opts)
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} : EnvOptions(opts),
temperature(opts.metadata_write_temperature),
handoff_checksum_type(ChecksumType::kCRC32c) {}
FileOptions(const EnvOptions& opts) FileOptions(const EnvOptions& opts)
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
@ -1952,7 +1954,8 @@ class FSDirectoryWrapper : public FSDirectory {
// A utility routine: write "data" to the named file. // A utility routine: write "data" to the named file.
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
const std::string& fname, bool should_sync = false, const std::string& fname, bool should_sync = false,
const IOOptions& io_options = IOOptions()); const IOOptions& io_options = IOOptions(),
const FileOptions& file_options = FileOptions());
// A utility routine: read contents of named file into *data // A utility routine: read contents of named file into *data
IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,

View File

@ -47,7 +47,8 @@ class FilterBitsReader;
// structs because this is expected to be a temporary, stack-allocated object. // structs because this is expected to be a temporary, stack-allocated object.
struct FilterBuildingContext { struct FilterBuildingContext {
// This constructor is for internal use only and subject to change. // This constructor is for internal use only and subject to change.
FilterBuildingContext(const BlockBasedTableOptions& table_options); // Keeps a reference to table_options.
explicit FilterBuildingContext(const BlockBasedTableOptions& table_options);
// Options for the table being built // Options for the table being built
const BlockBasedTableOptions& table_options; const BlockBasedTableOptions& table_options;

View File

@ -194,6 +194,15 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args, virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)); bool (*callback_func)(void* arg, const char* entry));
// Same as Get() but performs data integrity validation.
virtual Status GetAndValidate(const LookupKey& /* k */,
void* /* callback_args */,
bool (* /* callback_func */)(void* arg,
const char* entry),
bool /*allow_data_in_error*/) {
return Status::NotSupported("GetAndValidate() not implemented.");
}
virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
const Slice& /*end_key*/) { const Slice& /*end_key*/) {
return 0; return 0;
@ -235,13 +244,38 @@ class MemTableRep {
// REQUIRES: Valid() // REQUIRES: Valid()
virtual void Next() = 0; virtual void Next() = 0;
// Advances to the next position and performs integrity validations on the
// skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
return Status::NotSupported("NextAndValidate() not implemented.");
}
// Advances to the previous position. // Advances to the previous position.
// REQUIRES: Valid() // REQUIRES: Valid()
virtual void Prev() = 0; virtual void Prev() = 0;
// Advances to the previous position and performs integrity validations on
// the skip list. Iterator becomes invalid and Corruption is returned if a
// corruption is found.
// REQUIRES: Valid()
virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
return Status::NotSupported("PrevAndValidate() not implemented.");
}
// Advance to the first entry with a key >= target // Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
// Seek and perform integrity validations on the skip list.
// Iterator becomes invalid and Corruption is returned if a
// corruption is found.
virtual Status SeekAndValidate(const Slice& /* internal_key */,
const char* /* memtable_key */,
bool /* allow_data_in_errors */) {
return Status::NotSupported("SeekAndValidate() not implemented.");
}
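  Since these methods default to NotSupported, a caller has to be prepared to fall back to the non-validating variants. A sketch of the expected calling pattern, assuming `iter` is a `MemTableRep::Iterator*` positioned on a valid entry (the actual call sites in the memtable code are not part of this hunk):

    Status s = iter->NextAndValidate(/*allow_data_in_errors=*/false);
    if (s.IsNotSupported()) {
      // This rep has no validating variant; use the plain accessor.
      iter->Next();
    } else if (!s.ok()) {
      // Corruption detected: the iterator is now invalid, surface the error.
      return s;
    }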
// retreat to the first entry with a key <= target // retreat to the first entry with a key <= target
virtual void SeekForPrev(const Slice& internal_key, virtual void SeekForPrev(const Slice& internal_key,
const char* memtable_key) = 0; const char* memtable_key) = 0;

View File

@ -512,6 +512,10 @@ class CompactionService : public Customizable {
return CompactionServiceJobStatus::kUseLocal; return CompactionServiceJobStatus::kUseLocal;
} }
// Optional callback function upon Installation.
virtual void OnInstallation(const std::string& /*scheduled_job_id*/,
CompactionServiceJobStatus /*status*/) {}
// Deprecated. Please implement Schedule() and Wait() API to handle remote // Deprecated. Please implement Schedule() and Wait() API to handle remote
// compaction // compaction
@ -1434,7 +1438,17 @@ struct DBOptions {
// For example, if an SST or blob file referenced by the MANIFEST is missing, // For example, if an SST or blob file referenced by the MANIFEST is missing,
// BER might be able to find a set of files corresponding to an old "point in // BER might be able to find a set of files corresponding to an old "point in
// time" version of the column family, possibly from an older MANIFEST // time" version of the column family, possibly from an older MANIFEST
// file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are // file.
  // Besides a complete "point in time" version, an incomplete version missing
  // only a suffix of L0 files can also be recovered to, as long as the
  // versioning history doesn't include an atomic flush. From the user's
  // perspective, missing a suffix of L0 files means missing the most recently
  // written data, so the remaining files still present a valid, though older,
  // point-in-time view. This is not done when atomic flush is involved,
  // because atomic flush guarantees a consistent view across column families
  // and that cannot be guaranteed when recovering an incomplete version.
// Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
// either ignored or replaced with BER, or quietly fixed regardless of BER // either ignored or replaced with BER, or quietly fixed regardless of BER
// setting. BER does require at least one valid MANIFEST to recover to a // setting. BER does require at least one valid MANIFEST to recover to a
// non-trivial DB state, unlike `ldb repair`. // non-trivial DB state, unlike `ldb repair`.
@ -1566,6 +1580,16 @@ struct DBOptions {
// Default 100ms // Default 100ms
uint64_t follower_catchup_retry_wait_ms = 100; uint64_t follower_catchup_retry_wait_ms = 100;
// When DB files other than SST, blob and WAL files are created, use this
// filesystem temperature. (See also `wal_write_temperature` and various
// `*_temperature` CF options.) When not `kUnknown`, this overrides any
// temperature set by OptimizeForManifestWrite functions.
Temperature metadata_write_temperature = Temperature::kUnknown;
// Use this filesystem temperature when creating WAL files. When not
// `kUnknown`, this overrides any temperature set by OptimizeForLogWrite
// functions.
Temperature wal_write_temperature = Temperature::kUnknown;
// End EXPERIMENTAL // End EXPERIMENTAL
}; };
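A hedged configuration sketch for the two new temperature options, assuming a FileSystem that actually interprets Temperature (the default FileSystem treats it as a no-op):

  rocksdb::Options options;
  // Metadata files (MANIFEST, CURRENT, IDENTITY, OPTIONS, ...) go to hot storage.
  options.metadata_write_temperature = rocksdb::Temperature::kHot;
  // Write-ahead logs go to warm storage.
  options.wal_write_temperature = rocksdb::Temperature::kWarm;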
@ -2107,6 +2131,8 @@ struct CompactRangeOptions {
// IngestExternalFileOptions is used by IngestExternalFile() // IngestExternalFileOptions is used by IngestExternalFile()
struct IngestExternalFileOptions { struct IngestExternalFileOptions {
// Can be set to true to move the files instead of copying them. // Can be set to true to move the files instead of copying them.
// Note that original file links will be removed after successful ingestion,
// unless `allow_db_generated_files` is true.
bool move_files = false; bool move_files = false;
// If set to true, ingestion falls back to copy when move fails. // If set to true, ingestion falls back to copy when move fails.
bool failed_move_fall_back_to_copy = true; bool failed_move_fall_back_to_copy = true;
@ -2180,22 +2206,19 @@ struct IngestExternalFileOptions {
// XXX: "bottommost" is obsolete/confusing terminology to refer to last level // XXX: "bottommost" is obsolete/confusing terminology to refer to last level
bool fail_if_not_bottommost_level = false; bool fail_if_not_bottommost_level = false;
// EXPERIMENTAL // EXPERIMENTAL
// If set to true, ingestion will // Enables ingestion of files not generated by SstFileWriter. When true:
// - allow the files to not be generated by SstFileWriter, and // - Allows files to be ingested when their cf_id doesn't match the CF they
// - ignore cf_id mismatch between cf_id in the files and the CF they are // are being ingested into.
// being ingested into. // - Preserves original file links after successful ingestion when
// // `move_files = true`.
// REQUIRES: // REQUIREMENTS:
// - files to be ingested do not overlap with existing keys. // - Ingested files must not overlap with existing keys.
// - write_global_seqno = false // - `write_global_seqno` must be false.
// - move_files = false // - All keys in ingested files should have sequence number 0. We fail
// // ingestion if any sequence numbers is non-zero.
// Warning: This ONLY works for SST files where all keys have sequence number // WARNING: If a DB contains ingested files generated by another DB/CF,
// zero and with no duplicated user keys (this should be guaranteed if the // RepairDB() may not recover these files correctly, potentially leading to
// file is generated by a DB with zero as the largest sequence number). // data loss.
// We scan the entire SST files to validate sequence numbers.
// Warning: If a DB contains ingested files generated by another DB/CF,
// RepairDB() may not correctly recover these files. It may lose these files.
bool allow_db_generated_files = false; bool allow_db_generated_files = false;
}; };
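A hedged sketch of ingesting DB-generated files under the requirements listed above; `db`, `cf`, and the file path are assumed to exist already:

  rocksdb::IngestExternalFileOptions ifo;
  ifo.allow_db_generated_files = true;  // files came from another DB/CF
  ifo.write_global_seqno = false;       // required by this option
  ifo.move_files = true;                // original links are preserved in this mode
  rocksdb::Status s =
      db->IngestExternalFile(cf, {"/backup/cf1/000123.sst"}, ifo);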

View File

@ -529,6 +529,11 @@ enum Tickers : uint32_t {
// Footer corruption detected when opening an SST file for reading // Footer corruption detected when opening an SST file for reading
SST_FOOTER_CORRUPTION_COUNT, SST_FOOTER_CORRUPTION_COUNT,
// Counters for file read retries with the verify_and_reconstruct_read
// file system option after detecting a checksum mismatch
FILE_READ_CORRUPTION_RETRY_COUNT,
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
TICKER_ENUM_MAX TICKER_ENUM_MAX
}; };
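A short sketch of reading the new counters, assuming statistics are enabled on the DB options; the ticker names come from this change:

  std::shared_ptr<rocksdb::Statistics> stats = rocksdb::CreateDBStatistics();
  rocksdb::Options options;
  options.statistics = stats;
  // ... open the DB and run a read workload ...
  uint64_t retries =
      stats->getTickerCount(rocksdb::FILE_READ_CORRUPTION_RETRY_COUNT);
  uint64_t successes =
      stats->getTickerCount(rocksdb::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);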

View File

@ -291,15 +291,11 @@ struct BlockBasedTableOptions {
// Same as block_restart_interval but used for the index block. // Same as block_restart_interval but used for the index block.
int index_block_restart_interval = 1; int index_block_restart_interval = 1;
// Block size for partitioned metadata. Currently applied to indexes when // Target block size for partitioned metadata. Currently applied to indexes
// kTwoLevelIndexSearch is used and to filters when partition_filters is used. // when kTwoLevelIndexSearch is used and to filters when partition_filters is
// Note: Since in the current implementation the filters and index partitions // used. When decouple_partitioned_filters=false (original behavior), there is
// are aligned, an index/filter block is created when either index or filter // much more deviation from this target size. See the comment on
// block size reaches the specified limit. // decouple_partitioned_filters.
// Note: this limit is currently applied to only index blocks; a filter
// partition is cut right after an index block is cut
// TODO(myabandeh): remove the note above when filter partitions are cut
// separately
uint64_t metadata_block_size = 4096; uint64_t metadata_block_size = 4096;
// `cache_usage_options` allows users to specify the default // `cache_usage_options` allows users to specify the default
@ -398,6 +394,23 @@ struct BlockBasedTableOptions {
// block cache even when cache_index_and_filter_blocks=false. // block cache even when cache_index_and_filter_blocks=false.
bool partition_filters = false; bool partition_filters = false;
// When both partitioned indexes and partitioned filters are enabled,
// this enables independent partitioning boundaries between the two. Most
// notably, this enables these metadata blocks to hit their target size much
// more accurately, as there is often a disparity between index sizes and
// filter sizes. This should reduce fragmentation and metadata overheads in
// the block cache, as well as treat blocks more fairly for cache eviction
// purposes.
//
// There are no SST format compatibility issues with this option. (All
// versions of RocksDB able to read partitioned filters are able to read
// decoupled partitioned filters.)
//
// decouple_partitioned_filters = false is the original behavior, because of
// limitations in the initial implementation, and the new behavior
// decouple_partitioned_filters = true is expected to become the new default.
bool decouple_partitioned_filters = false;
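  A hedged sketch of enabling decoupled filter partitions together with the options they depend on:

    rocksdb::BlockBasedTableOptions bbto;
    bbto.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
    bbto.partition_filters = true;
    bbto.metadata_block_size = 4096;           // target size for each partition
    bbto.decouple_partitioned_filters = true;  // cut filter partitions independently
    bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10.0));
    rocksdb::Options options;
    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));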
// Option to generate Bloom/Ribbon filters that minimize memory // Option to generate Bloom/Ribbon filters that minimize memory
// internal fragmentation. // internal fragmentation.
// //
@ -679,6 +692,11 @@ struct BlockBasedTablePropertyNames {
static const std::string kWholeKeyFiltering; static const std::string kWholeKeyFiltering;
// value is "1" for true and "0" for false. // value is "1" for true and "0" for false.
static const std::string kPrefixFiltering; static const std::string kPrefixFiltering;
// Set to "1" when partitioned filters are decoupled from partitioned indexes.
// This metadata is recorded in case a read-time optimization for coupled
// filter+index partitioning is ever developed; that optimization/assumption
// would be disabled when this is set.
static const std::string kDecoupledPartitionedFilters;
}; };
// Create default block based table factory. // Create default block based table factory.

View File

@ -74,6 +74,7 @@ struct TablePropertiesNames {
static const std::string kSequenceNumberTimeMapping; static const std::string kSequenceNumberTimeMapping;
static const std::string kTailStartOffset; static const std::string kTailStartOffset;
static const std::string kUserDefinedTimestampsPersisted; static const std::string kUserDefinedTimestampsPersisted;
static const std::string kKeyLargestSeqno;
}; };
// `TablePropertiesCollector` provides the mechanism for users to collect // `TablePropertiesCollector` provides the mechanism for users to collect
@ -125,6 +126,8 @@ class TablePropertiesCollector {
// Finish() will be called when a table has already been built and is ready // Finish() will be called when a table has already been built and is ready
// for writing the properties block. // for writing the properties block.
// It will be called only once by RocksDB internal. // It will be called only once by RocksDB internal.
// When the returned Status is not OK, the collected properties will not be
// written to the file's property block.
// //
// @params properties User will add their collected statistics to // @params properties User will add their collected statistics to
// `properties`. // `properties`.
@ -132,6 +135,7 @@ class TablePropertiesCollector {
// Return the human-readable properties, where the key is property name and // Return the human-readable properties, where the key is property name and
// the value is the human-readable form of value. // the value is the human-readable form of value.
// Returned properties are used for logging.
// It will only be called after Finish() has been called by RocksDB internal. // It will only be called after Finish() has been called by RocksDB internal.
virtual UserCollectedProperties GetReadableProperties() const = 0; virtual UserCollectedProperties GetReadableProperties() const = 0;
@ -290,6 +294,12 @@ struct TableProperties {
// it's explicitly written to meta properties block. // it's explicitly written to meta properties block.
uint64_t user_defined_timestamps_persisted = 1; uint64_t user_defined_timestamps_persisted = 1;
// The largest sequence number of keys in this file.
// UINT64_MAX means unknown.
// Only written to properties block if known (should be known unless the
// table is empty).
uint64_t key_largest_seqno = UINT64_MAX;
// DB identity // DB identity
// db_id is an identifier generated the first time the DB is created // db_id is an identifier generated the first time the DB is created
// If DB identity is unset or unassigned, `db_id` will be an empty string. // If DB identity is unset or unassigned, `db_id` will be an empty string.

View File

@ -110,4 +110,16 @@ enum class WriteStallCondition {
kNormal, kNormal,
}; };
// Temperature of a file. Used to pass to FileSystem for a different
// placement and/or coding.
// Reserve some numbers in the middle, in case we need to insert new tier
// there.
enum class Temperature : uint8_t {
kUnknown = 0,
kHot = 0x04,
kWarm = 0x08,
kCold = 0x0C,
kLastTemperature,
};
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -323,6 +323,22 @@ struct TransactionOptions {
// description. If a negative value is specified, then the default value from // description. If a negative value is specified, then the default value from
// TransactionDBOptions is used. // TransactionDBOptions is used.
int64_t write_batch_flush_threshold = -1; int64_t write_batch_flush_threshold = -1;
// DO NOT USE.
// This is only a temporary option dedicated for MyRocks that will soon be
// removed.
// In normal use cases, meta info like column family's timestamp size is
// tracked at the transaction layer, so it's not necessary and even
// detrimental to track such info inside the internal WriteBatch because it
// may let anti-patterns like bypassing Transaction write APIs and directly
// write to its internal `WriteBatch` retrieved like this:
// https://github.com/facebook/mysql-5.6/blob/fb-mysql-8.0.32/storage/rocksdb/ha_rocksdb.cc#L4949-L4950
  // Setting this option to true keeps the aforementioned use case working
  // until it's refactored out.
  // When this flag is enabled, we also intentionally only track the timestamp
  // size in the APIs that MyRocks currently uses, including Put, Merge, Delete,
  // DeleteRange, and SingleDelete.
bool write_batch_track_timestamp_size = false;
}; };
// The per-write optimizations that do not involve transactions. TransactionDB // The per-write optimizations that do not involve transactions. TransactionDB

View File

@ -12,7 +12,7 @@
// NOTE: in 'main' development branch, this should be the *next* // NOTE: in 'main' development branch, this should be the *next*
// minor or major version number planned for release. // minor or major version number planned for release.
#define ROCKSDB_MAJOR 9 #define ROCKSDB_MAJOR 9
#define ROCKSDB_MINOR 6 #define ROCKSDB_MINOR 7
#define ROCKSDB_PATCH 0 #define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with // Do not use these. We made the mistake of declaring macros starting with

View File

@ -30,7 +30,7 @@
#include <functional> #include <functional>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <unordered_map>
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/write_batch_base.h" #include "rocksdb/write_batch_base.h"
@ -437,6 +437,30 @@ class WriteBatch : public WriteBatchBase {
Status UpdateTimestamps(const Slice& ts, Status UpdateTimestamps(const Slice& ts,
std::function<size_t(uint32_t /*cf*/)> ts_sz_func); std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
// TODO: remove these internal APIs after MyRocks refactor to not directly
// write to a `WriteBatch` retrieved from `Transaction` via
// `Transaction::GetWriteBatch`.
void SetTrackTimestampSize(bool track_timestamp_size) {
track_timestamp_size_ = track_timestamp_size;
}
inline void MaybeTrackTimestampSize(uint32_t column_family_id, size_t ts_sz) {
if (!track_timestamp_size_) {
return;
}
auto iter = cf_id_to_ts_sz_.find(column_family_id);
if (iter == cf_id_to_ts_sz_.end()) {
cf_id_to_ts_sz_.emplace(column_family_id, ts_sz);
}
}
// Return a mapping from column family id to timestamp size of all the column
// families involved in this WriteBatch.
const std::unordered_map<uint32_t, size_t>& GetColumnFamilyToTimestampSize() {
return cf_id_to_ts_sz_;
}
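  A hedged sketch of this temporary tracking path, using only the APIs added here; `cf` is assumed to be a `ColumnFamilyHandle*` for a column family with a timestamp-enabled comparator, and in practice the flag is driven by TransactionOptions::write_batch_track_timestamp_size rather than set directly:

    rocksdb::WriteBatch wb;
    wb.SetTrackTimestampSize(true);
    // Put/Merge/Delete/DeleteRange/SingleDelete record each column family's
    // timestamp size on first use when tracking is enabled.
    rocksdb::Status s = wb.Put(cf, "key", "value");
    for (const auto& [cf_id, ts_sz] : wb.GetColumnFamilyToTimestampSize()) {
      // e.g. use cf_id and ts_sz to update the timestamps later
      (void)cf_id;
      (void)ts_sz;
    }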
// Verify the per-key-value checksums of this write batch. // Verify the per-key-value checksums of this write batch.
// Corruption status will be returned if the verification fails. // Corruption status will be returned if the verification fails.
// If this write batch does not have per-key-value checksum, // If this write batch does not have per-key-value checksum,
@ -511,6 +535,10 @@ class WriteBatch : public WriteBatchBase {
size_t default_cf_ts_sz_ = 0; size_t default_cf_ts_sz_ = 0;
bool track_timestamp_size_ = false;
std::unordered_map<uint32_t, size_t> cf_id_to_ts_sz_;
protected: protected:
std::string rep_; // See comment in write_batch.cc for the format of rep_ std::string rep_; // See comment in write_batch.cc for the format of rep_
}; };

View File

@ -5317,6 +5317,10 @@ class TickerTypeJni {
return -0x53; return -0x53;
case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT: case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT:
return -0x55; return -0x55;
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT:
return -0x56;
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
return -0x57;
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
// -0x54 is the max value at this time. Since these values are exposed // -0x54 is the max value at this time. Since these values are exposed
// directly to Java clients, we'll keep the value the same till the next // directly to Java clients, we'll keep the value the same till the next
@ -5774,6 +5778,11 @@ class TickerTypeJni {
return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS; return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS;
case -0x55: case -0x55:
return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT; return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT;
case -0x56:
return ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT;
case -0x57:
return ROCKSDB_NAMESPACE::Tickers::
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
case -0x54: case -0x54:
// -0x54 is the max value at this time. Since these values are exposed // -0x54 is the max value at this time. Since these values are exposed
// directly to Java clients, we'll keep the value the same till the next // directly to Java clients, we'll keep the value the same till the next

View File

@ -878,6 +878,10 @@ public enum TickerType {
SST_FOOTER_CORRUPTION_COUNT((byte) -0x55), SST_FOOTER_CORRUPTION_COUNT((byte) -0x55),
FILE_READ_CORRUPTION_RETRY_COUNT((byte) -0x56),
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
TICKER_ENUM_MAX((byte) -0x54); TICKER_ENUM_MAX((byte) -0x54);
private final byte value; private final byte value;

View File

@ -52,6 +52,7 @@
#include "port/likely.h" #include "port/likely.h"
#include "port/port.h" #include "port/port.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/random.h" #include "util/random.h"
@ -169,13 +170,20 @@ class InlineSkipList {
// REQUIRES: Valid() // REQUIRES: Valid()
void Next(); void Next();
[[nodiscard]] Status NextAndValidate(bool allow_data_in_errors);
// Advances to the previous position. // Advances to the previous position.
// REQUIRES: Valid() // REQUIRES: Valid()
void Prev(); void Prev();
[[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors);
// Advance to the first entry with a key >= target // Advance to the first entry with a key >= target
void Seek(const char* target); void Seek(const char* target);
[[nodiscard]] Status SeekAndValidate(const char* target,
bool allow_data_in_errors);
// Retreat to the last entry with a key <= target // Retreat to the last entry with a key <= target
void SeekForPrev(const char* target); void SeekForPrev(const char* target);
@ -237,21 +245,20 @@ class InlineSkipList {
bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
// Returns the earliest node with a key >= key. // Returns the earliest node with a key >= key.
// Return nullptr if there is no such node. // Returns nullptr if there is no such node.
Node* FindGreaterOrEqual(const char* key) const; // @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// returned and *out_of_order_node will be set to n2.
Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
// Return the latest node with a key < key. // Returns the latest node with a key < key.
// Return head_ if there is no such node. // Returns head_ if there is no such node.
// Fills prev[level] with pointer to previous node at "level" for every // Fills prev[level] with pointer to previous node at "level" for every
// level in [0..max_height_-1], if prev is non-null. // level in [0..max_height_-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev = nullptr) const; // @param out_of_order_node If not null, will validate the order of visited
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
// Return the latest node with a key < key on bottom_level. Start searching // returned and *out_of_order_node will be set to n2.
// from root node on the level below top_level. Node* FindLessThan(const char* key, Node** out_of_order_node) const;
// Fills prev[level] with pointer to previous node at "level" for every
// level in [bottom_level..top_level-1], if prev is non-null.
Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level,
int bottom_level) const;
// Return the last node in the list. // Return the last node in the list.
// Return head_ if list is empty. // Return head_ if list is empty.
@ -274,6 +281,8 @@ class InlineSkipList {
// lowest_level (inclusive). // lowest_level (inclusive).
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
int recompute_level); int recompute_level);
static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
}; };
// Implementation details follow // Implementation details follow
@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() {
node_ = node_->Next(0); node_ = node_->Next(0);
} }
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
bool allow_data_in_errors) {
assert(Valid());
Node* prev_node = node_;
node_ = node_->Next(0);
// Verify that keys are increasing.
if (prev_node != list_->head_ && node_ != nullptr &&
list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
Node* node = node_;
// invalidates the iterator
node_ = nullptr;
return Corruption(prev_node, node, allow_data_in_errors);
}
return Status::OK();
}
template <class Comparator> template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Prev() { inline void InlineSkipList<Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the // Instead of using explicit "prev" links, we just search for the
// last node that falls before key. // last node that falls before key.
assert(Valid()); assert(Valid());
node_ = list_->FindLessThan(node_->Key()); node_ = list_->FindLessThan(node_->Key(), nullptr);
if (node_ == list_->head_) { if (node_ == list_->head_) {
node_ = nullptr; node_ = nullptr;
} }
} }
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
const bool allow_data_in_errors) {
assert(Valid());
// Skip list validation is done in FindLessThan().
Node* out_of_order_node = nullptr;
node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
if (out_of_order_node) {
Node* node = node_;
node_ = nullptr;
return Corruption(node, out_of_order_node, allow_data_in_errors);
}
if (node_ == list_->head_) {
node_ = nullptr;
}
return Status::OK();
}
template <class Comparator> template <class Comparator>
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) { inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
node_ = list_->FindGreaterOrEqual(target); node_ = list_->FindGreaterOrEqual(target, nullptr);
}
template <class Comparator>
inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
const char* target, const bool allow_data_in_errors) {
Node* out_of_order_node = nullptr;
node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
if (out_of_order_node) {
Node* node = node_;
node_ = nullptr;
return Corruption(node, out_of_order_node, allow_data_in_errors);
}
return Status::OK();
} }
template <class Comparator> template <class Comparator>
@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() {
rnd->Next() < kScaledInverseBranching_) { rnd->Next() < kScaledInverseBranching_) {
height++; height++;
} }
TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
assert(height > 0); assert(height > 0);
assert(height <= kMaxHeight_); assert(height <= kMaxHeight_);
assert(height <= kMaxPossibleHeight); assert(height <= kMaxPossibleHeight);
@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
template <class Comparator> template <class Comparator>
typename InlineSkipList<Comparator>::Node* typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const { InlineSkipList<Comparator>::FindGreaterOrEqual(
const char* key, Node** const out_of_order_node) const {
// Note: It looks like we could reduce duplication by implementing // Note: It looks like we could reduce duplication by implementing
// this function as FindLessThan(key)->Next(0), but we wouldn't be able // this function as FindLessThan(key)->Next(0), but we wouldn't be able
// to exit early on equality and the result wouldn't even be correct. // to exit early on equality and the result wouldn't even be correct.
@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
Node* next = x->Next(level); Node* next = x->Next(level);
if (next != nullptr) { if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1); PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
} }
// Make sure the lists are sorted // Make sure the lists are sorted
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
template <class Comparator> template <class Comparator>
typename InlineSkipList<Comparator>::Node* typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const { InlineSkipList<Comparator>::FindLessThan(const char* key,
return FindLessThan(key, prev, head_, GetMaxHeight(), 0); Node** const out_of_order_node) const {
} int level = GetMaxHeight() - 1;
assert(level >= 0);
template <class Comparator> Node* x = head_;
typename InlineSkipList<Comparator>::Node*
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* root, int top_level,
int bottom_level) const {
assert(top_level > bottom_level);
int level = top_level - 1;
Node* x = root;
// KeyIsAfter(key, last_not_after) is definitely false // KeyIsAfter(key, last_not_after) is definitely false
Node* last_not_after = nullptr; Node* last_not_after = nullptr;
const DecodedKey key_decoded = compare_.decode_key(key); const DecodedKey key_decoded = compare_.decode_key(key);
@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
Node* next = x->Next(level); Node* next = x->Next(level);
if (next != nullptr) { if (next != nullptr) {
PREFETCH(next->Next(level), 0, 1); PREFETCH(next->Next(level), 0, 1);
if (out_of_order_node && x != head_ &&
compare_(x->Key(), next->Key()) >= 0) {
*out_of_order_node = next;
return x;
}
} }
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
assert(x == head_ || KeyIsAfterNode(key_decoded, x)); assert(x == head_ || KeyIsAfterNode(key_decoded, x));
@ -537,10 +599,7 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
assert(next != nullptr); assert(next != nullptr);
x = next; x = next;
} else { } else {
if (prev != nullptr) { if (level == 0) {
prev[level] = x;
}
if (level == bottom_level) {
return x; return x;
} else { } else {
// Switch to next list, reuse KeyIsAfterNode() result // Switch to next list, reuse KeyIsAfterNode() result
@ -910,12 +969,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
while (true) { while (true) {
// Checking for duplicate keys on the level 0 is sufficient // Checking for duplicate keys on the level 0 is sufficient
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
// duplicate key // duplicate key
return false; return false;
} }
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
// duplicate key // duplicate key
return false; return false;
} }
@ -953,12 +1012,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
} }
// Checking for duplicate keys on the level 0 is sufficient // Checking for duplicate keys on the level 0 is sufficient
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
// duplicate key // duplicate key
return false; return false;
} }
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
// duplicate key // duplicate key
return false; return false;
} }
@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
template <class Comparator> template <class Comparator>
bool InlineSkipList<Comparator>::Contains(const char* key) const { bool InlineSkipList<Comparator>::Contains(const char* key) const {
Node* x = FindGreaterOrEqual(key); Node* x = FindGreaterOrEqual(key, nullptr);
if (x != nullptr && Equal(key, x->Key())) { if (x != nullptr && Equal(key, x->Key())) {
return true; return true;
} else { } else {
@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const {
} }
} }
template <class Comparator>
Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
bool allow_data_in_errors) {
std::string msg = "Out-of-order keys found in skiplist.";
if (allow_data_in_errors) {
msg.append(" prev key: " + Slice(prev->Key()).ToString(true));
msg.append(" next key: " + Slice(next->Key()).ToString(true));
}
return Status::Corruption(msg);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
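
To make the new validation hooks concrete, here is a minimal, self-contained sketch (illustrative code, not the RocksDB InlineSkipList API) of the adjacent-key ordering check that NextAndValidate/PrevAndValidate/SeekAndValidate perform, including the allow_data_in_errors handling used by Corruption():

#include <cstddef>
#include <string>
#include <vector>

// Illustrative stand-in for Status; the real code returns Status::Corruption().
struct SimpleStatus {
  bool ok = true;
  std::string message;
};

// Walk adjacent keys and report the first out-of-order pair, mirroring the
// compare_(prev->Key(), next->Key()) >= 0 check added above. Key bytes are
// only included in the message when allow_data_in_errors is set.
template <class ThreeWayCompare>
SimpleStatus ValidateOrdering(const std::vector<std::string>& keys,
                              ThreeWayCompare cmp, bool allow_data_in_errors) {
  for (size_t i = 0; i + 1 < keys.size(); ++i) {
    if (cmp(keys[i], keys[i + 1]) >= 0) {
      std::string msg = "Out-of-order keys found in skiplist.";
      if (allow_data_in_errors) {
        msg += " prev key: " + keys[i] + " next key: " + keys[i + 1];
      }
      return {false, msg};
    }
  }
  return {true, ""};
}

A comparator such as [](const std::string& a, const std::string& b) { return a.compare(b); } supplies the three-way result the check expects.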


@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep {
} }
} }
Status GetAndValidate(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry),
bool allow_data_in_errors) override {
SkipListRep::Iterator iter(&skip_list_);
Slice dummy_slice;
Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
allow_data_in_errors);
for (; iter.Valid() && status.ok() &&
callback_func(callback_args, iter.key());
status = iter.NextAndValidate(allow_data_in_errors)) {
}
return status;
}
uint64_t ApproximateNumEntries(const Slice& start_ikey, uint64_t ApproximateNumEntries(const Slice& start_ikey,
const Slice& end_ikey) override { const Slice& end_ikey) override {
std::string tmp; std::string tmp;
@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep {
// Returns the key at the current position. // Returns the key at the current position.
// REQUIRES: Valid() // REQUIRES: Valid()
const char* key() const override { return iter_.key(); } const char* key() const override {
assert(Valid());
return iter_.key();
}
// Advances to the next position. // Advances to the next position.
// REQUIRES: Valid() // REQUIRES: Valid()
void Next() override { iter_.Next(); } void Next() override {
assert(Valid());
iter_.Next();
}
// Advances to the previous position. // Advances to the previous position.
// REQUIRES: Valid() // REQUIRES: Valid()
void Prev() override { iter_.Prev(); } void Prev() override {
assert(Valid());
iter_.Prev();
}
// Advance to the first entry with a key >= target // Advance to the first entry with a key >= target
void Seek(const Slice& user_key, const char* memtable_key) override { void Seek(const Slice& user_key, const char* memtable_key) override {
@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep {
// Final state of iterator is Valid() iff list is not empty. // Final state of iterator is Valid() iff list is not empty.
void SeekToLast() override { iter_.SeekToLast(); } void SeekToLast() override { iter_.SeekToLast(); }
Status NextAndValidate(bool allow_data_in_errors) override {
assert(Valid());
return iter_.NextAndValidate(allow_data_in_errors);
}
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
bool allow_data_in_errors) override {
if (memtable_key != nullptr) {
return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
} else {
return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
allow_data_in_errors);
}
}
Status PrevAndValidate(bool allow_data_in_error) override {
assert(Valid());
return iter_.PrevAndValidate(allow_data_in_error);
}
protected: protected:
std::string tmp_; // For passing to EncodeKey std::string tmp_; // For passing to EncodeKey
}; };
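
A hypothetical caller-side sketch of how a Get-style lookup might drive the GetAndValidate() entry point added above; LookupState and SaveFirstMatch are invented names, and the commented-out call assumes a MemTableRep pointer and LookupKey from the surrounding code:

// The callback contract matches the existing Get(): return true to keep
// scanning, false to stop. GetAndValidate() additionally surfaces a
// Corruption status if the skip list is found to be out of order.
struct LookupState {
  bool found = false;
};

static bool SaveFirstMatch(void* arg, const char* /*entry*/) {
  auto* state = static_cast<LookupState*>(arg);
  state->found = true;
  return false;  // stop after the first entry; a real Get also checks user keys
}

// LookupState state;
// Status s = memtable_rep->GetAndValidate(lookup_key, &state, &SaveFirstMatch,
//                                         /*allow_data_in_errors=*/false);
// if (s.IsCorruption()) { /* report memtable corruption instead of a miss */ }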


@ -266,6 +266,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"}, {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
{PREFETCH_HITS, "rocksdb.prefetch.hits"}, {PREFETCH_HITS, "rocksdb.prefetch.hits"},
{SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"}, {SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"},
{FILE_READ_CORRUPTION_RETRY_COUNT,
"rocksdb.file.read.corruption.retry.count"},
{FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
"rocksdb.file.read.corruption.retry.success.count"},
}; };
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = { const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
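
Assuming the two tickers registered above are present in the build, a small sketch of reading them from the statistics object attached to DBOptions:

#include <cstdint>
#include <iostream>
#include <memory>

#include "rocksdb/statistics.h"

// Reads the retry counters after a workload has run. getTickerCount() is the
// standard Statistics accessor; the ticker enums come from the change above.
void LogReadCorruptionRetries(
    const std::shared_ptr<rocksdb::Statistics>& stats) {
  if (stats == nullptr) {
    return;
  }
  const uint64_t retries =
      stats->getTickerCount(rocksdb::FILE_READ_CORRUPTION_RETRY_COUNT);
  const uint64_t successes =
      stats->getTickerCount(rocksdb::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
  std::cout << "file read corruption retries=" << retries
            << " successful retries=" << successes << std::endl;
}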


@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableCFOptions, block_protection_bytes_per_key), {offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
OptionType::kUInt8T, OptionVerificationType::kNormal, OptionType::kUInt8T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}}, OptionTypeFlags::kMutable}},
{"paranoid_memory_checks",
{offsetof(struct MutableCFOptions, paranoid_memory_checks),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{kOptNameCompOpts, {kOptNameCompOpts,
OptionTypeInfo::Struct( OptionTypeInfo::Struct(
kOptNameCompOpts, &compression_options_type_info, kOptNameCompOpts, &compression_options_type_info,
@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const {
ttl); ttl);
ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64, ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64,
periodic_compaction_seconds); periodic_compaction_seconds);
ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d",
paranoid_memory_checks);
std::string result; std::string result;
char buf[10]; char buf[10];
for (const auto m : max_bytes_for_level_multiplier_additional) { for (const auto m : max_bytes_for_level_multiplier_additional) {
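
Since paranoid_memory_checks is registered above with OptionTypeFlags::kMutable, it should be adjustable per column family at runtime; a sketch of doing so via the standard SetOptions() API (treat the flow as illustrative):

#include <unordered_map>

#include "rocksdb/db.h"

// Enables the extra memtable/skip-list validation for one column family.
rocksdb::Status EnableParanoidMemoryChecks(rocksdb::DB* db,
                                           rocksdb::ColumnFamilyHandle* cf) {
  return db->SetOptions(cf, {{"paranoid_memory_checks", "true"}});
}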


@ -168,6 +168,7 @@ struct MutableCFOptions {
memtable_protection_bytes_per_key( memtable_protection_bytes_per_key(
options.memtable_protection_bytes_per_key), options.memtable_protection_bytes_per_key),
block_protection_bytes_per_key(options.block_protection_bytes_per_key), block_protection_bytes_per_key(options.block_protection_bytes_per_key),
paranoid_memory_checks(options.paranoid_memory_checks),
sample_for_compression( sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here? options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level), compression_per_level(options.compression_per_level),
@ -317,6 +318,7 @@ struct MutableCFOptions {
Temperature default_write_temperature; Temperature default_write_temperature;
uint32_t memtable_protection_bytes_per_key; uint32_t memtable_protection_bytes_per_key;
uint8_t block_protection_bytes_per_key; uint8_t block_protection_bytes_per_key;
bool paranoid_memory_checks;
uint64_t sample_for_compression; uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level; std::vector<CompressionType> compression_per_level;


@ -576,6 +576,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct ImmutableDBOptions, follower_catchup_retry_wait_ms), {offsetof(struct ImmutableDBOptions, follower_catchup_retry_wait_ms),
OptionType::kUInt64T, OptionVerificationType::kNormal, OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"metadata_write_temperature",
{offsetof(struct ImmutableDBOptions, metadata_write_temperature),
OptionType::kTemperature, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"wal_write_temperature",
{offsetof(struct ImmutableDBOptions, wal_write_temperature),
OptionType::kTemperature, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
}; };
const std::string OptionsHelper::kDBOptionsName = "DBOptions"; const std::string OptionsHelper::kDBOptionsName = "DBOptions";
@ -778,7 +786,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
follower_refresh_catchup_period_ms( follower_refresh_catchup_period_ms(
options.follower_refresh_catchup_period_ms), options.follower_refresh_catchup_period_ms),
follower_catchup_retry_count(options.follower_catchup_retry_count), follower_catchup_retry_count(options.follower_catchup_retry_count),
follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms) { follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms),
metadata_write_temperature(options.metadata_write_temperature),
wal_write_temperature(options.wal_write_temperature) {
fs = env->GetFileSystem(); fs = env->GetFileSystem();
clock = env->GetSystemClock().get(); clock = env->GetSystemClock().get();
logger = info_log.get(); logger = info_log.get();
@ -956,6 +966,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
db_host_id.c_str()); db_host_id.c_str());
ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s", ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s",
enforce_single_del_contracts ? "true" : "false"); enforce_single_del_contracts ? "true" : "false");
ROCKS_LOG_HEADER(log, " Options.metadata_write_temperature: %s",
temperature_to_string[metadata_write_temperature].c_str());
ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s",
temperature_to_string[wal_write_temperature].c_str());
} }
bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { bool ImmutableDBOptions::IsWalDirSameAsDBPath() const {
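
A sketch of setting the two DB options introduced above in code before opening a database; the Temperature values are the standard enum members and the field names follow this diff:

#include "rocksdb/options.h"

// Metadata files (e.g. the OPTIONS file written by PersistRocksDBOptions()
// later in this diff) are hinted as cold, WAL files as hot.
rocksdb::Options MakeTemperatureHintedOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.metadata_write_temperature = rocksdb::Temperature::kCold;
  options.wal_write_temperature = rocksdb::Temperature::kHot;
  return options;
}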


@ -103,6 +103,8 @@ struct ImmutableDBOptions {
uint64_t follower_refresh_catchup_period_ms; uint64_t follower_refresh_catchup_period_ms;
uint64_t follower_catchup_retry_count; uint64_t follower_catchup_retry_count;
uint64_t follower_catchup_retry_wait_ms; uint64_t follower_catchup_retry_wait_ms;
Temperature metadata_write_temperature;
Temperature wal_write_temperature;
// Beginning convenience/helper objects that are not part of the base // Beginning convenience/helper objects that are not part of the base
// DBOptions // DBOptions


@ -180,6 +180,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
options.enforce_single_del_contracts = options.enforce_single_del_contracts =
immutable_db_options.enforce_single_del_contracts; immutable_db_options.enforce_single_del_contracts;
options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc; options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc;
options.follower_refresh_catchup_period_ms =
immutable_db_options.follower_refresh_catchup_period_ms;
options.follower_catchup_retry_count =
immutable_db_options.follower_catchup_retry_count;
options.follower_catchup_retry_wait_ms =
immutable_db_options.follower_catchup_retry_wait_ms;
options.metadata_write_temperature =
immutable_db_options.metadata_write_temperature;
options.wal_write_temperature = immutable_db_options.wal_write_temperature;
return options; return options;
} }
@ -213,6 +222,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
moptions.memtable_protection_bytes_per_key; moptions.memtable_protection_bytes_per_key;
cf_opts->block_protection_bytes_per_key = cf_opts->block_protection_bytes_per_key =
moptions.block_protection_bytes_per_key; moptions.block_protection_bytes_per_key;
cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
cf_opts->bottommost_file_compaction_delay = cf_opts->bottommost_file_compaction_delay =
moptions.bottommost_file_compaction_delay; moptions.bottommost_file_compaction_delay;


@ -69,8 +69,9 @@ Status PersistRocksDBOptions(const WriteOptions& write_options,
} }
std::unique_ptr<FSWritableFile> wf; std::unique_ptr<FSWritableFile> wf;
Status s = FileOptions file_options;
fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr); file_options.temperature = db_opt.metadata_write_temperature;
Status s = fs->NewWritableFile(file_name, file_options, &wf, nullptr);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
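
The change above routes the temperature hint through FileOptions; a sketch of a FileSystem wrapper that could observe that hint (the tier-selection logic is hypothetical, and the wrapper simply forwards to the base implementation):

#include <memory>
#include <string>

#include "rocksdb/file_system.h"

// Observes the temperature carried in FileOptions when new writable files are
// created; everything else forwards to the wrapped FileSystem.
class TemperatureLoggingFS : public rocksdb::FileSystemWrapper {
 public:
  explicit TemperatureLoggingFS(const std::shared_ptr<rocksdb::FileSystem>& t)
      : rocksdb::FileSystemWrapper(t) {}
  const char* Name() const override { return "TemperatureLoggingFS"; }

  rocksdb::IOStatus NewWritableFile(
      const std::string& fname, const rocksdb::FileOptions& file_opts,
      std::unique_ptr<rocksdb::FSWritableFile>* result,
      rocksdb::IODebugContext* dbg) override {
    if (file_opts.temperature == rocksdb::Temperature::kCold) {
      // A real implementation might place cold files on cheaper storage.
    }
    return rocksdb::FileSystemWrapper::NewWritableFile(fname, file_opts,
                                                       result, dbg);
  }
};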


@ -188,6 +188,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
"block_size_deviation=8;block_restart_interval=4; " "block_size_deviation=8;block_restart_interval=4; "
"metadata_block_size=1024;" "metadata_block_size=1024;"
"partition_filters=false;" "partition_filters=false;"
"decouple_partitioned_filters=true;"
"optimize_filters_for_memory=true;" "optimize_filters_for_memory=true;"
"use_delta_encoding=true;" "use_delta_encoding=true;"
"index_block_restart_interval=4;" "index_block_restart_interval=4;"
@ -366,7 +367,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"lowest_used_cache_tier=kNonVolatileBlockTier;" "lowest_used_cache_tier=kNonVolatileBlockTier;"
"allow_data_in_errors=false;" "allow_data_in_errors=false;"
"enforce_single_del_contracts=false;" "enforce_single_del_contracts=false;"
"daily_offpeak_time_utc=08:30-19:00;", "daily_offpeak_time_utc=08:30-19:00;"
"follower_refresh_catchup_period_ms=123;"
"follower_catchup_retry_count=456;"
"follower_catchup_retry_wait_ms=789;"
"metadata_write_temperature=kCold;"
"wal_write_temperature=kHot;",
new_options)); new_options));
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
@ -567,7 +573,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"block_protection_bytes_per_key=1;" "block_protection_bytes_per_key=1;"
"memtable_max_range_deletions=999999;" "memtable_max_range_deletions=999999;"
"bottommost_file_compaction_delay=7200;" "bottommost_file_compaction_delay=7200;"
"uncache_aggressiveness=1234;", "uncache_aggressiveness=1234;"
"paranoid_memory_checks=1;",
new_options)); new_options));
ASSERT_NE(new_options->blob_cache.get(), nullptr); ASSERT_NE(new_options->blob_cache.get(), nullptr);
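
The new option names are also parseable outside the test; a sketch using the standard GetDBOptionsFromString() helper with the same string form exercised above:

#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

// Builds a DBOptions with the new temperature settings from their string form.
rocksdb::Status ParseTemperatureDbOptions(rocksdb::DBOptions* out) {
  rocksdb::ConfigOptions config_options;
  rocksdb::DBOptions base;
  return rocksdb::GetDBOptionsFromString(
      config_options, base,
      "metadata_write_temperature=kCold;wal_write_temperature=kHot;", out);
}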


@ -96,7 +96,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
mopt.prefix_extractor.get(), table_opt.whole_key_filtering, mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
filter_bits_builder, table_opt.index_block_restart_interval, filter_bits_builder, table_opt.index_block_restart_interval,
use_delta_encoding_for_index_values, p_index_builder, partition_size, use_delta_encoding_for_index_values, p_index_builder, partition_size,
ts_sz, persist_user_defined_timestamps); ts_sz, persist_user_defined_timestamps,
table_opt.decouple_partitioned_filters);
} else { } else {
return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
table_opt.whole_key_filtering, table_opt.whole_key_filtering,
@ -213,10 +214,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
public: public:
explicit BlockBasedTablePropertiesCollector( explicit BlockBasedTablePropertiesCollector(
BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
bool prefix_filtering) bool prefix_filtering, bool decoupled_partitioned_filters)
: index_type_(index_type), : index_type_(index_type),
whole_key_filtering_(whole_key_filtering), whole_key_filtering_(whole_key_filtering),
prefix_filtering_(prefix_filtering) {} prefix_filtering_(prefix_filtering),
decoupled_partitioned_filters_(decoupled_partitioned_filters) {}
Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
uint64_t /*file_size*/) override { uint64_t /*file_size*/) override {
@ -240,6 +242,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
whole_key_filtering_ ? kPropTrue : kPropFalse}); whole_key_filtering_ ? kPropTrue : kPropFalse});
properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering, properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
prefix_filtering_ ? kPropTrue : kPropFalse}); prefix_filtering_ ? kPropTrue : kPropFalse});
if (decoupled_partitioned_filters_) {
properties->insert(
{BlockBasedTablePropertyNames::kDecoupledPartitionedFilters,
kPropTrue});
}
return Status::OK(); return Status::OK();
} }
@ -257,6 +264,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
BlockBasedTableOptions::IndexType index_type_; BlockBasedTableOptions::IndexType index_type_;
bool whole_key_filtering_; bool whole_key_filtering_;
bool prefix_filtering_; bool prefix_filtering_;
bool decoupled_partitioned_filters_;
}; };
struct BlockBasedTableBuilder::Rep { struct BlockBasedTableBuilder::Rep {
@ -296,7 +304,7 @@ struct BlockBasedTableBuilder::Rep {
std::string index_separator_scratch; std::string index_separator_scratch;
PartitionedIndexBuilder* p_index_builder_ = nullptr; PartitionedIndexBuilder* p_index_builder_ = nullptr;
std::string last_key; std::string last_ikey; // Internal key or empty (unset)
const Slice* first_key_in_next_block = nullptr; const Slice* first_key_in_next_block = nullptr;
CompressionType compression_type; CompressionType compression_type;
uint64_t sample_for_compression; uint64_t sample_for_compression;
@ -594,7 +602,8 @@ struct BlockBasedTableBuilder::Rep {
table_properties_collectors.emplace_back( table_properties_collectors.emplace_back(
new BlockBasedTablePropertiesCollector( new BlockBasedTablePropertiesCollector(
table_options.index_type, table_options.whole_key_filtering, table_options.index_type, table_options.whole_key_filtering,
prefix_extractor != nullptr)); prefix_extractor != nullptr,
table_options.decouple_partitioned_filters));
if (ts_sz > 0 && persist_user_defined_timestamps) { if (ts_sz > 0 && persist_user_defined_timestamps) {
table_properties_collectors.emplace_back( table_properties_collectors.emplace_back(
new TimestampTablePropertiesCollector( new TimestampTablePropertiesCollector(
@ -618,6 +627,9 @@ struct BlockBasedTableBuilder::Rep {
if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
} }
// Default is UINT64_MAX for unknown. Setting it to 0 here
// to allow updating it by taking max in BlockBasedTableBuilder::Add().
props.key_largest_seqno = 0;
if (FormatVersionUsesContextChecksum(table_options.format_version)) { if (FormatVersionUsesContextChecksum(table_options.format_version)) {
// Must be non-zero and semi- or quasi-random // Must be non-zero and semi- or quasi-random
@ -654,6 +666,7 @@ struct BlockBasedTableBuilder::Rep {
}; };
struct BlockBasedTableBuilder::ParallelCompressionRep { struct BlockBasedTableBuilder::ParallelCompressionRep {
// TODO: consider replacing with autovector or similar
// Keys is a wrapper of vector of strings avoiding // Keys is a wrapper of vector of strings avoiding
// releasing string memories during vector clear() // releasing string memories during vector clear()
// in order to save memory allocation overhead // in order to save memory allocation overhead
@ -998,24 +1011,27 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() {
delete rep_; delete rep_;
} }
void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
Rep* r = rep_; Rep* r = rep_;
assert(rep_->state != Rep::State::kClosed); assert(rep_->state != Rep::State::kClosed);
if (!ok()) { if (!ok()) {
return; return;
} }
ValueType value_type = ExtractValueType(key); ValueType value_type;
SequenceNumber seq;
UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
if (IsValueType(value_type)) { if (IsValueType(value_type)) {
#ifndef NDEBUG #ifndef NDEBUG
if (r->props.num_entries > r->props.num_range_deletions) { if (r->props.num_entries > r->props.num_range_deletions) {
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
} }
#endif // !NDEBUG #endif // !NDEBUG
auto should_flush = r->flush_block_policy->Update(key, value); auto should_flush = r->flush_block_policy->Update(ikey, value);
if (should_flush) { if (should_flush) {
assert(!r->data_block.empty()); assert(!r->data_block.empty());
r->first_key_in_next_block = &key; r->first_key_in_next_block = &ikey;
Flush(); Flush();
if (r->state == Rep::State::kBuffered) { if (r->state == Rep::State::kBuffered) {
bool exceeds_buffer_limit = bool exceeds_buffer_limit =
@ -1050,7 +1066,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->IsParallelCompressionEnabled()) { if (r->IsParallelCompressionEnabled()) {
r->pc_rep->curr_block_keys->Clear(); r->pc_rep->curr_block_keys->Clear();
} else { } else {
r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle, r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
r->pending_handle,
&r->index_separator_scratch); &r->index_separator_scratch);
} }
} }
@ -1060,27 +1077,31 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// builder after being added to index builder. // builder after being added to index builder.
if (r->state == Rep::State::kUnbuffered) { if (r->state == Rep::State::kUnbuffered) {
if (r->IsParallelCompressionEnabled()) { if (r->IsParallelCompressionEnabled()) {
r->pc_rep->curr_block_keys->PushBack(key); r->pc_rep->curr_block_keys->PushBack(ikey);
} else { } else {
if (r->filter_builder != nullptr) { if (r->filter_builder != nullptr) {
r->filter_builder->Add( r->filter_builder->AddWithPrevKey(
ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
r->last_ikey.empty()
? Slice{}
: ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
} }
} }
} }
r->data_block.AddWithLastKey(key, value, r->last_key); r->data_block.AddWithLastKey(ikey, value, r->last_ikey);
r->last_key.assign(key.data(), key.size()); r->last_ikey.assign(ikey.data(), ikey.size());
assert(!r->last_ikey.empty());
if (r->state == Rep::State::kBuffered) { if (r->state == Rep::State::kBuffered) {
// Buffered keys will be replayed from data_block_buffers during // Buffered keys will be replayed from data_block_buffers during
// `Finish()` once compression dictionary has been finalized. // `Finish()` once compression dictionary has been finalized.
} else { } else {
if (!r->IsParallelCompressionEnabled()) { if (!r->IsParallelCompressionEnabled()) {
r->index_builder->OnKeyAdded(key); r->index_builder->OnKeyAdded(ikey);
} }
} }
// TODO offset passed in is not accurate for parallel compression case // TODO offset passed in is not accurate for parallel compression case
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors, r->table_properties_collectors,
r->ioptions.logger); r->ioptions.logger);
@ -1094,9 +1115,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) { if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) {
persisted_end = StripTimestampFromUserKey(value, r->ts_sz); persisted_end = StripTimestampFromUserKey(value, r->ts_sz);
} }
r->range_del_block.Add(key, persisted_end); r->range_del_block.Add(ikey, persisted_end);
// TODO offset passed in is not accurate for parallel compression case // TODO offset passed in is not accurate for parallel compression case
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
r->table_properties_collectors, r->table_properties_collectors,
r->ioptions.logger); r->ioptions.logger);
} else { } else {
@ -1108,7 +1129,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
} }
r->props.num_entries++; r->props.num_entries++;
r->props.raw_key_size += key.size(); r->props.raw_key_size += ikey.size();
if (!r->persist_user_defined_timestamps) { if (!r->persist_user_defined_timestamps) {
r->props.raw_key_size -= r->ts_sz; r->props.raw_key_size -= r->ts_sz;
} }
@ -1452,6 +1473,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
Rep* r = rep_; Rep* r = rep_;
ParallelCompressionRep::BlockRepSlot* slot = nullptr; ParallelCompressionRep::BlockRepSlot* slot = nullptr;
ParallelCompressionRep::BlockRep* block_rep = nullptr; ParallelCompressionRep::BlockRep* block_rep = nullptr;
// Starts empty; see FilterBlockBuilder::AddWithPrevKey
std::string prev_block_last_key_no_ts;
while (r->pc_rep->write_queue.pop(slot)) { while (r->pc_rep->write_queue.pop(slot)) {
assert(slot != nullptr); assert(slot != nullptr);
slot->Take(block_rep); slot->Take(block_rep);
@ -1465,13 +1488,20 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
continue; continue;
} }
Slice prev_key_no_ts = prev_block_last_key_no_ts;
for (size_t i = 0; i < block_rep->keys->Size(); i++) { for (size_t i = 0; i < block_rep->keys->Size(); i++) {
auto& key = (*block_rep->keys)[i]; auto& key = (*block_rep->keys)[i];
if (r->filter_builder != nullptr) { if (r->filter_builder != nullptr) {
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
prev_key_no_ts = key_no_ts;
} }
r->index_builder->OnKeyAdded(key); r->index_builder->OnKeyAdded(key);
} }
if (r->filter_builder != nullptr) {
prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
prev_key_no_ts.size());
}
r->pc_rep->file_size_estimator.SetCurrBlockUncompSize( r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
block_rep->data->size()); block_rep->data->size());
@ -1563,6 +1593,13 @@ void BlockBasedTableBuilder::WriteFilterBlock(
// No filter block needed // No filter block needed
return; return;
} }
if (!rep_->last_ikey.empty()) {
// We might have been using AddWithPrevKey, so need PrevKeyBeforeFinish
// to be safe. And because we are re-synchronized after buffered/parallel
// operation, rep_->last_ikey is accurate.
rep_->filter_builder->PrevKeyBeforeFinish(
ExtractUserKeyAndStripTimestamp(rep_->last_ikey, rep_->ts_sz));
}
BlockHandle filter_block_handle; BlockHandle filter_block_handle;
bool is_partitioned_filter = rep_->table_options.partition_filters; bool is_partitioned_filter = rep_->table_options.partition_filters;
if (ok()) { if (ok()) {
@ -1578,9 +1615,10 @@ void BlockBasedTableBuilder::WriteFilterBlock(
// See FilterBlockBuilder::Finish() for more on the difference in // See FilterBlockBuilder::Finish() for more on the difference in
// transferred filter data payload among different FilterBlockBuilder // transferred filter data payload among different FilterBlockBuilder
// subtypes. // subtypes.
std::unique_ptr<const char[]> filter_data; std::unique_ptr<const char[]> filter_owner;
Slice filter_content = Slice filter_content;
rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data); s = rep_->filter_builder->Finish(filter_block_handle, &filter_content,
&filter_owner);
assert(s.ok() || s.IsIncomplete() || s.IsCorruption()); assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
if (s.IsCorruption()) { if (s.IsCorruption()) {
@ -1749,6 +1787,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
rep_->props.user_defined_timestamps_persisted = rep_->props.user_defined_timestamps_persisted =
rep_->persist_user_defined_timestamps; rep_->persist_user_defined_timestamps;
assert(IsEmpty() || rep_->props.key_largest_seqno != UINT64_MAX);
// Add basic properties // Add basic properties
property_block_builder.AddTableProperty(rep_->props); property_block_builder.AddTableProperty(rep_->props);
@ -1976,6 +2015,10 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
for (; iter->Valid(); iter->Next()) { for (; iter->Valid(); iter->Next()) {
Slice key = iter->key(); Slice key = iter->key();
if (r->filter_builder != nullptr) { if (r->filter_builder != nullptr) {
// NOTE: AddWithPrevKey here would only save key copying if prev is
// pinned (iter->IsKeyPinned()), which is probably rare with delta
// encoding. OK to go from Add() here to AddWithPrevKey() in
// unbuffered operation.
r->filter_builder->Add( r->filter_builder->Add(
ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
} }
@ -1989,6 +2032,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
Slice* first_key_in_next_block_ptr = &first_key_in_next_block; Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
iter->SeekToLast(); iter->SeekToLast();
assert(iter->Valid());
r->index_builder->AddIndexEntry( r->index_builder->AddIndexEntry(
iter->key(), first_key_in_next_block_ptr, r->pending_handle, iter->key(), first_key_in_next_block_ptr, r->pending_handle,
&r->index_separator_scratch); &r->index_separator_scratch);
@ -2027,7 +2071,7 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries first. // block, we will finish writing all index entries first.
if (ok() && !empty_data_block) { if (ok() && !empty_data_block) {
r->index_builder->AddIndexEntry( r->index_builder->AddIndexEntry(
r->last_key, nullptr /* no next data block */, r->pending_handle, r->last_ikey, nullptr /* no next data block */, r->pending_handle,
&r->index_separator_scratch); &r->index_separator_scratch);
} }
} }
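
The AddWithPrevKey()/PrevKeyBeforeFinish() calls above thread the previous user key through the caller so the filter builder does not have to retain its own copy; a self-contained sketch of that pattern (not the RocksDB FilterBlockBuilder interface):

#include <cstddef>
#include <string>
#include <vector>

// The caller passes the previous user key with each add, so adjacent
// duplicates (multiple internal keys for one user key) can be skipped without
// the collector storing the last key itself.
class TinyWholeKeyCollector {
 public:
  void AddWithPrevKey(const std::string& key, const std::string& prev_key) {
    if (!prev_key.empty() && key == prev_key) {
      return;  // same user key as the previous entry; already covered
    }
    keys_.push_back(key);
  }
  size_t NumAdded() const { return keys_.size(); }

 private:
  std::vector<std::string> keys_;
};

In the parallel-compression path above, prev_block_last_key_no_ts plays the role of prev_key across block boundaries.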


@ -304,6 +304,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct BlockBasedTableOptions, partition_filters), {offsetof(struct BlockBasedTableOptions, partition_filters),
OptionType::kBoolean, OptionVerificationType::kNormal, OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"decouple_partitioned_filters",
{offsetof(struct BlockBasedTableOptions, decouple_partitioned_filters),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"optimize_filters_for_memory", {"optimize_filters_for_memory",
{offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
OptionType::kBoolean, OptionVerificationType::kNormal, OptionType::kBoolean, OptionVerificationType::kNormal,
@ -971,6 +975,8 @@ const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
"rocksdb.block.based.table.whole.key.filtering"; "rocksdb.block.based.table.whole.key.filtering";
const std::string BlockBasedTablePropertyNames::kPrefixFiltering = const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
"rocksdb.block.based.table.prefix.filtering"; "rocksdb.block.based.table.prefix.filtering";
const std::string BlockBasedTablePropertyNames::kDecoupledPartitionedFilters =
"rocksdb.block.based.table.decoupled.partitioned.filters";
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
const std::string kHashIndexPrefixesMetadataBlock = const std::string kHashIndexPrefixesMetadataBlock =
"rocksdb.hashindex.metadata"; "rocksdb.hashindex.metadata";

Some files were not shown because too many files have changed in this diff.